From 0a9c0f52c338f39bf4820a95f4256754539bc19f Mon Sep 17 00:00:00 2001 From: Groboclown Date: Fri, 9 Feb 2024 01:08:06 -0600 Subject: [PATCH 1/5] A mildly simple approach at introducing the 'combined diff format' This adds in a new diff format parser based on the combined diff format: https://git-scm.com/docs/git-diff Git generates this when you perform a 'git diff --cc' or 'git diff -c' invocation. It shows the differences when the file had more than 1 parent. Without this fix, such differences are ignored. This is to help support fixing [issue 1028](https://github.com/gitleaks/gitleaks/issues/1028) in GitLeaks. GitLeaks will also need to add the '--cc' argument, as mentioned by that issue. There are a few limitations with this. A big one is in the file header termination detection. For now, it only scans for the 1-parent style ('@@ -') and the 2-parent style ('@@@ -'). The combined diff format allows for having many more than just 2 parents, so this will skip those cases. I've updated the README to note about this limitation. This includes some unit test coverage. It's not exhaustive, but it does lend some credibility to the code. --- README.md | 5 + gitdiff/combined.go | 269 +++++++++ gitdiff/combined_test.go | 565 ++++++++++++++++++ gitdiff/file_header.go | 17 +- gitdiff/parser.go | 1 + gitdiff/parser_test.go | 139 +++++ .../apply/combined_fragment_add_middle.out | 5 + .../apply/combined_fragment_add_middle.patch | 9 + .../apply/combined_fragment_add_middle.src | 3 + gitdiff/testdata/three_combined_files.patch | 61 ++ 10 files changed, 1070 insertions(+), 4 deletions(-) create mode 100644 gitdiff/combined.go create mode 100644 gitdiff/combined_test.go create mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.out create mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.patch create mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.src create mode 100644 gitdiff/testdata/three_combined_files.patch diff --git a/README.md b/README.md index 1879ef3..cb526cc 100644 --- a/README.md +++ b/README.md @@ -99,3 +99,8 @@ this testing in the future. context of each fragment must exactly match the source file; `git apply` implements a search algorithm that tries different lines and amounts of context, with further options to normalize or ignore whitespace changes. + +7. Multi-parent differences (produced by `git --cc`) are supported with two + parents. The full specification for the difference allows for more than + two, but that is not completely supported. + diff --git a/gitdiff/combined.go b/gitdiff/combined.go new file mode 100644 index 0000000..a16f6a3 --- /dev/null +++ b/gitdiff/combined.go @@ -0,0 +1,269 @@ +package gitdiff + +import ( + "io" + "strings" +) + +// ParseCombinedTextFragments parses text fragments with 2 or more parents until the +// next file header or the end of the stream and attaches them to the given file. It +// returns the number of fragments that were added. +func (p *parser) ParseCombinedTextFragments(f *File) (n int, err error) { + for { + frags, err := p.ParseCombinedTextFragmentHeader() + if err != nil { + return n, err + } + if len(frags) <= 0 { + return n, nil + } + + for _, frag := range frags { + if f.IsNew && frag.OldLines > 0 { + return n, p.Errorf(-1, "new file depends on old contents") + } + if f.IsDelete && frag.NewLines > 0 { + return n, p.Errorf(-1, "deleted file still has contents") + } + } + + if err := p.ParseCombinedTextChunk(frags); err != nil { + return n, err + } + + f.TextFragments = append(f.TextFragments, frags...) + n += len(frags) + } +} + +func (p *parser) ParseCombinedTextFragmentHeader() ([]*TextFragment, error) { + // There are (number of parents + 1) @ characters in the chunk header for combined diff format. + // This implementation is generic enough to use for both the standard '@@ ' text diff and for + // the combined diff. However, for stability and performance reasons, they are split into + // different implementations. + const ( + parentMark = '@' + minStartMark = "@@@" + trailingStartMark = "@ -" + ) + line := p.Line(0) + + if !strings.HasPrefix(line, minStartMark) { + return nil, nil + } + + // Find wrapping markers around the range, and, in doing so, count the number of parent files. + startEnd := strings.Index(line, trailingStartMark) + if startEnd < 0 { + return nil, nil + } + parentCount := 0 + endMark := " @" + for ; parentCount < startEnd; parentCount++ { + // check for valid combined form marker. + if line[parentCount] != parentMark { + return nil, nil + } + endMark += line[parentCount : parentCount+1] + } + + // Split up the line into sections. + // Keep the leading '-' on the first range. + startPos := startEnd + len(trailingStartMark) - 1 + parts := strings.SplitAfterN(p.Line(0), endMark, 2) + if len(parts) < 2 { + return nil, p.Errorf(0, "invalid fragment header") + } + comment := strings.TrimSpace(parts[1]) + + // Collect the file ranges. + header := parts[0][startPos : len(parts[0])-len(endMark)] + ranges := strings.Split(header, " ") + if len(ranges) != parentCount+1 { + return nil, p.Errorf(0, "invalid fragment header") + } + + // Parse the final merged range. + var err error + newPosition, newLines, err := parseRange(ranges[parentCount][1:]) + if err != nil { + return nil, p.Errorf(0, "invalid fragment header: %v", err) + } + + // Parse the parent file ranges. + frags := make([]*TextFragment, parentCount) + for i := 0; i < parentCount; i++ { + f := &TextFragment{ + Comment: comment, + NewPosition: newPosition, + NewLines: newLines, + } + if f.OldPosition, f.OldLines, err = parseRange(ranges[i][1:]); err != nil { + return nil, p.Errorf(0, "invalid fragment header: %v", err) + } + frags[i] = f + } + + if err := p.Next(); err != nil && err != io.EOF { + return nil, err + } + return frags, nil +} + +func (p *parser) ParseCombinedTextChunk(frags []*TextFragment) error { + if p.Line(0) == "" { + return p.Errorf(0, "no content following fragment header") + } + parentCount := len(frags) + var oldLines int64 = 0 + // Due to ParseCombinedTextFragmentHeader, the new line count is the same + // in all fragments. + // That first one looks like a nice choice. + newLines := frags[0].NewLines + + for _, frag := range frags { + oldLines += frag.OldLines + } + // Make an immutable copy of the old lines for later comparisons. + allOldLines := oldLines + + // Track whether any line included an alteration. + noLineChanges := true + + // Only count leading and trailing context when it applies to all the files. + var leadingContext int64 = 0 + var trailingContext int64 = 0 + + // Pre-allocate the per-filter altered check. + // It's only used within the per-line, but it's always re-initialized on each pass. + altered := make([]bool, parentCount) + +lineLoop: + for oldLines > 0 || newLines > 0 { + line := p.Line(0) + parentOps, data := line[0:parentCount], line[parentCount:] + // Each character in parentOps is for each parent, to show how target file line + // differs from each file of the parents. If a fragment has a '-', then it is + // a removal. If another fragment has a '+' but this one has a ' ', then + // it's also a removal. + if parentOps == "\n" { + // newer GNU diff versions create empty context lines + data = "\n" + parentOps = "" + } + + hasAdd := false + hasRemove := false + hasContext := false + for idx, op := range parentOps { + frag := frags[idx] + altered[idx] = false + + switch op { + case ' ': + // Context lines + hasContext = true + frag.Lines = append(frag.Lines, Line{OpContext, data}) + // Adjustment of the leading and trailing context count can only happen + // by analyzing all the file operations, so that happens after the line's + // operation checks. + case '-': + hasRemove = true + altered[idx] = true + oldLines-- + noLineChanges = false + frag.LinesDeleted++ + trailingContext = 0 + frag.Lines = append(frag.Lines, Line{OpDelete, data}) + case '+': + hasAdd = true + altered[idx] = true + noLineChanges = false + frag.LinesAdded++ + trailingContext = 0 + frag.Lines = append(frag.Lines, Line{OpAdd, data}) + case '\\': + // this may appear in middle of fragment if it's for a deleted line + if isNoNewlineMarker(line) { + removeLastNewline(frag) + // Move on to the next line. + continue lineLoop + } + fallthrough + default: + // TODO(bkeyes): if this is because we hit the next header, it + // would be helpful to return the miscounts line error. We could + // either test for the common headers ("@@ -", "diff --git", "@@@ -") or + // assume any invalid op ends the fragment; git returns the same + // generic error in all cases so either is compatible + return p.Errorf(0, "invalid line operation: %q", op) + } + } + + // The complex counting method. + + if hasAdd { + // If any number of parent files is marked as an add, then count that + // as a single add for the new count. + newLines-- + } + + // Lines with removes reduce the old line count once per removal operation, and + // the counting happens during each file's removal action. + if !hasRemove { + // For lines that have no removes, then this is either an add or a + // context line. In both cases, the files which had a blank operation count + // as an old line. + for _, fragChanged := range altered { + if !fragChanged { + oldLines-- + } + } + if !hasAdd && hasContext { + // Lines with no removes, no adds, and had at least 1 context entry + // means that this line a full context - no add and no remove. + if noLineChanges { + leadingContext++ + } else { + trailingContext++ + } + } + } + + if err := p.Next(); err != nil { + if err == io.EOF { + break + } + return err + } + } + + if oldLines != 0 || newLines != 0 { + hdr := max(allOldLines-oldLines, frags[0].NewLines-newLines) + 1 + return p.Errorf(-hdr, "fragment header miscounts lines: %+d old, %+d new", -oldLines, -newLines) + } + if noLineChanges { + return p.Errorf(0, "fragment contains no changes") + } + + // check for a final "no newline" marker since it is not included in the + // counters used to stop the loop above + if isNoNewlineMarker(p.Line(0)) { + for _, frag := range frags { + removeLastNewline(frag) + } + if err := p.Next(); err != nil && err != io.EOF { + return err + } + } + + // Because the leading and trailing context can only be determined on a whole line basis, + // and the value can change depending on later discoveries, this count only has meaning + // at the very end. + for _, frag := range frags { + frag.LeadingContext = leadingContext + frag.TrailingContext = trailingContext + } + + return nil +} diff --git a/gitdiff/combined_test.go b/gitdiff/combined_test.go new file mode 100644 index 0000000..e2209be --- /dev/null +++ b/gitdiff/combined_test.go @@ -0,0 +1,565 @@ +package gitdiff + +import ( + "io" + "reflect" + "testing" +) + +func TestParseCombinedTextFragmentHeader(t *testing.T) { + tests := map[string]struct { + Input string + Output []TextFragment + Err string + }{ + "shortest": { + Input: "@@@ -1 -1 +1 @@@\n", + Output: []TextFragment{{ + OldPosition: 1, + OldLines: 1, + NewPosition: 1, + NewLines: 1, + }, { + OldPosition: 1, + OldLines: 1, + NewPosition: 1, + NewLines: 1, + }}, + }, + "twoParent": { + Input: "@@@ -52,1 -50,2 +52,2 @@@\n", + Output: []TextFragment{{ + OldPosition: 52, + OldLines: 1, + NewPosition: 52, + NewLines: 2, + }, { + OldPosition: 50, + OldLines: 2, + NewPosition: 52, + NewLines: 2, + }}, + }, + "threeParent": { + Input: "@@@@ -52,1 -50,2 -38,5 +42,9 @@@@\n", + Output: []TextFragment{{ + OldPosition: 52, + OldLines: 1, + NewPosition: 42, + NewLines: 9, + }, { + OldPosition: 50, + OldLines: 2, + NewPosition: 42, + NewLines: 9, + }, { + OldPosition: 38, + OldLines: 5, + NewPosition: 42, + NewLines: 9, + }}, + }, + "trailingComment": { + Input: "@@@ -52,1 -50,2 +52,2 @@@ func test(n int) {\n", + Output: []TextFragment{{ + Comment: "func test(n int) {", + OldPosition: 52, + OldLines: 1, + NewPosition: 52, + NewLines: 2, + }, { + Comment: "func test(n int) {", + OldPosition: 50, + OldLines: 2, + NewPosition: 52, + NewLines: 2, + }}, + }, + "incompleteThree": { + Input: "@@@ -12,3 -5,9 +2\n", + Err: "gitdiff: line 1: invalid fragment header", + }, + "incompleteTwo": { + Input: "@@@ -12,3 +2,5\n", + Err: "gitdiff: line 1: invalid fragment header", + }, + "incompleteFour": { + Input: "@@@@ -12,3 -5,9 -2,6\n", + Err: "gitdiff: line 1: invalid fragment header", + }, + "incompleteFourDiffs": { + Input: "@@@@ -12,3 -5,9 -2,6 @@@@\n", + Err: "gitdiff: line 1: invalid fragment header", + }, + "badNumbers": { + Input: "@@@ -1a,2b -3c,4d +5e,6f @@@\n", + Err: "gitdiff: line 1: invalid fragment header: bad start of range: 5e: invalid syntax", + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + p := newTestParser(test.Input, true) + + frags, err := p.ParseCombinedTextFragmentHeader() + if test.Err != "" { + if err == nil || err == io.EOF { + t.Fatalf("expected error parsing header, but got %v", err) + } + if test.Err != err.Error() { + t.Errorf("incorrect error: expected %s actual %v", test.Err, err) + } + return + } + if err != nil { + t.Fatalf("error parsing header: %v", err) + } + + if !reflect.DeepEqual(joinFragmentPtr(test.Output), frags) { + t.Errorf("incorrect fragment\nexpected: %+v\nactual: %+v", test.Output, reduceFragmentPtr(frags)) + } + }) + } +} + +func TestCombinedParseTextChunk(t *testing.T) { + tests := map[string]struct { + Input string + Fragments []TextFragment + + Output []TextFragment + Err bool + }{ + "removeAddRemoveDAdd": { + Input: `- old line 1.0 ++ new line 1.1 + -old line 2.1 +++new line 1.2,2.2 +`, + Fragments: []TextFragment{ + {OldLines: 1, NewLines: 2}, + {OldLines: 2, NewLines: 2}, + }, + Output: []TextFragment{ + { + OldLines: 1, + NewLines: 2, + Lines: []Line{ + {OpDelete, "old line 1.0\n"}, + {OpAdd, "new line 1.1\n"}, + {OpContext, "old line 2.1\n"}, + {OpAdd, "new line 1.2,2.2\n"}, + }, + LinesAdded: 2, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 2, + NewLines: 2, + Lines: []Line{ + {OpContext, "old line 1.0\n"}, + {OpContext, "new line 1.1\n"}, + {OpDelete, "old line 2.1\n"}, + {OpAdd, "new line 1.2,2.2\n"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "addRemoveAddRemove": { + Input: ` -remove line 1.1 + +add line 1.2 +- remove line 2.1 ++ add line 2.2 +`, + Fragments: []TextFragment{ + {OldLines: 2, NewLines: 2}, + {OldLines: 2, NewLines: 2}, + }, + Output: []TextFragment{ + { + OldLines: 2, + NewLines: 2, + Lines: []Line{ + {OpContext, "remove line 1.1\n"}, + {OpContext, "add line 1.2\n"}, + {OpDelete, "remove line 2.1\n"}, + {OpAdd, "add line 2.2\n"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 2, + NewLines: 2, + Lines: []Line{ + {OpDelete, "remove line 1.1\n"}, + {OpAdd, "add line 1.2\n"}, + {OpContext, "remove line 2.1\n"}, + {OpContext, "add line 2.2\n"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "removeAdd3RemoveAdd": { + Input: `- remove line 1.1 ++ add line 1.2 + -remove line 2.1 + -remove line 2.2 + -remove line 2.3 + +add line 2.4 + `, + Fragments: []TextFragment{ + {OldLines: 2, NewLines: 2}, + {OldLines: 4, NewLines: 2}, + }, + Output: []TextFragment{ + { + OldLines: 2, + NewLines: 2, + Lines: []Line{ + {OpDelete, "remove line 1.1\n"}, + {OpAdd, "add line 1.2\n"}, + {OpContext, "remove line 2.1\n"}, + {OpContext, "remove line 2.2\n"}, + {OpContext, "remove line 2.3\n"}, + {OpContext, "add line 2.4\n"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 4, + NewLines: 2, + Lines: []Line{ + {OpContext, "remove line 1.1\n"}, + {OpContext, "add line 1.2\n"}, + {OpDelete, "remove line 2.1\n"}, + {OpDelete, "remove line 2.2\n"}, + {OpDelete, "remove line 2.3\n"}, + {OpAdd, "add line 2.4\n"}, + }, + LinesAdded: 1, + LinesDeleted: 3, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "bothChangeEol": { + Input: `- remove line 1.1 + -remove line 2.1 +++add line 1.1,2.2`, + Fragments: []TextFragment{ + {OldLines: 1, NewLines: 1}, + {OldLines: 1, NewLines: 1}, + }, + Output: []TextFragment{ + { + OldLines: 1, + NewLines: 1, + Lines: []Line{ + {OpDelete, "remove line 1.1\n"}, + {OpContext, "remove line 2.1\n"}, + {OpAdd, "add line 1.1,2.2"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 1, + NewLines: 1, + Lines: []Line{ + {OpContext, "remove line 1.1\n"}, + {OpDelete, "remove line 2.1\n"}, + {OpAdd, "add line 1.1,2.2"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "doubleRemove": { + Input: `--line 1.1,2.1 +- line 1.2 + -line 2.2 + `, + Fragments: []TextFragment{ + {OldLines: 2, NewLines: 0}, + {OldLines: 2, NewLines: 0}, + }, + Output: []TextFragment{ + { + OldLines: 2, + NewLines: 0, + Lines: []Line{ + {OpDelete, "line 1.1,2.1\n"}, + {OpDelete, "line 1.2\n"}, + {OpContext, "line 2.2\n"}, + }, + LinesAdded: 0, + LinesDeleted: 2, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 2, + NewLines: 0, + Lines: []Line{ + {OpDelete, "line 1.1,2.1\n"}, + {OpContext, "line 1.2\n"}, + {OpDelete, "line 2.2\n"}, + }, + LinesAdded: 0, + LinesDeleted: 2, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "doubleAddAdd": { + Input: `++line 1.1,2.1 ++ line 1.2 +`, + Fragments: []TextFragment{ + {OldLines: 0, NewLines: 2}, + {OldLines: 1, NewLines: 2}, + }, + Output: []TextFragment{ + { + OldLines: 0, + NewLines: 2, + Lines: []Line{ + {OpAdd, "line 1.1,2.1\n"}, + {OpAdd, "line 1.2\n"}, + }, + LinesAdded: 2, + LinesDeleted: 0, + LeadingContext: 0, + TrailingContext: 0, + }, + { + OldLines: 1, + NewLines: 2, + Lines: []Line{ + {OpAdd, "line 1.1,2.1\n"}, + {OpContext, "line 1.2\n"}, + }, + LinesAdded: 1, + LinesDeleted: 0, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + p := newTestParser(test.Input, true) + + frags := joinFragmentPtr(test.Fragments) + err := p.ParseCombinedTextChunk(frags) + if test.Err { + if err == nil || err == io.EOF { + t.Fatalf("expected error parsing text chunk, but got %v", err) + } + return + } + if err != nil { + t.Fatalf("error parsing text chunk: %v", err) + } + + if !reflect.DeepEqual(test.Output, reduceFragmentPtr(frags)) { + t.Errorf("incorrect fragment\nexpected: %+v\nactual: %+v", test.Output, reduceFragmentPtr(frags)) + } + }) + } +} + +func TestCombinedParseTextFragments(t *testing.T) { + tests := map[string]struct { + Input string + File File + + Fragments []TextFragment + Err string + }{ + "multipleChanges": { + Input: `@@@ -54,0 -52,2 +51,3 @@@ file 1 +++line 1.1.1,1.2.1 ++ line 1.1.1 ++ line 1.1.2 +@@@ -134,1 -136,1 +136,0 @@@ file 2 +- line 2.1.1 + -line 2.2.1 +`, + Fragments: []TextFragment{ + { + Comment: "file 1", + OldPosition: 54, + OldLines: 0, + NewPosition: 51, + NewLines: 3, + Lines: []Line{ + {OpAdd, "line 1.1.1,1.2.1\n"}, + {OpAdd, "line 1.1.1\n"}, + {OpAdd, "line 1.1.2\n"}, + }, + LinesAdded: 3, + LinesDeleted: 0, + LeadingContext: 0, + TrailingContext: 0, + }, + { + Comment: "file 1", + OldPosition: 52, + OldLines: 2, + NewPosition: 51, + NewLines: 3, + Lines: []Line{ + {OpAdd, "line 1.1.1,1.2.1\n"}, + {OpContext, "line 1.1.1\n"}, + {OpContext, "line 1.1.2\n"}, + }, + LinesDeleted: 0, + LinesAdded: 1, + LeadingContext: 0, + TrailingContext: 0, + }, + { + Comment: "file 2", + OldPosition: 134, + OldLines: 1, + NewPosition: 136, + NewLines: 0, + Lines: []Line{ + {OpDelete, "line 2.1.1\n"}, + {OpContext, "line 2.2.1\n"}, + }, + LinesDeleted: 1, + LinesAdded: 0, + LeadingContext: 0, + TrailingContext: 0, + }, + { + Comment: "file 2", + OldPosition: 136, + OldLines: 1, + NewPosition: 136, + NewLines: 0, + Lines: []Line{ + {OpContext, "line 2.1.1\n"}, + {OpDelete, "line 2.2.1\n"}, + }, + LinesDeleted: 1, + LinesAdded: 0, + LeadingContext: 0, + TrailingContext: 0, + }, + }, + }, + "badNewFile": { + Input: `@@@@ -1 -1 -1 +1,2 @@@@ +---old line 1 ++ new line 1 ++ new line 2 + `, + File: File{ + IsNew: true, + }, + Err: "gitdiff: line 1: new file depends on old contents", + }, + "badDeletedFile": { + Input: `@@@ -1,2 -1 +1 @@@ +--old line 1 + context line + `, + File: File{ + IsDelete: true, + }, + Err: "gitdiff: line 1: deleted file still has contents", + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + p := newTestParser(test.Input, true) + + file := test.File + n, err := p.ParseCombinedTextFragments(&file) + if test.Err != "" { + if err == nil || err == io.EOF { + t.Fatalf("expected error parsing text fragments, but returned %v", err) + } + if test.Err != err.Error() { + t.Fatalf("Incorrect error text: expected %v, actual %v", test.Err, err) + } + return + } + if err != nil { + t.Fatalf("error parsing text fragments: %v", err) + } + + if len(test.Fragments) != n { + t.Fatalf("incorrect number of added fragments: expected %d, actual %d", len(test.Fragments), n) + } + + for i, frag := range test.Fragments { + if !reflect.DeepEqual(&frag, file.TextFragments[i]) { + t.Errorf("incorrect fragment at position %d\nexpected: %+v\nactual: %+v", i, frag, file.TextFragments[i]) + } + } + }) + } +} + +func joinFragmentPtr(frags []TextFragment) []*TextFragment { + ret := make([]*TextFragment, len(frags)) + for i, f := range frags { + r := copyFragment(&f) + ret[i] = &r + } + return ret +} + +func reduceFragmentPtr(frags []*TextFragment) []TextFragment { + ret := make([]TextFragment, len(frags)) + for i, f := range frags { + r := copyFragment(f) + ret[i] = r + } + return ret +} + +func copyFragment(f *TextFragment) TextFragment { + return TextFragment{ + Comment: f.Comment, + OldPosition: f.OldPosition, + OldLines: f.OldLines, + NewPosition: f.NewPosition, + NewLines: f.NewLines, + LinesAdded: f.LinesAdded, + LinesDeleted: f.LinesDeleted, + LeadingContext: f.LeadingContext, + TrailingContext: f.TrailingContext, + Lines: f.Lines, + } +} diff --git a/gitdiff/file_header.go b/gitdiff/file_header.go index 58904b4..f0eaf4c 100644 --- a/gitdiff/file_header.go +++ b/gitdiff/file_header.go @@ -29,6 +29,7 @@ func (p *parser) ParseNextFileHeader() (*File, string, error) { if frag != nil { return nil, "", p.Errorf(-1, "patch fragment without file header: %s", frag.Header()) } + // note: not checking for disconnected combined fragment header. // check for a git-generated patch file, err = p.ParseGitFileHeader() @@ -61,12 +62,17 @@ func (p *parser) ParseNextFileHeader() (*File, string, error) { } func (p *parser) ParseGitFileHeader() (*File, error) { - const prefix = "diff --git " - - if !strings.HasPrefix(p.Line(0), prefix) { + const prefix1 = "diff --git " + const prefix2 = "diff --cc " + var header string + + if strings.HasPrefix(p.Line(0), prefix1) { + header = p.Line(0)[len(prefix1):] + } else if strings.HasPrefix(p.Line(0), prefix2) { + header = p.Line(0)[len(prefix2):] + } else { return nil, nil } - header := p.Line(0)[len(prefix):] defaultName, err := parseGitHeaderName(header) if err != nil { @@ -270,6 +276,9 @@ func parseGitHeaderData(f *File, line, defaultName string) (end bool, err error) parse func(*File, string, string) error }{ {"@@ -", true, nil}, + // TODO '@@@ -' is the simplest form of the combined diff; it + // could be any number of '@' - one per parent + 1 '@'. + {"@@@ -", true, nil}, {"--- ", false, parseGitHeaderOldName}, {"+++ ", false, parseGitHeaderNewName}, {"old mode ", false, parseGitHeaderOldMode}, diff --git a/gitdiff/parser.go b/gitdiff/parser.go index 5ffa9bd..1008bae 100644 --- a/gitdiff/parser.go +++ b/gitdiff/parser.go @@ -51,6 +51,7 @@ func Parse(r io.Reader) (<-chan *File, error) { for _, fn := range []func(*File) (int, error){ p.ParseTextFragments, + p.ParseCombinedTextFragments, p.ParseBinaryFragments, } { n, err := fn(file) diff --git a/gitdiff/parser_test.go b/gitdiff/parser_test.go index 5040b0a..bf28d6a 100644 --- a/gitdiff/parser_test.go +++ b/gitdiff/parser_test.go @@ -96,6 +96,20 @@ index 9540595..30e6333 100644 +++ b/dir/file.txt @@ -1,2 +1,3 @@ context line +`, + Parse: func(p *parser) error { + _, err := p.ParseGitFileHeader() + return err + }, + EndLine: "@@ -1,2 +1,3 @@\n", + }, + "ParseCCFileHeader": { + Input: `diff --cc a/dir/file.txt b/dir/file.txt +index 9540595..30e6333 100644 +--- a/dir/file.txt ++++ b/dir/file.txt +@@ -1,2 +1,3 @@ +context line `, Parse: func(p *parser) error { _, err := p.ParseGitFileHeader() @@ -493,6 +507,131 @@ Date: Tue Apr 2 22:55:40 2019 -0700 }, Preamble: binaryPreamble, }, + "threeCombinedFiles": { + InputFile: "testdata/three_combined_files.patch", + Output: []*File{ + { + PatchHeader: &PatchHeader{ + SHA: "5d9790fec7d95aa223f3d20936340bf55ff3dcbe", + Author: &PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + AuthorDate: asTime("2019-04-02T22:55:40-07:00"), + Title: "A file with multiple fragments, including multiple parents.", + Body: "The content is arbitrary.", + }, + OldName: "dir/file1.txt", + NewName: "dir/file1.txt", + OldMode: os.FileMode(0100644), + OldOIDPrefix: "ebe9fa54", + NewOIDPrefix: "fe103e1d", + TextFragments: textFragments, + }, + { + PatchHeader: &PatchHeader{ + SHA: "5d9790fec7d95aa223f3d20936340bf55ff3dcbe", + Author: &PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + AuthorDate: asTime("2019-04-02T22:55:40-07:00"), + Title: "A file with multiple fragments, including multiple parents.", + Body: "The content is arbitrary.", + }, + OldName: "dir/file2.txt", + NewName: "dir/file2.txt", + OldMode: os.FileMode(0100644), + OldOIDPrefix: "417ebc70", + NewOIDPrefix: "67514b7f", + TextFragments: []*TextFragment{ + { + OldPosition: 3, + OldLines: 3, + NewPosition: 3, + NewLines: 0, + Comment: "fragment 1", + Lines: []Line{ + {OpContext, "context line\n"}, + {OpDelete, "old line 1\n"}, + {OpContext, "old line 2\n"}, + {OpContext, "context line\n"}, + }, + LinesAdded: 0, + LinesDeleted: 1, + LeadingContext: 1, + TrailingContext: 1, + }, + { + OldPosition: 2, + OldLines: 3, + NewPosition: 3, + NewLines: 0, + Comment: "fragment 1", + Lines: []Line{ + {OpContext, "context line\n"}, + {OpContext, "old line 1\n"}, + {OpDelete, "old line 2\n"}, + {OpContext, "context line\n"}, + }, + LinesAdded: 0, + LinesDeleted: 1, + LeadingContext: 1, + TrailingContext: 1, + }, + { + OldPosition: 31, + OldLines: 2, + NewPosition: 33, + NewLines: 1, + Comment: "fragment 2", + Lines: []Line{ + {OpContext, "context line\n"}, + {OpDelete, "old line 1\n"}, + {OpAdd, "new line 2\n"}, + }, + LinesAdded: 1, + LinesDeleted: 1, + LeadingContext: 1, + }, + { + OldPosition: 30, + OldLines: 2, + NewPosition: 33, + NewLines: 1, + Comment: "fragment 2", + Lines: []Line{ + {OpContext, "context line\n"}, + {OpContext, "old line 1\n"}, + {OpContext, "new line 2\n"}, + }, + LinesAdded: 0, + LinesDeleted: 0, + LeadingContext: 1, + }, + }, + }, + { + PatchHeader: &PatchHeader{ + SHA: "5d9790fec7d95aa223f3d20936340bf55ff3dcbe", + Author: &PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + AuthorDate: asTime("2019-04-02T22:55:40-07:00"), + Title: "A file with multiple fragments, including multiple parents.", + Body: "The content is arbitrary.", + }, + OldName: "dir/file3.txt", + NewName: "dir/file3.txt", + OldMode: os.FileMode(0100644), + OldOIDPrefix: "1b39fa54", + NewOIDPrefix: "f5103e1d", + TextFragments: textFragments, + }, + }, + Preamble: binaryPreamble, + }, } for name, test := range tests { diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.out b/gitdiff/testdata/apply/combined_fragment_add_middle.out new file mode 100644 index 0000000..ded20d8 --- /dev/null +++ b/gitdiff/testdata/apply/combined_fragment_add_middle.out @@ -0,0 +1,5 @@ +line 1 +line 2 +new line a +new line b +line 3 diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.patch b/gitdiff/testdata/apply/combined_fragment_add_middle.patch new file mode 100644 index 0000000..96c2299 --- /dev/null +++ b/gitdiff/testdata/apply/combined_fragment_add_middle.patch @@ -0,0 +1,9 @@ +diff --cc a/gitdiff/testdata/apply/fragment_add_middle.src b/gitdiff/testdata/apply/fragment_add_middle.src +--- a/gitdiff/testdata/apply/fragment_add_middle.src ++++ b/gitdiff/testdata/apply/fragment_add_middle.src +@@@ -1,4 -1,4 +1,5 @@@ + line 1 + line 2 ++ new line a + +new line b + line 3 diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.src b/gitdiff/testdata/apply/combined_fragment_add_middle.src new file mode 100644 index 0000000..a92d664 --- /dev/null +++ b/gitdiff/testdata/apply/combined_fragment_add_middle.src @@ -0,0 +1,3 @@ +line 1 +line 2 +line 3 diff --git a/gitdiff/testdata/three_combined_files.patch b/gitdiff/testdata/three_combined_files.patch new file mode 100644 index 0000000..e1bb7be --- /dev/null +++ b/gitdiff/testdata/three_combined_files.patch @@ -0,0 +1,61 @@ +commit 5d9790fec7d95aa223f3d20936340bf55ff3dcbe +Author: Morton Haypenny +Date: Tue Apr 2 22:55:40 2019 -0700 + + A file with multiple fragments, including multiple parents. + + The content is arbitrary. + +diff --git a/dir/file1.txt b/dir/file1.txt +index ebe9fa54..fe103e1d 100644 +--- a/dir/file1.txt ++++ b/dir/file1.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 +diff --cc dir/file2.txt +index 417ebc70..67514b7f 100644 +--- a/dir/file2.txt ++++ b/dir/file2.txt +@@@ -3,3 -2,3 +3,0 @@@ fragment 1 + context line +- old line 1 + -old line 2 + context line +@@@ -31,2 -30,2 +33,1 @@@ fragment 2 + context line +- old line 1 ++ new line 2 +diff --git a/dir/file3.txt b/dir/file3.txt +index 1b39fa54..f5103e1d 100644 +--- a/dir/file3.txt ++++ b/dir/file3.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 From 73b0d0a4ef9d92efbdd9fded9059dafaedaa8c23 Mon Sep 17 00:00:00 2001 From: Groboclown Date: Fri, 9 Feb 2024 01:32:29 -0600 Subject: [PATCH 2/5] Strip out unneccesary files. These were originally going to be for patch testing, but those tests weren't written. Mostly because I'm not quite sure how to evaluate a multi-parent patch with just one source file. --- gitdiff/testdata/apply/combined_fragment_add_middle.out | 5 ----- .../testdata/apply/combined_fragment_add_middle.patch | 9 --------- gitdiff/testdata/apply/combined_fragment_add_middle.src | 3 --- 3 files changed, 17 deletions(-) delete mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.out delete mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.patch delete mode 100644 gitdiff/testdata/apply/combined_fragment_add_middle.src diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.out b/gitdiff/testdata/apply/combined_fragment_add_middle.out deleted file mode 100644 index ded20d8..0000000 --- a/gitdiff/testdata/apply/combined_fragment_add_middle.out +++ /dev/null @@ -1,5 +0,0 @@ -line 1 -line 2 -new line a -new line b -line 3 diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.patch b/gitdiff/testdata/apply/combined_fragment_add_middle.patch deleted file mode 100644 index 96c2299..0000000 --- a/gitdiff/testdata/apply/combined_fragment_add_middle.patch +++ /dev/null @@ -1,9 +0,0 @@ -diff --cc a/gitdiff/testdata/apply/fragment_add_middle.src b/gitdiff/testdata/apply/fragment_add_middle.src ---- a/gitdiff/testdata/apply/fragment_add_middle.src -+++ b/gitdiff/testdata/apply/fragment_add_middle.src -@@@ -1,4 -1,4 +1,5 @@@ - line 1 - line 2 -+ new line a - +new line b - line 3 diff --git a/gitdiff/testdata/apply/combined_fragment_add_middle.src b/gitdiff/testdata/apply/combined_fragment_add_middle.src deleted file mode 100644 index a92d664..0000000 --- a/gitdiff/testdata/apply/combined_fragment_add_middle.src +++ /dev/null @@ -1,3 +0,0 @@ -line 1 -line 2 -line 3 From 89af3f8dd6db5bfe721a9feca8238e130f388a3a Mon Sep 17 00:00:00 2001 From: Groboclown Date: Sun, 11 Feb 2024 18:59:55 -0600 Subject: [PATCH 3/5] Modifications based on real-world data I found some situations where the 'diff --cc' can return a line count number of 18446744073709551615, which is a uint64 value of a signed int64 -1. The code now adjusts for this strange result. Additionally, a scenario came up where the line counting algorithm was off by 1, which may be related to the -1 issue above. This was captured in the 'strangeCount' test. Both of these scenarios point to a bug in the git diff algorithm. It's due to this that the final line count check can't be used. --- gitdiff/combined.go | 151 +++++++++++++++++++--------- gitdiff/combined_test.go | 211 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 312 insertions(+), 50 deletions(-) diff --git a/gitdiff/combined.go b/gitdiff/combined.go index a16f6a3..ecba149 100644 --- a/gitdiff/combined.go +++ b/gitdiff/combined.go @@ -1,7 +1,9 @@ package gitdiff import ( + "fmt" "io" + "strconv" "strings" ) @@ -82,12 +84,33 @@ func (p *parser) ParseCombinedTextFragmentHeader() ([]*TextFragment, error) { if len(ranges) != parentCount+1 { return nil, p.Errorf(0, "invalid fragment header") } + frags, err := parseCombinedHeaderRanges(comment, ranges) + if err != nil { + return nil, p.Errorf(0, "invalid fragment header: %v", err) + } + + if err := p.Next(); err != nil && err != io.EOF { + return nil, err + } + return frags, nil +} - // Parse the final merged range. +func parseCombinedHeaderRanges(comment string, ranges []string) ([]*TextFragment, error) { + parentCount := len(ranges) - 1 + + // This needs to cope with a strange old-line range of '-1' that some versions + // of Git produce. When this happens, the old and new line counts must increment by 1. + var negativeCount int64 = 0 + + // Parse the merged range. var err error - newPosition, newLines, err := parseRange(ranges[parentCount][1:]) + newPosition, newLines, err := parseCombinedRange(ranges[parentCount][1:]) if err != nil { - return nil, p.Errorf(0, "invalid fragment header: %v", err) + return nil, err + } + if newLines < 0 { + negativeCount -= newLines + // The newLines count remains negative, so it adjusts to zero at the end. } // Parse the parent file ranges. @@ -98,34 +121,58 @@ func (p *parser) ParseCombinedTextFragmentHeader() ([]*TextFragment, error) { NewPosition: newPosition, NewLines: newLines, } - if f.OldPosition, f.OldLines, err = parseRange(ranges[i][1:]); err != nil { - return nil, p.Errorf(0, "invalid fragment header: %v", err) + if f.OldPosition, f.OldLines, err = parseCombinedRange(ranges[i][1:]); err != nil { + return nil, err + } + if f.OldLines < 0 { + negativeCount -= f.OldLines + // The OldLines count remains negative, so the final adjustment makes it zero. } frags[i] = f } - if err := p.Next(); err != nil && err != io.EOF { - return nil, err + // Adjust each fragment count based on the negative count. + for _, f := range frags { + f.OldLines += negativeCount + f.NewLines += negativeCount } + return frags, nil } +func parseCombinedRange(s string) (start int64, end int64, err error) { + parts := strings.SplitN(s, ",", 2) + + if start, err = strconv.ParseInt(parts[0], 10, 64); err != nil { + nerr := err.(*strconv.NumError) + return 0, 0, fmt.Errorf("bad start of range: %s: %v", parts[0], nerr.Err) + } + + if len(parts) > 1 { + if parts[1] == "18446744073709551615" { + // There are some versions of "git diff --cc" that return a uint64 version of -1, + // which is this number. That seems to mean that, for this specific file, + // there wasn't any change. + // This is the only version of this large number that's been seen, though it's possible that + // the git diff --cc format can return other negative numbers. In those cases, more complex + // logic would be needed to convert the uint64 to signed int64. + end = -1 + } else if end, err = strconv.ParseInt(parts[1], 10, 64); err != nil { + nerr := err.(*strconv.NumError) + return 0, 0, fmt.Errorf("bad end of range: %s: %v", parts[1], nerr.Err) + } + } else { + end = 1 + } + + return +} + func (p *parser) ParseCombinedTextChunk(frags []*TextFragment) error { if p.Line(0) == "" { return p.Errorf(0, "no content following fragment header") } parentCount := len(frags) - var oldLines int64 = 0 - // Due to ParseCombinedTextFragmentHeader, the new line count is the same - // in all fragments. - // That first one looks like a nice choice. - newLines := frags[0].NewLines - - for _, frag := range frags { - oldLines += frag.OldLines - } - // Make an immutable copy of the old lines for later comparisons. - allOldLines := oldLines // Track whether any line included an alteration. noLineChanges := true @@ -139,9 +186,17 @@ func (p *parser) ParseCombinedTextChunk(frags []*TextFragment) error { altered := make([]bool, parentCount) lineLoop: - for oldLines > 0 || newLines > 0 { + for { line := p.Line(0) + // Should be able to count the lines required by the range header, + // however there are some rare times when that does not correctly align. + // Therefore, the 'text.go' version of line count checking isn't used here. + if !areValidOps(line, parentCount) { + break + } + parentOps, data := line[0:parentCount], line[parentCount:] + // Each character in parentOps is for each parent, to show how target file line // differs from each file of the parents. If a fragment has a '-', then it is // a removal. If another fragment has a '+' but this one has a ' ', then @@ -170,7 +225,6 @@ lineLoop: case '-': hasRemove = true altered[idx] = true - oldLines-- noLineChanges = false frag.LinesDeleted++ trailingContext = 0 @@ -202,31 +256,15 @@ lineLoop: // The complex counting method. - if hasAdd { - // If any number of parent files is marked as an add, then count that - // as a single add for the new count. - newLines-- - } - // Lines with removes reduce the old line count once per removal operation, and // the counting happens during each file's removal action. - if !hasRemove { - // For lines that have no removes, then this is either an add or a - // context line. In both cases, the files which had a blank operation count - // as an old line. - for _, fragChanged := range altered { - if !fragChanged { - oldLines-- - } - } - if !hasAdd && hasContext { - // Lines with no removes, no adds, and had at least 1 context entry - // means that this line a full context - no add and no remove. - if noLineChanges { - leadingContext++ - } else { - trailingContext++ - } + if !hasRemove && !hasAdd && hasContext { + // Lines with no removes, no adds, and had at least 1 context entry + // means that this line a full context - no add and no remove. + if noLineChanges { + leadingContext++ + } else { + trailingContext++ } } @@ -238,15 +276,15 @@ lineLoop: } } - if oldLines != 0 || newLines != 0 { - hdr := max(allOldLines-oldLines, frags[0].NewLines-newLines) + 1 - return p.Errorf(-hdr, "fragment header miscounts lines: %+d old, %+d new", -oldLines, -newLines) - } + // There is a rare, but valid, scenario where the line counts don't match up with + // what was parsed. This seems related to the "-1" range value. Because of that, + // this function can't validate the header line count against the lines encountered. + if noLineChanges { return p.Errorf(0, "fragment contains no changes") } - // check for a final "no newline" marker since it is not included in the + // Check for a final "no newline" marker since it is not included in the // counters used to stop the loop above if isNoNewlineMarker(p.Line(0)) { for _, frag := range frags { @@ -267,3 +305,20 @@ lineLoop: return nil } + +func areValidOps(line string, count int) bool { + if len(line) < count { + // Generally, this happens with an empty line ('\n'), but could also be file corruption. + return false + } + for count > 0 { + count-- + c := line[count] + switch c { + case ' ', '+', '-', '\\': + default: + return false + } + } + return true +} diff --git a/gitdiff/combined_test.go b/gitdiff/combined_test.go index e2209be..a549583 100644 --- a/gitdiff/combined_test.go +++ b/gitdiff/combined_test.go @@ -75,6 +75,38 @@ func TestParseCombinedTextFragmentHeader(t *testing.T) { NewLines: 2, }}, }, + "negativeCount2": { + Input: "@@@ -229,4 -229,18446744073709551615 +228,3 @@@ Comment\n", + Output: []TextFragment{{ + Comment: "Comment", + OldPosition: 229, + OldLines: 5, + NewPosition: 228, + NewLines: 4, + }, { + Comment: "Comment", + OldPosition: 229, + OldLines: 0, + NewPosition: 228, + NewLines: 4, + }}, + }, + "negativeCount3": { + Input: "@@@ -229,4 -229,1 +228,18446744073709551615 @@@ Comment\n", + Output: []TextFragment{{ + Comment: "Comment", + OldPosition: 229, + OldLines: 5, + NewPosition: 228, + NewLines: 0, + }, { + Comment: "Comment", + OldPosition: 229, + OldLines: 2, + NewPosition: 228, + NewLines: 0, + }}, + }, "incompleteThree": { Input: "@@@ -12,3 -5,9 +2\n", Err: "gitdiff: line 1: invalid fragment header", @@ -300,7 +332,7 @@ func TestCombinedParseTextChunk(t *testing.T) { Input: `--line 1.1,2.1 - line 1.2 -line 2.2 - `, +`, Fragments: []TextFragment{ {OldLines: 2, NewLines: 0}, {OldLines: 2, NewLines: 0}, @@ -476,6 +508,181 @@ func TestCombinedParseTextFragments(t *testing.T) { }, }, }, + "strangeCount": { + // This test is why the implementation can't rely upon the lines changed. + // Its values are off by 1, according to the description of the meaning in the + // documentation. + Input: `@@@ -225,4 -237,2 +225,6 @@@ C0 + +line 1 + +line 2 + +line 3 + +line 4 +- line 5 +++line 6 +++line 7 ++ line 8 + -line 9 + -line 10 +`, + Fragments: []TextFragment{ + { + Comment: "C0", + OldPosition: 225, + OldLines: 4, + NewPosition: 225, + NewLines: 6, + Lines: []Line{ + {OpContext, "line 1\n"}, + {OpContext, "line 2\n"}, + {OpContext, "line 3\n"}, + {OpContext, "line 4\n"}, + {OpDelete, "line 5\n"}, + {OpAdd, "line 6\n"}, + {OpAdd, "line 7\n"}, + {OpAdd, "line 8\n"}, + {OpContext, "line 9\n"}, + {OpContext, "line 10\n"}, + }, + LinesDeleted: 1, + LinesAdded: 3, + }, + { + Comment: "C0", + OldPosition: 237, + OldLines: 2, + NewPosition: 225, + NewLines: 6, + Lines: []Line{ + {OpAdd, "line 1\n"}, + {OpAdd, "line 2\n"}, + {OpAdd, "line 3\n"}, + {OpAdd, "line 4\n"}, + {OpContext, "line 5\n"}, + {OpAdd, "line 6\n"}, + {OpAdd, "line 7\n"}, + {OpContext, "line 8\n"}, + {OpDelete, "line 9\n"}, + {OpDelete, "line 10\n"}, + }, + LinesDeleted: 2, + LinesAdded: 6, + }, + }, + }, + "negativeLines1": { + Input: `@@@ -229,4 -230,18446744073709551615 +228,3 @@@ C1 + +line 1 + +line 2 + +line 3 + +line 4 +- line 5 +`, + Fragments: []TextFragment{ + { + Comment: "C1", + OldPosition: 229, + OldLines: 5, + NewPosition: 228, + NewLines: 4, + Lines: []Line{ + {OpContext, "line 1\n"}, + {OpContext, "line 2\n"}, + {OpContext, "line 3\n"}, + {OpContext, "line 4\n"}, + {OpDelete, "line 5\n"}, + }, + LinesDeleted: 1, + LinesAdded: 0, + }, + { + Comment: "C1", + OldPosition: 230, + OldLines: 0, + NewPosition: 228, + NewLines: 4, + Lines: []Line{ + {OpAdd, "line 1\n"}, + {OpAdd, "line 2\n"}, + {OpAdd, "line 3\n"}, + {OpAdd, "line 4\n"}, + {OpContext, "line 5\n"}, + }, + LinesDeleted: 0, + LinesAdded: 4, + }, + }, + }, + "negativeLines2": { + Input: `@@@ -232,18446744073709551615 -227,1 +230,1 @@@ +++line 1 ++ line 2 + -line 3 +`, + Fragments: []TextFragment{ + { + OldPosition: 232, + OldLines: 0, + NewPosition: 230, + NewLines: 2, + Lines: []Line{ + {OpAdd, "line 1\n"}, + {OpAdd, "line 2\n"}, + {OpContext, "line 3\n"}, + }, + LinesDeleted: 0, + LinesAdded: 2, + }, + { + OldPosition: 227, + OldLines: 2, + NewPosition: 230, + NewLines: 2, + Lines: []Line{ + {OpAdd, "line 1\n"}, + {OpContext, "line 2\n"}, + {OpDelete, "line 3\n"}, + }, + LinesDeleted: 1, + LinesAdded: 1, + }, + }, + }, + "negativeMergedLines": { + Input: `@@@ -1,2 -1,2 +1,18446744073709551615 @@@ +--line 1 +--line 2 +--line 3 +`, + File: File{IsDelete: true}, + Fragments: []TextFragment{ + { + OldPosition: 1, + OldLines: 3, + NewPosition: 1, + NewLines: 0, + Lines: []Line{ + {OpDelete, "line 1\n"}, + {OpDelete, "line 2\n"}, + {OpDelete, "line 3\n"}, + }, + LinesDeleted: 3, + LinesAdded: 0, + }, + { + OldPosition: 1, + OldLines: 3, + NewPosition: 1, + NewLines: 0, + Lines: []Line{ + {OpDelete, "line 1\n"}, + {OpDelete, "line 2\n"}, + {OpDelete, "line 3\n"}, + }, + LinesDeleted: 3, + LinesAdded: 0, + }, + }, + }, "badNewFile": { Input: `@@@@ -1 -1 -1 +1,2 @@@@ ---old line 1 @@ -524,7 +731,7 @@ func TestCombinedParseTextFragments(t *testing.T) { for i, frag := range test.Fragments { if !reflect.DeepEqual(&frag, file.TextFragments[i]) { - t.Errorf("incorrect fragment at position %d\nexpected: %+v\nactual: %+v", i, frag, file.TextFragments[i]) + t.Errorf("incorrect fragment at position %d\nexpected: %+v\nactual: %+v", i, frag, *file.TextFragments[i]) } } }) From a79f9cd52874ad2c40de7d509d1f42d423aadd43 Mon Sep 17 00:00:00 2001 From: Groboclown Date: Fri, 14 Jun 2024 23:45:13 -0500 Subject: [PATCH 4/5] Handle empty commit headers In some cases, especially for some merges, a commit can contain zero files. This can throw off the header parser by interpreting the following headers as the body text of the first commit. Down-stream, this causes the GitLeaks tool to identify the wrong commit ID with the leaked data. This patch currently fails the empty-commit check. A minor issue with the expected data eludes me at the moment. --- gitdiff/parser_test.go | 44 ++++++++++++++- gitdiff/patch_header.go | 28 +++++++--- gitdiff/patch_header_test.go | 33 ++++++++++++ gitdiff/testdata/empty-commit.patch | 83 +++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 7 deletions(-) create mode 100644 gitdiff/testdata/empty-commit.patch diff --git a/gitdiff/parser_test.go b/gitdiff/parser_test.go index bf28d6a..1117080 100644 --- a/gitdiff/parser_test.go +++ b/gitdiff/parser_test.go @@ -630,7 +630,49 @@ Date: Tue Apr 2 22:55:40 2019 -0700 TextFragments: textFragments, }, }, - Preamble: binaryPreamble, + Preamble: textPreamble, + }, + "emptyCommit": { + InputFile: "testdata/empty-commit.patch", + Output: []*File{ + { + PatchHeader: &PatchHeader{ + SHA: "48f13f61afb200ad8386573632cba3abd1703af2", + Author: &PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + AuthorDate: asTime("2017-11-03T12:29:49Z"), + Title: "A simple change", + Body: "The change is simple.", + }, + OldName: "dir/file1.txt", + NewName: "dir/file1.txt", + OldMode: os.FileMode(0100644), + OldOIDPrefix: "422cce7", + NewOIDPrefix: "24b39ed", + TextFragments: textFragments, + }, + { + PatchHeader: &PatchHeader{ + SHA: "183d9cdbecda47e6e95acf9e4e23fa3a71ba99ad", + Author: &PatchIdentity{ + Name: "Regina Smithee", + Email: "rsmithee@example.com", + }, + AuthorDate: asTime("2017-09-14T19:46:12Z"), + Title: "Simple change", + Body: "Simple change body.", + }, + OldName: "d1/file2.txt", + NewName: "d1/file2.txt", + OldMode: os.FileMode(0100755), + OldOIDPrefix: "c6513ff", + NewOIDPrefix: "bf4c6cc", + TextFragments: textFragments, + }, + }, + Preamble: textPreamble, }, } diff --git a/gitdiff/patch_header.go b/gitdiff/patch_header.go index a4671b7..e944667 100644 --- a/gitdiff/patch_header.go +++ b/gitdiff/patch_header.go @@ -294,10 +294,14 @@ func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { if title != "" { // Don't check for an appendix - body, _ := scanMessageBody(s, indent, false) + body, _, remainder := scanMessageBody(s, indent, false) if s.Err() != nil { return nil, s.Err() } + if remainder != "" { + // There was another header immediately after this one. + return ParsePatchHeader(remainder) + } h.Body = body } @@ -326,15 +330,15 @@ func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { return b.String(), indent } -func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { +func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string, string) { // Body and appendix var body, appendix strings.Builder c := &body var empty int for i := 0; s.Scan(); i++ { - line := s.Text() + baseLine := s.Text() - line = strings.TrimRightFunc(line, unicode.IsSpace) + line := strings.TrimRightFunc(baseLine, unicode.IsSpace) line = strings.TrimPrefix(line, indent) if line == "" { @@ -342,6 +346,18 @@ func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (st continue } + if baseLine == line && indent != "" { + // The line does not start with the indent. + var remainder strings.Builder + remainder.WriteString(baseLine) + remainder.WriteByte('\n') + for ; s.Scan(); i++ { + remainder.WriteString(s.Text()) + remainder.WriteByte('\n') + } + return body.String(), appendix.String(), remainder.String() + } + // If requested, parse out "appendix" information (often added // by `git format-patch` and removed by `git am`). if separateAppendix && c == &body && line == "---" { @@ -359,7 +375,7 @@ func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (st c.WriteString(line) } - return body.String(), appendix.String() + return body.String(), appendix.String(), "" } func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { @@ -402,7 +418,7 @@ func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { h.SubjectPrefix, h.Title = parseSubject(subject) s := bufio.NewScanner(msg.Body) - h.Body, h.BodyAppendix = scanMessageBody(s, "", true) + h.Body, h.BodyAppendix, _ = scanMessageBody(s, "", true) if s.Err() != nil { return nil, s.Err() } diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go index 9181b5c..1450535 100644 --- a/gitdiff/patch_header_test.go +++ b/gitdiff/patch_header_test.go @@ -441,6 +441,39 @@ Author: Morton Haypenny Title: expectedTitle, }, }, + "emptyCommit": { + Input: `commit 989f970ba7e43cd0eac6fcf71acfd9c92effc047 +Author: Morton Haypenny +Date: Sat Apr 12 05:20:49 2020 -0700 + + An empty commit + + With a body. + + +commit 61f5cd90bed4d204ee3feb3aa41ee91d4734855b +Author: Morton Haypenny +Date: Sat Apr 11 15:21:23 2020 -0700 + + A sample commit to test header parsing + + + The medium format shows the body, which + may wrap on to multiple lines. + + + Another body line. + + +`, + Header: PatchHeader{ + SHA: expectedSHA, + Author: expectedIdentity, + AuthorDate: expectedDate, + Title: expectedTitle, + Body: expectedBody, + }, + }, } for name, test := range tests { diff --git a/gitdiff/testdata/empty-commit.patch b/gitdiff/testdata/empty-commit.patch new file mode 100644 index 0000000..8896704 --- /dev/null +++ b/gitdiff/testdata/empty-commit.patch @@ -0,0 +1,83 @@ + +commit 48f13f61afb200ad8386573632cba3abd1703af2 +Author: Morton Haypenny +Date: Thu Nov 3 12:29:49 2017 +0000 + + A simple change + + The change is simple. + +diff --git a/dir/file1.txt b/dir/file1.txt +index 422cce7..24b39ed 100644 +--- a/dir/file1.txt ++++ b/dir/file1.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 + +commit dbbc5f1e926391d737e397ec60cc1ff94787e105 +Author: Regina Smithee +Date: Thu Sep 29 12:58:35 2017 +0000 + + Merged commit 3 + + Merged body 3. + + +commit f90c2e23af6618388cb4082f39849476e39105af +Merge: 989f970 e84c7ab +Author: Regina Smithee +Date: Thu Sep 15 16:37:30 2017 +0000 + + Merged commit 2 + + +commit 989f970ba7e43cd0eac6fcf71acfd9c92effc047 +Merge: 183d9cd e84c7ab +Author: Regina Smithee +Date: Thu Sep 15 14:41:57 2017 +0000 + + Merged commit 1 + + +commit 183d9cdbecda47e6e95acf9e4e23fa3a71ba99ad +Author: Regina Smithee +Date: Wed Sep 14 19:46:12 2017 +0000 + + Simple change + + Simple change body. + +diff --git a/d1/file2.txt b/d1/file2.txt +index c6513ff..bf4c6cc 100755 +--- a/d1/file2.txt ++++ b/d1/file2.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 From ed928274bb462517407313322788b74206d73114 Mon Sep 17 00:00:00 2001 From: Groboclown Date: Fri, 14 Jun 2024 23:45:13 -0500 Subject: [PATCH 5/5] Handle empty commit headers In some cases, especially for some merges, a commit can contain zero files. This can throw off the header parser by interpreting the following headers as the body text of the first commit. Down-stream, this causes the GitLeaks tool to identify the wrong commit ID with the leaked data. This patch currently fails the empty-commit check. A minor issue with the expected data eludes me at the moment. --- gitdiff/file_header.go | 2 + gitdiff/parser_test.go | 56 ++++++++++++++++++- gitdiff/patch_header.go | 28 +++++++--- gitdiff/patch_header_test.go | 33 ++++++++++++ gitdiff/testdata/empty-commit.patch | 83 +++++++++++++++++++++++++++++ go.mod | 2 + go.sum | 2 + 7 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 gitdiff/testdata/empty-commit.patch create mode 100644 go.sum diff --git a/gitdiff/file_header.go b/gitdiff/file_header.go index f0eaf4c..214454f 100644 --- a/gitdiff/file_header.go +++ b/gitdiff/file_header.go @@ -278,6 +278,8 @@ func parseGitHeaderData(f *File, line, defaultName string) (end bool, err error) {"@@ -", true, nil}, // TODO '@@@ -' is the simplest form of the combined diff; it // could be any number of '@' - one per parent + 1 '@'. + // The fix involves adding a separate '@' count discovery + // before this block (as '@' is the most commonly encountered structure). {"@@@ -", true, nil}, {"--- ", false, parseGitHeaderOldName}, {"+++ ", false, parseGitHeaderNewName}, diff --git a/gitdiff/parser_test.go b/gitdiff/parser_test.go index bf28d6a..852ba36 100644 --- a/gitdiff/parser_test.go +++ b/gitdiff/parser_test.go @@ -630,7 +630,49 @@ Date: Tue Apr 2 22:55:40 2019 -0700 TextFragments: textFragments, }, }, - Preamble: binaryPreamble, + Preamble: textPreamble, + }, + "emptyCommit": { + InputFile: "testdata/empty-commit.patch", + Output: []*File{ + { + PatchHeader: &PatchHeader{ + SHA: "48f13f61afb200ad8386573632cba3abd1703af2", + Author: &PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + AuthorDate: asTime("2017-11-03T12:29:49Z"), + Title: "A simple change", + Body: "The change is simple.", + }, + OldName: "dir/file1.txt", + NewName: "dir/file1.txt", + OldMode: os.FileMode(0100644), + OldOIDPrefix: "422cce7", + NewOIDPrefix: "24b39ed", + TextFragments: textFragments, + }, + { + PatchHeader: &PatchHeader{ + SHA: "183d9cdbecda47e6e95acf9e4e23fa3a71ba99ad", + Author: &PatchIdentity{ + Name: "Regina Smithee", + Email: "rsmithee@example.com", + }, + AuthorDate: asTime("2017-09-14T19:46:12Z"), + Title: "Simple change", + Body: "Simple change body.", + }, + OldName: "d1/file2.txt", + NewName: "d1/file2.txt", + OldMode: os.FileMode(0100755), + OldOIDPrefix: "c6513ff", + NewOIDPrefix: "bf4c6cc", + TextFragments: textFragments, + }, + }, + Preamble: textPreamble, }, } @@ -660,6 +702,8 @@ Date: Tue Apr 2 22:55:40 2019 -0700 t.Fatalf("incorrect number of parsed files: expected %d, actual %d", len(test.Output), len(files)) } for i := range test.Output { + prepareFileDiff(test.Output[i]) + prepareFileDiff(files[i]) if !reflect.DeepEqual(test.Output[i], files[i]) { exp, _ := json.MarshalIndent(test.Output[i], "", " ") act, _ := json.MarshalIndent(files[i], "", " ") @@ -740,3 +784,13 @@ func asTime(s string) time.Time { } return t } + +// prepareFileDiff performs a minimal preparation for DeepEqual +func prepareFileDiff(f *File) { + if f == nil { + return + } + if f.PatchHeader != nil { + f.PatchHeader.AuthorDate = time.Unix(f.PatchHeader.AuthorDate.Unix(), 0) + } +} diff --git a/gitdiff/patch_header.go b/gitdiff/patch_header.go index a4671b7..e944667 100644 --- a/gitdiff/patch_header.go +++ b/gitdiff/patch_header.go @@ -294,10 +294,14 @@ func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { if title != "" { // Don't check for an appendix - body, _ := scanMessageBody(s, indent, false) + body, _, remainder := scanMessageBody(s, indent, false) if s.Err() != nil { return nil, s.Err() } + if remainder != "" { + // There was another header immediately after this one. + return ParsePatchHeader(remainder) + } h.Body = body } @@ -326,15 +330,15 @@ func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { return b.String(), indent } -func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { +func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string, string) { // Body and appendix var body, appendix strings.Builder c := &body var empty int for i := 0; s.Scan(); i++ { - line := s.Text() + baseLine := s.Text() - line = strings.TrimRightFunc(line, unicode.IsSpace) + line := strings.TrimRightFunc(baseLine, unicode.IsSpace) line = strings.TrimPrefix(line, indent) if line == "" { @@ -342,6 +346,18 @@ func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (st continue } + if baseLine == line && indent != "" { + // The line does not start with the indent. + var remainder strings.Builder + remainder.WriteString(baseLine) + remainder.WriteByte('\n') + for ; s.Scan(); i++ { + remainder.WriteString(s.Text()) + remainder.WriteByte('\n') + } + return body.String(), appendix.String(), remainder.String() + } + // If requested, parse out "appendix" information (often added // by `git format-patch` and removed by `git am`). if separateAppendix && c == &body && line == "---" { @@ -359,7 +375,7 @@ func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (st c.WriteString(line) } - return body.String(), appendix.String() + return body.String(), appendix.String(), "" } func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { @@ -402,7 +418,7 @@ func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { h.SubjectPrefix, h.Title = parseSubject(subject) s := bufio.NewScanner(msg.Body) - h.Body, h.BodyAppendix = scanMessageBody(s, "", true) + h.Body, h.BodyAppendix, _ = scanMessageBody(s, "", true) if s.Err() != nil { return nil, s.Err() } diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go index 9181b5c..1450535 100644 --- a/gitdiff/patch_header_test.go +++ b/gitdiff/patch_header_test.go @@ -441,6 +441,39 @@ Author: Morton Haypenny Title: expectedTitle, }, }, + "emptyCommit": { + Input: `commit 989f970ba7e43cd0eac6fcf71acfd9c92effc047 +Author: Morton Haypenny +Date: Sat Apr 12 05:20:49 2020 -0700 + + An empty commit + + With a body. + + +commit 61f5cd90bed4d204ee3feb3aa41ee91d4734855b +Author: Morton Haypenny +Date: Sat Apr 11 15:21:23 2020 -0700 + + A sample commit to test header parsing + + + The medium format shows the body, which + may wrap on to multiple lines. + + + Another body line. + + +`, + Header: PatchHeader{ + SHA: expectedSHA, + Author: expectedIdentity, + AuthorDate: expectedDate, + Title: expectedTitle, + Body: expectedBody, + }, + }, } for name, test := range tests { diff --git a/gitdiff/testdata/empty-commit.patch b/gitdiff/testdata/empty-commit.patch new file mode 100644 index 0000000..8896704 --- /dev/null +++ b/gitdiff/testdata/empty-commit.patch @@ -0,0 +1,83 @@ + +commit 48f13f61afb200ad8386573632cba3abd1703af2 +Author: Morton Haypenny +Date: Thu Nov 3 12:29:49 2017 +0000 + + A simple change + + The change is simple. + +diff --git a/dir/file1.txt b/dir/file1.txt +index 422cce7..24b39ed 100644 +--- a/dir/file1.txt ++++ b/dir/file1.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 + +commit dbbc5f1e926391d737e397ec60cc1ff94787e105 +Author: Regina Smithee +Date: Thu Sep 29 12:58:35 2017 +0000 + + Merged commit 3 + + Merged body 3. + + +commit f90c2e23af6618388cb4082f39849476e39105af +Merge: 989f970 e84c7ab +Author: Regina Smithee +Date: Thu Sep 15 16:37:30 2017 +0000 + + Merged commit 2 + + +commit 989f970ba7e43cd0eac6fcf71acfd9c92effc047 +Merge: 183d9cd e84c7ab +Author: Regina Smithee +Date: Thu Sep 15 14:41:57 2017 +0000 + + Merged commit 1 + + +commit 183d9cdbecda47e6e95acf9e4e23fa3a71ba99ad +Author: Regina Smithee +Date: Wed Sep 14 19:46:12 2017 +0000 + + Simple change + + Simple change body. + +diff --git a/d1/file2.txt b/d1/file2.txt +index c6513ff..bf4c6cc 100755 +--- a/d1/file2.txt ++++ b/d1/file2.txt +@@ -3,6 +3,8 @@ fragment 1 + context line +-old line 1 +-old line 2 + context line ++new line 1 ++new line 2 ++new line 3 + context line +-old line 3 ++new line 4 ++new line 5 +@@ -31,2 +33,2 @@ fragment 2 + context line +-old line 4 ++new line 6 diff --git a/go.mod b/go.mod index c547433..9f6af25 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/gitleaks/go-gitdiff go 1.13 + +require github.com/google/go-cmp v0.6.0 // indirect diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..5a8d551 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=