From 3202072ef65ab911f13d0871c304e4789a8b42ea Mon Sep 17 00:00:00 2001 From: iambus Date: Mon, 13 Dec 2021 17:02:34 +0800 Subject: [PATCH 1/8] fix panic: runtime error: slice bounds out of range --- diffmatchpatch/diff.go | 16 ++++------------ diffmatchpatch/patch_test.go | 25 +++++++++++++++++++++++++ diffmatchpatch/stringutil.go | 18 ++++-------------- 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 2a9f2dc..9b0e4ab 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -34,8 +34,6 @@ const ( DiffInsert Operation = 1 // DiffEqual item represents an equal diff. DiffEqual Operation = 0 - //IndexSeparator is used to seperate the array indexes in an index string - IndexSeparator = "," ) // Diff represents one diff operation @@ -406,17 +404,11 @@ func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { - chars := strings.Split(aDiff.Text, IndexSeparator) - text := make([]string, len(chars)) - - for i, r := range chars { - i1, err := strconv.Atoi(r) - if err == nil { - text[i] = lineArray[i1] - } + var sb strings.Builder + for _, i := range []rune(aDiff.Text) { + sb.WriteString(lineArray[i]) } - - aDiff.Text = strings.Join(text, "") + aDiff.Text = sb.String() hydrated = append(hydrated, aDiff) } return hydrated diff --git a/diffmatchpatch/patch_test.go b/diffmatchpatch/patch_test.go index b019f88..c564f8c 100644 --- a/diffmatchpatch/patch_test.go +++ b/diffmatchpatch/patch_test.go @@ -337,3 +337,28 @@ func TestPatchApply(t *testing.T) { assert.Equal(t, tc.ExpectedApplies, actualApplies, fmt.Sprintf("Test case #%d, %s", i, tc.Name)) } } + +func TestPatchMakeOutOfRangePanic(t *testing.T) { + text1 := ` + 1111111111111 000000 + ------------- ------ + xxxxxxxxxxxxx ------ + xxxxxxxxxxxxx ------ + xxxxxxxxxxxxx xxxxxx + xxxxxxxxxxxxx ...... + xxxxxxxxxxxxx 111111 + xxxxxxxxxxxxx ?????? + xxxxxxxxxxxxx 333333 + xxxxxxxxxxxxx 555555 + xxxxxxxxxx xxxxx + xxxxxxxxxx xxxxx + xxxxxxxxxx xxxxx + xxxxxxxxxx xxxxx +` + text2 := ` + 2222222222222 000000 + xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx` + dmp := New() + patches := dmp.PatchMake(text1, text2) + assert.Equal(t, 6, len(patches), "TestPatchMakeOutOfRangePanic") +} diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index 44c4359..da0257c 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go @@ -9,7 +9,6 @@ package diffmatchpatch import ( - "strconv" "strings" "unicode/utf8" ) @@ -89,18 +88,9 @@ func runesIndex(r1, r2 []rune) int { } func intArrayToString(ns []uint32) string { - if len(ns) == 0 { - return "" + runes := make([]rune, len(ns)) + for i := 0; i < len(ns); i++ { + runes[i] = rune(ns[i]) } - - indexSeparator := IndexSeparator[0] - - // Appr. 3 chars per num plus the comma. - b := []byte{} - for _, n := range ns { - b = strconv.AppendInt(b, int64(n), 10) - b = append(b, indexSeparator) - } - b = b[:len(b)-1] - return string(b) + return string(runes) } From ca1db569574d5f4009033235fb6b9246208b0187 Mon Sep 17 00:00:00 2001 From: moyrn Date: Tue, 1 Mar 2022 09:18:52 +0800 Subject: [PATCH 2/8] fix bug: CVE-2019-11254 upgrade gopkg.in/yaml.v2 --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index d46c77d..c731033 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ require ( github.com/kr/pretty v0.1.0 // indirect github.com/stretchr/testify v1.4.0 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect - gopkg.in/yaml.v2 v2.2.4 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect ) go 1.12 diff --git a/go.sum b/go.sum index 4b80e08..8dd9f36 100644 --- a/go.sum +++ b/go.sum @@ -19,3 +19,5 @@ gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= From 3403a1642ff93712b699ebd52ecf3c53c2b08107 Mon Sep 17 00:00:00 2001 From: Boyu Guo Date: Fri, 23 Sep 2022 10:44:03 +0800 Subject: [PATCH 3/8] update tests --- diffmatchpatch/diff_test.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index acb97e3..7c8c9a1 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -314,10 +314,10 @@ func TestDiffLinesToChars(t *testing.T) { dmp := New() for i, tc := range []TestCase{ - {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, - {"a", "b", "1", "2", []string{"", "a", "b"}}, + {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, + {"a", "b", "\x01", "\x02", []string{"", "a", "b"}}, // Omit final newline. - {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, + {"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -330,14 +330,14 @@ func TestDiffLinesToChars(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - var charList []string + var charList []rune for x := 1; x < n+1; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } lines := strings.Join(lineList, "") - chars := strings.Join(charList[:], ",") - assert.Equal(t, n, len(strings.Split(chars, ","))) + chars := string(charList) + assert.Equal(t, n, len(charList)) actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "") assert.Equal(t, chars, actualChars1) @@ -358,8 +358,8 @@ func TestDiffCharsToLines(t *testing.T) { for i, tc := range []TestCase{ { Diffs: []Diff{ - {DiffEqual, "1,2,1"}, - {DiffInsert, "2,1,2"}, + {DiffEqual, "\x01\x02\x01"}, + {DiffInsert, "\x02\x01\x02"}, }, Lines: []string{"", "alpha\n", "beta\n"}, @@ -378,13 +378,13 @@ func TestDiffCharsToLines(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - charList := []string{} + charList := []rune{} for x := 1; x <= n; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } assert.Equal(t, n, len(charList)) - chars := strings.Join(charList[:], ",") + chars := string(charList) actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList) assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual) From ff0d34ba8093cd989afb5ead57451452a16df340 Mon Sep 17 00:00:00 2001 From: Boyu Guo Date: Tue, 27 Sep 2022 01:17:47 +0800 Subject: [PATCH 4/8] skip invalid unicode code point to fix TestMassiveRuneDiffConversion --- diffmatchpatch/diff.go | 27 ++++++++++++++++----------- diffmatchpatch/diff_test.go | 12 ++++++------ diffmatchpatch/index.go | 32 ++++++++++++++++++++++++++++++++ diffmatchpatch/index_test.go | 16 ++++++++++++++++ diffmatchpatch/stringutil.go | 8 -------- 5 files changed, 70 insertions(+), 25 deletions(-) create mode 100644 diffmatchpatch/index.go create mode 100644 diffmatchpatch/index_test.go diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 9b0e4ab..25cc51f 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -388,16 +388,21 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, } // DiffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line. -// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) { - chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return chars1, chars2, lineArray + chars1, chars2, lineArray := dmp.diffLinesToIndexes(text1, text2) + return indexesToString(chars1), indexesToString(chars2), lineArray } // DiffLinesToRunes splits two texts into a list of runes. func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { + chars1, chars2, lineArray := dmp.diffLinesToIndexes(text1, text2) + return []rune(indexesToString(chars1)), []rune(indexesToString(chars2)), lineArray +} + +// diffLinesToIndexes splits two texts into a list of indexes +func (dmp *DiffMatchPatch) diffLinesToIndexes(text1, text2 string) ([]index, []index, []string) { chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return []rune(chars1), []rune(chars2), lineArray + return chars1, chars2, lineArray } // DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. @@ -405,7 +410,7 @@ func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) [] hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { var sb strings.Builder - for _, i := range []rune(aDiff.Text) { + for _, i := range stringToIndex(aDiff.Text) { sb.WriteString(lineArray[i]) } aDiff.Text = sb.String() @@ -1301,7 +1306,7 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di } // diffLinesToStrings splits two texts into a list of strings. Each string represents one line. -func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { +func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) ([]index, []index, []string) { // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' @@ -1309,16 +1314,16 @@ func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, stri strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) - return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray + return strIndexArray1, strIndexArray2, lineArray } // diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. -func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { +func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []index { // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 lineStart := 0 lineEnd := -1 - strs := []uint32{} + strs := []index{} for lineEnd < len(text)-1 { lineEnd = indexOf(text, "\n", lineStart) @@ -1332,11 +1337,11 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str lineValue, ok := lineHash[line] if ok { - strs = append(strs, uint32(lineValue)) + strs = append(strs, index(lineValue)) } else { *lineArray = append(*lineArray, line) lineHash[line] = len(*lineArray) - 1 - strs = append(strs, uint32(len(*lineArray)-1)) + strs = append(strs, index(len(*lineArray)-1)) } } diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index 7c8c9a1..7955b1f 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -330,13 +330,13 @@ func TestDiffLinesToChars(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - var charList []rune + var charList []index for x := 1; x < n+1; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, rune(x)) + charList = append(charList, index(x)) } lines := strings.Join(lineList, "") - chars := string(charList) + chars := indexesToString(charList) assert.Equal(t, n, len(charList)) actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "") @@ -378,13 +378,13 @@ func TestDiffCharsToLines(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - charList := []rune{} + charList := []index{} for x := 1; x <= n; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, rune(x)) + charList = append(charList, index(x)) } assert.Equal(t, n, len(charList)) - chars := string(charList) + chars := indexesToString(charList) actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList) assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual) diff --git a/diffmatchpatch/index.go b/diffmatchpatch/index.go new file mode 100644 index 0000000..965a1c6 --- /dev/null +++ b/diffmatchpatch/index.go @@ -0,0 +1,32 @@ +package diffmatchpatch + +type index uint32 + +const runeSkipStart = 0xd800 +const runeSkipEnd = 0xdfff + 1 +const runeMax = 0x110000 // next invalid code point + +func stringToIndex(text string) []index { + runes := []rune(text) + indexes := make([]index, len(runes)) + for i, r := range runes { + if r < runeSkipEnd { + indexes[i] = index(r) + } else { + indexes[i] = index(r) - (runeSkipEnd - runeSkipStart) + } + } + return indexes +} + +func indexesToString(indexes []index) string { + runes := make([]rune, len(indexes)) + for i, index := range indexes { + if index < runeSkipStart { + runes[i] = rune(index) + } else { + runes[i] = rune(index + (runeSkipEnd - runeSkipStart)) + } + } + return string(runes) +} diff --git a/diffmatchpatch/index_test.go b/diffmatchpatch/index_test.go new file mode 100644 index 0000000..6f1d982 --- /dev/null +++ b/diffmatchpatch/index_test.go @@ -0,0 +1,16 @@ +package diffmatchpatch + +import ( + "github.com/stretchr/testify/assert" + "testing" +) + +func TestIndexConversion(t *testing.T) { + n := runeMax - (runeSkipEnd - runeSkipStart) + indexes := make([]index, n) + for i := 0; i < n; i++ { + indexes[i] = index(i) + } + indexes2 := stringToIndex(indexesToString(indexes)) + assert.EqualValues(t, indexes, indexes2) +} diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index da0257c..265f29c 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go @@ -86,11 +86,3 @@ func runesIndex(r1, r2 []rune) int { } return -1 } - -func intArrayToString(ns []uint32) string { - runes := make([]rune, len(ns)) - for i := 0; i < len(ns); i++ { - runes[i] = rune(ns[i]) - } - return string(runes) -} From 6dbe13c91e8e382b53e2cc8e8146fa6172fefec4 Mon Sep 17 00:00:00 2001 From: nrnrk Date: Sun, 16 Oct 2022 01:17:33 +0900 Subject: [PATCH 5/8] fix: use common lineHash to share indice between text1 and text2 Use common cache of line contents between two texts in `DiffLinesToChars` to get line diffs correctly. In some cases, line diffs cannot be retrieved correctly in the standard way (https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs#line-mode). In the below case, we failed to get line diffs correctly before this fix. ```go:main.go package main import ( "fmt" "github.com/sergi/go-diff/diffmatchpatch" ) const ( text1 = `hoge: step11: - arrayitem1 - arrayitem2 step12: step21: hoge step22: -93 fuga: flatitem ` text2 = `hoge: step11: - arrayitem4 - arrayitem2 - arrayitem3 step12: step21: hoge step22: -92 fuga: flatitem ` ) func main() { dmp := diffmatchpatch.New() a, b, c := dmp.DiffLinesToChars(text1, text2) diffs := dmp.DiffMain(a, b, false) diffs = dmp.DiffCharsToLines(diffs, c) // diffs = dmp.DiffCleanupSemantic(diffs) fmt.Println(diffs) } ``` ```text:output [{Insert hoge: step11: hoge: } {Equal hoge: } {Insert hoge: } {Equal step11: } {Insert hoge: } {Equal - arrayitem1 } {Insert hoge: } {Equal - arrayitem2 } {Insert hoge: } {Equal step12: } {Insert hoge: } {Equal step21: hoge } {Insert hoge: } {Equal step22: -93 } {Delete fuga: flatitem }] ``` Note: This fix corresponds to a javascript implementation. (ref: https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L466) --- diffmatchpatch/diff.go | 8 ++++---- diffmatchpatch/diff_test.go | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 2a9f2dc..4f7b424 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -1313,17 +1313,17 @@ func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, stri // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' + lineHash := make(map[string]int) //Each string has the index of lineArray which it points to - strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) - strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) + strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash) + strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash) return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray } // diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. -func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { +func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 { // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. - lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 lineStart := 0 lineEnd := -1 strs := []uint32{} diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index acb97e3..d6fed50 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -318,6 +318,8 @@ func TestDiffLinesToChars(t *testing.T) { {"a", "b", "1", "2", []string{"", "a", "b"}}, // Omit final newline. {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, + // Same lines in Text1 and Text2 + {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) From 74798f5aa5d5c9720e7ff8895fee08e19db05cc7 Mon Sep 17 00:00:00 2001 From: Darkhan Kubigenov Date: Sun, 22 Jan 2023 01:50:45 +0000 Subject: [PATCH 6/8] Add a failing line diff test case Current implementation produces wrong result because it calls `DiffMain` on the following 2 arguments: * `1,2,3,4,5,6,7,8,9,10` * `1,2,3,4,5,6,7,8,9,11` This numbers represent indices into the lines array. The algorithm finds that equal part of those strings is `1,2,3,4,5,6,7,8,9,1` and which is followed by `Delete 0` and `Insert `1`. --- diffmatchpatch/diff_test.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index d6fed50..95a7c8d 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -1468,6 +1468,38 @@ func TestMassiveRuneDiffConversion(t *testing.T) { assert.NotEmpty(t, diffs) } +func TestDiffPartialLineIndex(t *testing.T) { + dmp := New() + t1, t2, tt := dmp.DiffLinesToChars( +`line 1 +line 2 +line 3 +line 4 +line 5 +line 6 +line 7 +line 8 +line 9 +line 10 text1`, +`line 1 +line 2 +line 3 +line 4 +line 5 +line 6 +line 7 +line 8 +line 9 +line 10 text2`) + diffs := dmp.DiffMain(t1, t2, false) + diffs = dmp.DiffCharsToLines(diffs, tt) + assert.Equal(t, []Diff{ + Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"}, + Diff{DiffDelete, "line 10 text1"}, + Diff{DiffInsert, "line 10 text2"}, + }, diffs) +} + func BenchmarkDiffMain(bench *testing.B) { s1 := "`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n" s2 := "I am the very model of a modern major general,\nI've information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n" From a674b3095807a3679f154c5a4a6df738aeffdfdf Mon Sep 17 00:00:00 2001 From: Darkhan Kubigenov Date: Sat, 21 Jan 2023 23:12:39 +0000 Subject: [PATCH 7/8] Fix line diff by using runes without separators [The suggested approach](https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs#line-mode ) for doing line level diffing is the following set of steps: 1. `ti1, ti2, linesIdx = DiffLinesToChars(t1, t2)` 2. `diffs = DiffMain(ti1, ti2)` 3. `DiffCharsToLines(diff, linesIdx)` The original implementation in `google/diff-match-patch` uses unicode codepoints for storing indices in `ti1` and `ti2` joined by an empty string. Current implementation in this repo stores them as integers joined by a comma. While this implementation makes `ti1` and `ti2` more readable, it introduces bugs when trying to rely on it when doing line level diffing with `DiffMain`. The root cause of the issue is that an integer line index might span more than one character/rune, and `DiffMain` can assume that two different lines having the same index prefix match partially. For example, indices 123 and 129 will have partial match `12`. In that example, the diff will show lines 3 and 9 which is not correct. A simple failing test case demonstrating this issue is available at `TestDiffPartialLineIndex`. In this PR I am adjusting the algorithm to use the same approach as in [diff-match-patch](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L508-L510 ) by storing each line index as a rune. While a rune in Golang is a type alias to uint32, not every uint32 can be a valid rune. During string to rune slice conversion invalid runes will be replaced with `utf.RuneError`. The integer to rune generation logic is based on the table in https://en.wikipedia.org/wiki/UTF-8#Encoding The first 127 lines will work the fastest as they are represented as a single bytes. Higher numbers are represented as 2-4 bytes. In addition to that, the range `U+D800 - U+DFFF` contains [invalid codepoints](https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling). and all codepoints higher or equal to `0xD800` are incremented by `0xDFFF - 0xD800`. The maximum representable integer using this approach is 1'112'060. This improves on Javascript implementation which currently [bails out](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503-L505 ) when files have more than 65535 lines. --- diffmatchpatch/diff.go | 13 ++-- diffmatchpatch/diff_test.go | 38 ++++++------ diffmatchpatch/stringutil.go | 100 +++++++++++++++++++++++++++--- diffmatchpatch/stringutil_test.go | 18 ++++++ go.mod | 2 +- 5 files changed, 133 insertions(+), 38 deletions(-) diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 4f7b424..915d509 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -34,8 +34,6 @@ const ( DiffInsert Operation = 1 // DiffEqual item represents an equal diff. DiffEqual Operation = 0 - //IndexSeparator is used to seperate the array indexes in an index string - IndexSeparator = "," ) // Diff represents one diff operation @@ -406,14 +404,11 @@ func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { - chars := strings.Split(aDiff.Text, IndexSeparator) - text := make([]string, len(chars)) + runes := []rune(aDiff.Text) + text := make([]string, len(runes)) - for i, r := range chars { - i1, err := strconv.Atoi(r) - if err == nil { - text[i] = lineArray[i1] - } + for i, r := range runes { + text[i] = lineArray[runeToInt(r)] } aDiff.Text = strings.Join(text, "") diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index 95a7c8d..4532b79 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -314,12 +314,12 @@ func TestDiffLinesToChars(t *testing.T) { dmp := New() for i, tc := range []TestCase{ - {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, - {"a", "b", "1", "2", []string{"", "a", "b"}}, + {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, + {"a", "b", "\x01", "\x02", []string{"", "a", "b"}}, // Omit final newline. - {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, + {"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}}, // Same lines in Text1 and Text2 - {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, + {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -332,14 +332,13 @@ func TestDiffLinesToChars(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - var charList []string + var charList []rune for x := 1; x < n+1; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } lines := strings.Join(lineList, "") - chars := strings.Join(charList[:], ",") - assert.Equal(t, n, len(strings.Split(chars, ","))) + chars := string(charList) actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "") assert.Equal(t, chars, actualChars1) @@ -360,8 +359,8 @@ func TestDiffCharsToLines(t *testing.T) { for i, tc := range []TestCase{ { Diffs: []Diff{ - {DiffEqual, "1,2,1"}, - {DiffInsert, "2,1,2"}, + {DiffEqual, "\x01\x02\x01"}, + {DiffInsert, "\x02\x01\x02"}, }, Lines: []string{"", "alpha\n", "beta\n"}, @@ -380,13 +379,12 @@ func TestDiffCharsToLines(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - charList := []string{} + charList := []rune{} for x := 1; x <= n; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } - assert.Equal(t, n, len(charList)) - chars := strings.Join(charList[:], ",") + chars := string(charList) actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList) assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual) @@ -1471,7 +1469,7 @@ func TestMassiveRuneDiffConversion(t *testing.T) { func TestDiffPartialLineIndex(t *testing.T) { dmp := New() t1, t2, tt := dmp.DiffLinesToChars( -`line 1 + `line 1 line 2 line 3 line 4 @@ -1481,7 +1479,7 @@ line 7 line 8 line 9 line 10 text1`, -`line 1 + `line 1 line 2 line 3 line 4 @@ -1494,10 +1492,10 @@ line 10 text2`) diffs := dmp.DiffMain(t1, t2, false) diffs = dmp.DiffCharsToLines(diffs, tt) assert.Equal(t, []Diff{ - Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"}, - Diff{DiffDelete, "line 10 text1"}, - Diff{DiffInsert, "line 10 text2"}, - }, diffs) + Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"}, + Diff{DiffDelete, "line 10 text1"}, + Diff{DiffInsert, "line 10 text2"}, + }, diffs) } func BenchmarkDiffMain(bench *testing.B) { diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index 44c4359..eb727bb 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go @@ -9,11 +9,16 @@ package diffmatchpatch import ( - "strconv" + "fmt" "strings" "unicode/utf8" ) +const UNICODE_INVALID_RANGE_START = 0xD800 +const UNICODE_INVALID_RANGE_END = 0xDFFF +const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1 +const UNICODE_RANGE_MAX = 0x10FFFF + // unescaper unescapes selected chars for compatibility with JavaScript's encodeURI. // In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc. var unescaper = strings.NewReplacer( @@ -93,14 +98,93 @@ func intArrayToString(ns []uint32) string { return "" } - indexSeparator := IndexSeparator[0] - - // Appr. 3 chars per num plus the comma. - b := []byte{} + b := []rune{} for _, n := range ns { - b = strconv.AppendInt(b, int64(n), 10) - b = append(b, indexSeparator) + b = append(b, intToRune(n)) } - b = b[:len(b)-1] return string(b) } + +// These constants define the number of bits representable +// in 1,2,3,4 byte utf8 sequences, respectively. +const ONE_BYTE_BITS = 7 +const TWO_BYTE_BITS = 11 +const THREE_BYTE_BITS = 16 +const FOUR_BYTE_BITS = 21 + +// Helper for getting a sequence of bits from an integer. +func getBits(i uint32, cnt byte, from byte) byte { + return byte((i >> from) & ((1 << cnt) - 1)) +} + +// Converts an integer in the range 0~1112060 into a rune. +// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8 +func intToRune(i uint32) rune { + if i < (1 << ONE_BYTE_BITS) { + return rune(i) + } + + if i < (1 << TWO_BYTE_BITS) { + r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)}) + if size != 2 || r == utf8.RuneError { + panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i)) + } + return r + } + + // Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range + // was returning utf8.RuneError during encoding. + if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) { + if i >= UNICODE_INVALID_RANGE_START { + i += UNICODE_INVALID_RANGE_DELTA + } + + r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)}) + if size != 3 || r == utf8.RuneError { + panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i)) + } + return r + } + + if i < (1<= UNICODE_INVALID_RANGE_END { + return result - UNICODE_INVALID_RANGE_DELTA + } + + return result + } + + if size == 4 { + result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111) + return result - UNICODE_INVALID_RANGE_DELTA - 3 + } + + panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size)) +} diff --git a/diffmatchpatch/stringutil_test.go b/diffmatchpatch/stringutil_test.go index ab2bc10..73ab6ca 100644 --- a/diffmatchpatch/stringutil_test.go +++ b/diffmatchpatch/stringutil_test.go @@ -114,3 +114,21 @@ func TestLastIndexOf(t *testing.T) { assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc)) } } + +// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation +// of `intToRune() -> runeToInt()`. +// This test is slow and runs longer than 5 seconds but it does provide a safety +// guarantee that these 2 functions are correct for the ranges we support. +func TestRuneToInt(t *testing.T) { + + for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 { + r := intToRune(i) + ic := runeToInt(r) + + assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic)) + } + + assert.Panics(t, func() { + intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2) + }) +} diff --git a/go.mod b/go.mod index c731033..c7886ce 100644 --- a/go.mod +++ b/go.mod @@ -8,4 +8,4 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect ) -go 1.12 +go 1.13 From 4ebce9caa45902d8ca5f63a242d53e1e5c6072d3 Mon Sep 17 00:00:00 2001 From: michaelcheah <20640150+michaelcheah@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:37:54 +0000 Subject: [PATCH 8/8] fix: colored display for diffs with newlines --- diffmatchpatch/diff.go | 27 +++++++++++++++++++++------ diffmatchpatch/diff_test.go | 11 +++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 4f7b424..a312c1a 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -1151,13 +1151,28 @@ func (dmp *DiffMatchPatch) DiffPrettyText(diffs []Diff) string { switch diff.Type { case DiffInsert: - _, _ = buff.WriteString("\x1b[32m") - _, _ = buff.WriteString(text) - _, _ = buff.WriteString("\x1b[0m") + lines := strings.Split(text, "\n") + for i, line := range lines { + _, _ = buff.WriteString("\x1b[32m") + _, _ = buff.WriteString(line) + if i < len(lines)-1 { + _, _ = buff.WriteString("\x1b[0m\n") + } else { + _, _ = buff.WriteString("\x1b[0m") + } + } + case DiffDelete: - _, _ = buff.WriteString("\x1b[31m") - _, _ = buff.WriteString(text) - _, _ = buff.WriteString("\x1b[0m") + lines := strings.Split(text, "\n") + for i, line := range lines { + _, _ = buff.WriteString("\x1b[31m") + _, _ = buff.WriteString(line) + if i < len(lines)-1 { + _, _ = buff.WriteString("\x1b[0m\n") + } else { + _, _ = buff.WriteString("\x1b[0m") + } + } case DiffEqual: _, _ = buff.WriteString(text) } diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index d6fed50..04329d6 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -1033,6 +1033,17 @@ func TestDiffPrettyText(t *testing.T) { Expected: "a\n\x1b[31mb\x1b[0m\x1b[32mc&d\x1b[0m", }, + { + Diffs: []Diff{ + {Type: DiffEqual, Text: "a\n"}, + {Type: DiffDelete, Text: "b\nc\n"}, + {Type: DiffEqual, Text: "def"}, + {Type: DiffInsert, Text: "\ng\nh"}, + {Type: DiffEqual, Text: "\ni"}, + }, + + Expected: "a\n\x1b[31mb\x1b[0m\n\x1b[31mc\x1b[0m\n\x1b[31m\x1b[0mdef\x1b[32m\x1b[0m\n\x1b[32mg\x1b[0m\n\x1b[32mh\x1b[0m\ni", + }, } { actual := dmp.DiffPrettyText(tc.Diffs) assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))