From e1d19bfa241c8ecbcfd151ceef2fe410bd8bfff2 Mon Sep 17 00:00:00 2001 From: Lily Skye Date: Thu, 19 Feb 2026 16:22:28 -0700 Subject: [PATCH] use dlclark/regexp2 for highlighting so syntax yaml files can use lookahead/lookbehind --- go.mod | 1 + go.sum | 2 + pkg/highlight/highlight_test.go | 229 ++++++++++++++++++++++++++++++++ pkg/highlight/highlighter.go | 67 +++++----- pkg/highlight/parser.go | 42 +++--- pkg/highlight/regexp.go | 57 ++++++++ 6 files changed, 347 insertions(+), 51 deletions(-) create mode 100644 pkg/highlight/highlight_test.go create mode 100644 pkg/highlight/regexp.go diff --git a/go.mod b/go.mod index 2e243c271d..4cc348ee08 100644 --- a/go.mod +++ b/go.mod @@ -24,6 +24,7 @@ require ( require ( github.com/creack/pty v1.1.18 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/dlclark/regexp2 v1.11.5 // indirect github.com/gdamore/encoding v1.0.0 // indirect github.com/lucasb-eyer/go-colorful v1.0.3 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/go.sum b/go.sum index 2acd88f55d..7e371eccf3 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZQ= +github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/gdamore/encoding v1.0.0 h1:+7OoQ1Bc6eTm5niUzBa0Ctsh6JbMW6Ra+YNuAtDBdko= diff --git a/pkg/highlight/highlight_test.go b/pkg/highlight/highlight_test.go new file mode 100644 index 0000000000..4ebb6f0e90 --- /dev/null +++ b/pkg/highlight/highlight_test.go @@ -0,0 +1,229 @@ +package highlight + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// resetGroups clears the global group registry so tests don't interfere with each other. +func resetGroups() { + Groups = make(map[string]Group) + numGroups = 0 +} + +// makeHighlighter parses a YAML syntax definition and returns a Highlighter. +func makeHighlighter(t *testing.T, yaml string) *Highlighter { + t.Helper() + data := []byte(yaml) + + header, err := MakeHeaderYaml(data) + if err != nil { + t.Fatalf("MakeHeaderYaml: %v", err) + } + + f, err := ParseFile(data) + if err != nil { + t.Fatalf("ParseFile: %v", err) + } + + def, err := ParseDef(f, header) + if err != nil { + t.Fatalf("ParseDef: %v", err) + } + + return NewHighlighter(def) +} + +// groupAt returns the Group name at a given character position in a LineMatch. +// It walks backwards from pos to find the most recent color change. +func groupAt(lm LineMatch, pos int) string { + best := -1 + for k := range lm { + if k <= pos && k > best { + best = k + } + } + if best < 0 { + return "" + } + return lm[best].String() +} + +func TestLookahead(t *testing.T) { + resetGroups() + assert := assert.New(t) + + h := makeHighlighter(t, ` +filetype: test-lookahead +detect: + filename: "\\.test$" +rules: + - identifier.function: "\\w+(?=\\()" +`) + + matches := h.HighlightString("foo(bar)") + assert.Len(matches, 1) + + lm := matches[0] + + // "foo" (positions 0-2) should be highlighted as identifier.function + assert.Equal("identifier.function", groupAt(lm, 0)) + assert.Equal("identifier.function", groupAt(lm, 1)) + assert.Equal("identifier.function", groupAt(lm, 2)) + + // "(" at position 3 should NOT be identifier.function (lookahead doesn't consume) + assert.NotEqual("identifier.function", groupAt(lm, 3)) + + // "bar" should not match (not followed by "(") + assert.NotEqual("identifier.function", groupAt(lm, 4)) +} + +func TestNegativeLookahead(t *testing.T) { + resetGroups() + assert := assert.New(t) + + h := makeHighlighter(t, ` +filetype: test-neg-lookahead +detect: + filename: "\\.test$" +rules: + - identifier: "foo(?!bar)" +`) + + matches := h.HighlightString("foobar foobaz") + assert.Len(matches, 1) + + lm := matches[0] + + // "foobar": "foo" is followed by "bar", so negative lookahead fails — no match + assert.NotEqual("identifier", groupAt(lm, 0)) + + // "foobaz": "foo" at position 7 is NOT followed by "bar", so it matches + assert.Equal("identifier", groupAt(lm, 7)) + assert.Equal("identifier", groupAt(lm, 8)) + assert.Equal("identifier", groupAt(lm, 9)) + + // "baz" at position 10 should not be highlighted + assert.NotEqual("identifier", groupAt(lm, 10)) +} + +func TestLookbehind(t *testing.T) { + resetGroups() + assert := assert.New(t) + + h := makeHighlighter(t, ` +filetype: test-lookbehind +detect: + filename: "\\.test$" +rules: + - identifier.field: "(?<=\\.)\\w+" +`) + + matches := h.HighlightString("obj.field") + assert.Len(matches, 1) + + lm := matches[0] + + // "obj" (positions 0-2) should NOT be highlighted + assert.NotEqual("identifier.field", groupAt(lm, 0)) + + // "." at position 3 should NOT be highlighted (lookbehind doesn't consume) + assert.NotEqual("identifier.field", groupAt(lm, 3)) + + // "field" (positions 4-8) should be highlighted + assert.Equal("identifier.field", groupAt(lm, 4)) + assert.Equal("identifier.field", groupAt(lm, 5)) + assert.Equal("identifier.field", groupAt(lm, 8)) +} + +func TestNegativeLookbehind(t *testing.T) { + resetGroups() + assert := assert.New(t) + + h := makeHighlighter(t, ` +filetype: test-neg-lookbehind +detect: + filename: "\\.test$" +rules: + - identifier: "(?= len(str) { - return CharacterCount(str) - } - return CharacterCount(str[:p]) -} - // A State represents the region at the end of a line type State *region @@ -82,30 +72,47 @@ func NewHighlighter(def *Def) *Highlighter { // color's group (represented as one byte) type LineMatch map[int]Group -func findIndex(regex *regexp.Regexp, skip *regexp.Regexp, str []byte) []int { - var strbytes []byte +func findIndex(regex *regexp2.Regexp, skip *regexp2.Regexp, str []byte) []int { + searchStr := string(str) if skip != nil { - strbytes = skip.ReplaceAllFunc(str, func(match []byte) []byte { - res := make([]byte, CharacterCount(match)) - return res - }) - } else { - strbytes = str + replaced, err := skip.ReplaceFunc(searchStr, func(m regexp2.Match) string { + return strings.Repeat(" ", m.Length) + }, 0, -1) + if err != nil { + log.Printf("highlight: regex timeout in skip replace for pattern %q: %v", skip.String(), err) + } else { + searchStr = replaced + } } - match := regex.FindIndex(strbytes) - if match == nil { + m, err := regex.FindStringMatch(searchStr) + if err != nil { + log.Printf("highlight: regex timeout finding match for pattern %q: %v", regex.String(), err) + return nil + } + if m == nil { return nil } - // return []int{match.Index, match.Index + match.Length} - return []int{runePos(match[0], str), runePos(match[1], str)} + return []int{charPosFromRunePos(m.Index, str), charPosFromRunePos(m.Index+m.Length, str)} } -func findAllIndex(regex *regexp.Regexp, str []byte) [][]int { - matches := regex.FindAllIndex(str, -1) - for i, m := range matches { - matches[i][0] = runePos(m[0], str) - matches[i][1] = runePos(m[1], str) +func findAllIndex(regex *regexp2.Regexp, str []byte) [][]int { + var matches [][]int + m, err := regex.FindStringMatch(string(str)) + if err != nil { + log.Printf("highlight: regex timeout finding matches for pattern %q: %v", regex.String(), err) + return nil + } + for m != nil { + matches = append(matches, []int{ + charPosFromRunePos(m.Index, str), + charPosFromRunePos(m.Index+m.Length, str), + }) + m, err = regex.FindNextMatch(m) + if err != nil { + log.Printf("highlight: regex timeout finding next match for pattern %q: %v", regex.String(), err) + break + } } return matches } diff --git a/pkg/highlight/parser.go b/pkg/highlight/parser.go index 804e6bc0bd..c98304130d 100644 --- a/pkg/highlight/parser.go +++ b/pkg/highlight/parser.go @@ -4,8 +4,8 @@ import ( "bytes" "errors" "fmt" - "regexp" + "github.com/dlclark/regexp2" "gopkg.in/yaml.v2" ) @@ -38,9 +38,9 @@ type Def struct { type Header struct { FileType string - FileNameRegex *regexp.Regexp - HeaderRegex *regexp.Regexp - SignatureRegex *regexp.Regexp + FileNameRegex *regexp2.Regexp + HeaderRegex *regexp2.Regexp + SignatureRegex *regexp2.Regexp } type HeaderYaml struct { @@ -62,7 +62,7 @@ type File struct { // the regular expression to match the pattern type pattern struct { group Group - regex *regexp.Regexp + regex *regexp2.Regexp } // rules defines which patterns and regions can be used to highlight @@ -82,9 +82,9 @@ type region struct { group Group limitGroup Group parent *region - start *regexp.Regexp - end *regexp.Regexp - skip *regexp.Regexp + start *regexp2.Regexp + end *regexp2.Regexp + skip *regexp2.Regexp rules *rules } @@ -109,13 +109,13 @@ func MakeHeader(data []byte) (*Header, error) { signatureRegexStr := string(lines[3]) if fnameRegexStr != "" { - header.FileNameRegex, err = regexp.Compile(fnameRegexStr) + header.FileNameRegex, err = compileRegex(fnameRegexStr) } if err == nil && headerRegexStr != "" { - header.HeaderRegex, err = regexp.Compile(headerRegexStr) + header.HeaderRegex, err = compileRegex(headerRegexStr) } if err == nil && signatureRegexStr != "" { - header.SignatureRegex, err = regexp.Compile(signatureRegexStr) + header.SignatureRegex, err = compileRegex(signatureRegexStr) } if err != nil { @@ -138,13 +138,13 @@ func MakeHeaderYaml(data []byte) (*Header, error) { header.FileType = hdrYaml.FileType if hdrYaml.Detect.FNameRegexStr != "" { - header.FileNameRegex, err = regexp.Compile(hdrYaml.Detect.FNameRegexStr) + header.FileNameRegex, err = compileRegex(hdrYaml.Detect.FNameRegexStr) } if err == nil && hdrYaml.Detect.HeaderRegexStr != "" { - header.HeaderRegex, err = regexp.Compile(hdrYaml.Detect.HeaderRegexStr) + header.HeaderRegex, err = compileRegex(hdrYaml.Detect.HeaderRegexStr) } if err == nil && hdrYaml.Detect.SignatureRegexStr != "" { - header.SignatureRegex, err = regexp.Compile(hdrYaml.Detect.SignatureRegexStr) + header.SignatureRegex, err = compileRegex(hdrYaml.Detect.SignatureRegexStr) } if err != nil { @@ -157,7 +157,7 @@ func MakeHeaderYaml(data []byte) (*Header, error) { // MatchFileName will check the given file name with the stored regex func (header *Header) MatchFileName(filename string) bool { if header.FileNameRegex != nil { - return header.FileNameRegex.MatchString(filename) + return matchString(header.FileNameRegex, filename) } return false @@ -165,7 +165,7 @@ func (header *Header) MatchFileName(filename string) bool { func (header *Header) MatchFileHeader(firstLine []byte) bool { if header.HeaderRegex != nil { - return header.HeaderRegex.Match(firstLine) + return matchBytes(header.HeaderRegex, firstLine) } return false @@ -179,7 +179,7 @@ func (header *Header) HasFileSignature() bool { // MatchFileSignature will check the given line with the stored regex func (header *Header) MatchFileSignature(line []byte) bool { if header.SignatureRegex != nil { - return header.SignatureRegex.Match(line) + return matchBytes(header.SignatureRegex, line) } return false @@ -363,7 +363,7 @@ func parseRules(input []any, curRegion *region) (ru *rules, err error) { ru.includes = append(ru.includes, object) } else { // Pattern - r, err := regexp.Compile(object) + r, err := compileRegex(object) if err != nil { return nil, err } @@ -419,7 +419,7 @@ func parseRegion(group string, regionInfo map[any]any, prevRegion *region) (r *r return nil, fmt.Errorf("Empty start in %s", group) } - r.start, err = regexp.Compile(start) + r.start, err = compileRegex(start) if err != nil { return nil, err } @@ -434,7 +434,7 @@ func parseRegion(group string, regionInfo map[any]any, prevRegion *region) (r *r return nil, fmt.Errorf("Empty end in %s", group) } - r.end, err = regexp.Compile(end) + r.end, err = compileRegex(end) if err != nil { return nil, err } @@ -449,7 +449,7 @@ func parseRegion(group string, regionInfo map[any]any, prevRegion *region) (r *r return nil, fmt.Errorf("Empty skip in %s", group) } - r.skip, err = regexp.Compile(skip) + r.skip, err = compileRegex(skip) if err != nil { return nil, err } diff --git a/pkg/highlight/regexp.go b/pkg/highlight/regexp.go new file mode 100644 index 0000000000..0ab5746d9d --- /dev/null +++ b/pkg/highlight/regexp.go @@ -0,0 +1,57 @@ +package highlight + +import ( + "log" + "time" + "unicode/utf8" + + "github.com/dlclark/regexp2" +) + +// compileRegex compiles a pattern using regexp2 with a 1-second match timeout +// for backtracking safety. +func compileRegex(pattern string) (*regexp2.Regexp, error) { + re, err := regexp2.Compile(pattern, regexp2.None) + if err != nil { + return nil, err + } + re.MatchTimeout = 1 * time.Second + return re, nil +} + +// matchString returns whether re matches s. On timeout, it logs the error +// and returns false. +func matchString(re *regexp2.Regexp, s string) bool { + m, err := re.MatchString(s) + if err != nil { + log.Printf("highlight: regex timeout matching pattern %q: %v", re.String(), err) + return false + } + return m +} + +// matchBytes returns whether re matches b. On timeout, it logs the error +// and returns false. +func matchBytes(re *regexp2.Regexp, b []byte) bool { + return matchString(re, string(b)) +} + +// charPosFromRunePos converts a rune index (as returned by regexp2) to a +// character index (which skips combining marks via isMark). For ASCII text +// (the vast majority of source code), rune pos == char pos. +func charPosFromRunePos(runeIdx int, str []byte) int { + charPos := 0 + runeCount := 0 + for i := 0; i < len(str); { + if runeCount >= runeIdx { + return charPos + } + r, size := utf8.DecodeRune(str[i:]) + i += size + runeCount++ + if !isMark(r) { + charPos++ + } + } + return charPos +}