Skip to content

Commit fffef31

Browse files
committed
Introduce Lexer type and other parsing utils
1 parent 619e2a8 commit fffef31

File tree

1 file changed

+149
-0
lines changed

1 file changed

+149
-0
lines changed

internal/filter/lexer.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
//go:generate goyacc -v parser.output -o parser.go parser.y
2+
3+
package filter
4+
5+
import (
6+
"errors"
7+
"fmt"
8+
"regexp"
9+
"strings"
10+
"text/scanner"
11+
)
12+
13+
// regex matches a single rune that is allowed to appear in a filter identifier.
// Every character is permitted except the filter meta characters !&|~<>=(),
// which are consumed as operators/parentheses by the Lexer instead.
// It is consulted by isIdentRune for each rune the scanner encounters.
var regex = regexp.MustCompile("[^!&|~<>=()]")
16+
17+
// Parse wraps the auto generated yyParse function.
18+
// It parses the given filter string and returns on success a Filter instance.
19+
func Parse(expr string) (rule Filter, err error) {
20+
lex := new(Lexer)
21+
lex.IsIdentRune = isIdentRune
22+
lex.Init(strings.NewReader(expr))
23+
24+
// scanner.Init sets the error function to nil, therefore, we have to register
25+
// our error function after the scanner initialization.
26+
lex.Scanner.Error = lex.ScanError
27+
28+
// Enable parsers error verbose to get more context of the parsing failures
29+
yyErrorVerbose = true
30+
31+
defer func() {
32+
// All the grammar rules panics when encountering any errors while reducing the filter rules, so try
33+
// to recover from it and return an error instead. Since we're used a named return values, we can set
34+
// the err value even in deferred function. See https://go.dev/blog/defer-panic-and-recover
35+
if r := recover(); r != nil {
36+
lex.err = errors.New(fmt.Sprint(r))
37+
}
38+
39+
err = lex.err
40+
}()
41+
42+
yyParse(lex)
43+
44+
return lex.rule, lex.err
45+
}
46+
47+
// Lexer is used to tokenize the filter input into a set of literals.
// This is just a wrapper around the Scanner type and implements the yyLexer
// interface used by the parser.
type Lexer struct {
	scanner.Scanner

	// rule holds the parsed filter; Parse returns it after yyParse finishes.
	rule Filter
	// err records a scanning or parsing error; Parse returns it to the caller.
	err error
}
55+
56+
func (l *Lexer) Lex(yyval *yySymType) int {
57+
token := l.Scan()
58+
lit := l.TokenText()
59+
yyval.text = lit
60+
if token == scanner.Ident {
61+
return T_IDENTIFIER
62+
}
63+
64+
if token == scanner.String {
65+
return T_STRING
66+
}
67+
68+
switch lit {
69+
case "&":
70+
return '&'
71+
case "|":
72+
return '|'
73+
case "~":
74+
return T_LIKE
75+
case "=":
76+
return T_EQUAL
77+
case "(":
78+
return '('
79+
case ")":
80+
return ')'
81+
case "!":
82+
next := l.Peek()
83+
switch next {
84+
case '=', '~':
85+
yyval.text = "!" + string(next)
86+
// Since we manually picked the next char input, we also need to advance the internal scanner
87+
// states by calling Scan. Otherwise, the same rune will be scanned multiple times.
88+
l.Scan()
89+
90+
if next == '~' {
91+
return T_UNLIKE
92+
} else {
93+
return T_UNEQUAL
94+
}
95+
default:
96+
return '!'
97+
}
98+
case "<":
99+
next := l.Peek()
100+
if next == '=' {
101+
yyval.text = "<="
102+
// Since we manually picked the next char input, we also need to advance the internal scanner
103+
// states by calling Scan. Otherwise, the same rune will be scanned multiple times.
104+
l.Scan()
105+
106+
return T_LESS_THAN_OR_EQUAL
107+
}
108+
109+
return T_LESS_THAN
110+
case ">":
111+
next := l.Peek()
112+
if next == '=' {
113+
yyval.text = ">="
114+
// Since we manually picked the next char input, we also need to advance the internal scanner
115+
// states by calling Scan. Otherwise, the same rune will be scanned multiple times.
116+
l.Scan()
117+
118+
return T_GREATER_THAN_OR_EQUAL
119+
}
120+
121+
return T_GREATER_THAN
122+
}
123+
124+
// No more inputs to scan that we are interested in.
125+
// Scan returns EOF as well if there's no more token to stream, but we just want to be explicit.
126+
return scanner.EOF
127+
}
128+
129+
// Error receives any syntax/semantic errors produced by the parser.
130+
// The parser never returns an error when it fails to parse, but will forward the errors to
131+
// our lexer with some additional context instead. This function then wraps the provided err
132+
// and adds line, column number and offset to the error string.
133+
func (l *Lexer) Error(s string) {
134+
l.err = errors.New(fmt.Sprintf("%d:%d (%d): %s", l.Line, l.Column, l.Offset, s))
135+
}
136+
137+
// isIdentRune provides a custom implementation of scanner.IsIdentRune.
// This function determines whether a given character is allowed to be part of
// an identifier: every rune is permitted except the filter meta characters
// !&|~<>=(). A plain strings.ContainsRune check replaces the previous
// per-rune regexp match, which allocated a string and ran the regexp engine
// for every single character of the input.
func isIdentRune(ch rune, _ int) bool {
	return !strings.ContainsRune("!&|~<>=()", ch)
}
142+
143+
// ScanError is used to capture all errors the Scanner encounters.
144+
// It's a rare case that the scanner actually will fail to scan the input string, but in these
145+
// cases it will just output to std.Err and we won't be able to notice this. Hence, this function
146+
// is registered by the filter.Parse function after the Lexer initialization.
147+
func (l *Lexer) ScanError(s *scanner.Scanner, msg string) {
148+
l.Error(msg)
149+
}

0 commit comments

Comments
 (0)