Skip to content

Improve parser and lexer #811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 30 additions & 27 deletions file/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package file
import (
"fmt"
"strings"
"unicode/utf8"
)

type Error struct {
Expand All @@ -19,43 +18,47 @@ func (e *Error) Error() string {
return e.format()
}

var tabReplacer = strings.NewReplacer("\t", " ")

func (e *Error) Bind(source Source) *Error {
src := source.String()

var runeCount, lineStart int
e.Line = 1
for i, r := range source {
if i == e.From {
e.Column = 0
for i, r := range src {
if runeCount == e.From {
break
}
if r == '\n' {
lineStart = i
e.Line++
e.Column = 0
} else {
e.Column++
}
runeCount++
e.Column++
}

lineEnd := lineStart + strings.IndexByte(src[lineStart:], '\n')
if lineEnd < lineStart {
lineEnd = len(src)
}
if lineStart == lineEnd {
return e
}
if snippet, found := source.Snippet(e.Line); found {
snippet := strings.Replace(snippet, "\t", " ", -1)
srcLine := "\n | " + snippet
var bytes = []byte(snippet)
var indLine = "\n | "
for i := 0; i < e.Column && len(bytes) > 0; i++ {
_, sz := utf8.DecodeRune(bytes)
bytes = bytes[sz:]
if sz > 1 {
goto noind
} else {
indLine += "."
}
}
if _, sz := utf8.DecodeRune(bytes); sz > 1 {
goto noind
} else {
indLine += "^"
}
srcLine += indLine

noind:
e.Snippet = srcLine
const prefix = "\n | "
line := src[lineStart:lineEnd]
snippet := new(strings.Builder)
snippet.Grow(2*len(prefix) + len(line) + e.Column + 1)
snippet.WriteString(prefix)
tabReplacer.WriteString(snippet, line)
snippet.WriteString(prefix)
for i := 0; i < e.Column; i++ {
snippet.WriteByte('.')
}
snippet.WriteByte('^')
e.Snippet = snippet.String()
return e
}

Expand Down
52 changes: 20 additions & 32 deletions file/source.go
Original file line number Diff line number Diff line change
@@ -1,48 +1,36 @@
package file

import (
"strings"
"unicode/utf8"
)
import "strings"

type Source []rune
type Source struct {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe just abandon Source altogether? And simply use string?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I can do that quickly here, or in a separate PR if you prefer, for easier reviewing.

raw string
}

func NewSource(contents string) Source {
return []rune(contents)
return Source{
raw: contents,
}
}

func (s Source) String() string {
return string(s)
return s.raw
}

func (s Source) Snippet(line int) (string, bool) {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is no longer used but I'm keeping it in case someone was using it.

if s == nil {
if s.raw == "" {
return "", false
}
lines := strings.Split(string(s), "\n")
lineOffsets := make([]int, len(lines))
var offset int
for i, line := range lines {
offset = offset + utf8.RuneCountInString(line) + 1
lineOffsets[i] = offset
}
charStart, found := getLineOffset(lineOffsets, line)
if !found || len(s) == 0 {
return "", false
var start int
for i := 1; i < line; i++ {
pos := strings.IndexByte(s.raw[start:], '\n')
if pos < 0 {
return "", false
}
start += pos + 1
}
charEnd, found := getLineOffset(lineOffsets, line+1)
if found {
return string(s[charStart : charEnd-1]), true
}
return string(s[charStart:]), true
}

func getLineOffset(lineOffsets []int, line int) (int, bool) {
if line == 1 {
return 0, true
} else if line > 1 && line <= len(lineOffsets) {
offset := lineOffsets[line-2]
return offset, true
end := start + strings.IndexByte(s.raw[start:], '\n')
if end < start {
end = len(s.raw)
}
return -1, false
return s.raw[start:end], true
}
85 changes: 85 additions & 0 deletions internal/ring/ring.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package ring

// Ring is a very simple ring buffer (FIFO queue) implementation that uses a
// slice. The internal slice will only grow, never shrink. When it grows, it
// grows in chunks of "chunkSize" (given as argument in the [New] function).
// Pointer and reference types can be safely used because memory is cleared.
//
// The zero value is an empty ring that is ready to use; since it has no
// chunkSize configured, it grows one slot at a time.
type Ring[T any] struct {
	data      []T // backing storage; len(data) is the current capacity
	back      int // index in data of the oldest stored item
	len       int // number of items currently stored
	chunkSize int // number of slots added each time the ring grows
}

// New returns an empty ring that grows by chunkSize slots every time it runs
// out of space. It panics if chunkSize is less than one.
func New[T any](chunkSize int) *Ring[T] {
	if chunkSize < 1 {
		panic("chunkSize must be greater than zero")
	}
	return &Ring[T]{
		chunkSize: chunkSize,
	}
}

// Len returns the number of items currently stored in the ring.
func (r *Ring[T]) Len() int {
	return r.len
}

// Cap returns the number of items the ring can hold before it has to grow.
func (r *Ring[T]) Cap() int {
	return len(r.data)
}

// Reset removes all the items from the ring, keeping the allocated capacity.
func (r *Ring[T]) Reset() {
	var zero T
	for i := range r.data {
		r.data[i] = zero // clear mem, optimized by the compiler, in Go 1.21 the "clear" builtin can be used
	}
	r.back = 0
	r.len = 0
}

// Nth returns the n-th oldest value (zero-based) in the ring without making
// any change. If n is out of range, it returns the zero value and false.
func (r *Ring[T]) Nth(n int) (v T, ok bool) {
	if n < 0 || n >= r.len || len(r.data) == 0 {
		return v, false
	}
	n = (n + r.back) % len(r.data)
	return r.data[n], true
}

// Dequeue removes and returns the oldest value. If the ring is empty, it
// returns the zero value and false.
func (r *Ring[T]) Dequeue() (v T, ok bool) {
	if r.len == 0 {
		return v, false
	}
	// v holds the zero value at this point, so the swap both retrieves the
	// item and clears its slot.
	v, r.data[r.back] = r.data[r.back], v
	r.len--
	r.back = (r.back + 1) % len(r.data)
	return v, true
}

// Enqueue adds an item to the ring, growing the internal slice if it is full.
func (r *Ring[T]) Enqueue(v T) {
	if r.len == len(r.data) {
		r.grow()
	}
	writePos := (r.back + r.len) % len(r.data)
	r.data[writePos] = v
	r.len++
}

// grow allocates a larger slice and moves the stored items to its beginning,
// oldest first, so that the ring is no longer wrapped afterwards.
func (r *Ring[T]) grow() {
	chunk := r.chunkSize
	if chunk < 1 {
		// Zero-value Ring (New was not called): always add at least one
		// slot, otherwise Enqueue would divide by a zero capacity.
		chunk = 1
	}
	s := make([]T, len(r.data)+chunk)
	if r.len > 0 {
		// First copy the items from "back" up to the end of the old slice.
		chunk1 := r.back + r.len
		if chunk1 > len(r.data) {
			chunk1 = len(r.data)
		}
		copied := copy(s, r.data[r.back:chunk1])

		if copied < r.len { // wrapped slice
			// The remaining items live at the start of the old slice.
			chunk2 := r.len - copied
			copy(s[copied:], r.data[:chunk2])
		}
	}
	r.back = 0
	r.data = s
}
140 changes: 140 additions & 0 deletions internal/ring/ring_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package ring

import (
"fmt"
"testing"
)

// TestRing runs a fixed sequence of operations against a single ring created
// with a chunk size of 3, checking the expected capacity and contents after
// every step, and then verifies that New panics when given a chunk size of
// zero.
func TestRing(t *testing.T) {
type op = ringOp[int]
testRing(t, New[int](3),
// noops on empty ring
op{cap: 0, opType: opRst, value: 0, items: []int{}},
op{cap: 0, opType: opDeq, value: 0, items: []int{}},

// basic
op{cap: 3, opType: opEnq, value: 1, items: []int{1}},
op{cap: 3, opType: opDeq, value: 1, items: []int{}},

// wrapping
op{cap: 3, opType: opEnq, value: 2, items: []int{2}},
op{cap: 3, opType: opEnq, value: 3, items: []int{2, 3}},
op{cap: 3, opType: opEnq, value: 4, items: []int{2, 3, 4}},
op{cap: 3, opType: opDeq, value: 2, items: []int{3, 4}},
op{cap: 3, opType: opDeq, value: 3, items: []int{4}},
op{cap: 3, opType: opDeq, value: 4, items: []int{}},

// resetting
op{cap: 3, opType: opEnq, value: 2, items: []int{2}},
op{cap: 3, opType: opRst, value: 0, items: []int{}},
op{cap: 3, opType: opDeq, value: 0, items: []int{}},

// growing without wrapping
op{cap: 3, opType: opEnq, value: 5, items: []int{5}},
op{cap: 3, opType: opEnq, value: 6, items: []int{5, 6}},
op{cap: 3, opType: opEnq, value: 7, items: []int{5, 6, 7}},
op{cap: 6, opType: opEnq, value: 8, items: []int{5, 6, 7, 8}},
op{cap: 6, opType: opRst, value: 0, items: []int{}},
op{cap: 6, opType: opDeq, value: 0, items: []int{}},

// growing and wrapping
op{cap: 6, opType: opEnq, value: 9, items: []int{9}},
op{cap: 6, opType: opEnq, value: 10, items: []int{9, 10}},
op{cap: 6, opType: opEnq, value: 11, items: []int{9, 10, 11}},
op{cap: 6, opType: opEnq, value: 12, items: []int{9, 10, 11, 12}},
op{cap: 6, opType: opEnq, value: 13, items: []int{9, 10, 11, 12, 13}},
op{cap: 6, opType: opEnq, value: 14, items: []int{9, 10, 11, 12, 13, 14}},
op{cap: 6, opType: opDeq, value: 9, items: []int{10, 11, 12, 13, 14}},
op{cap: 6, opType: opDeq, value: 10, items: []int{11, 12, 13, 14}},
op{cap: 6, opType: opEnq, value: 15, items: []int{11, 12, 13, 14, 15}},
op{cap: 6, opType: opEnq, value: 16, items: []int{11, 12, 13, 14, 15, 16}},
op{cap: 9, opType: opEnq, value: 17, items: []int{11, 12, 13, 14, 15, 16, 17}}, // grows wrapped
op{cap: 9, opType: opDeq, value: 11, items: []int{12, 13, 14, 15, 16, 17}},
op{cap: 9, opType: opDeq, value: 12, items: []int{13, 14, 15, 16, 17}},
op{cap: 9, opType: opDeq, value: 13, items: []int{14, 15, 16, 17}},
op{cap: 9, opType: opDeq, value: 14, items: []int{15, 16, 17}},
op{cap: 9, opType: opDeq, value: 15, items: []int{16, 17}},
op{cap: 9, opType: opDeq, value: 16, items: []int{17}},
op{cap: 9, opType: opDeq, value: 17, items: []int{}},
op{cap: 9, opType: opDeq, value: 0, items: []int{}},
)

t.Run("should panic on invalid chunkSize", func(t *testing.T) {
// The deferred function fails the subtest if New returns normally.
defer func() {
if r := recover(); r == nil {
t.Fatalf("should have panicked")
}
}()
New[int](0)
})
}

// Operation types that can be set in ringOp.opType.
const (
opEnq = iota // enqueue an item
opDeq // dequeue an item; expected to succeed only if the ring is not empty
opRst // reset
)

// ringOp describes a single operation to apply to a ring, together with the
// state the ring is expected to have once the operation was applied.
type ringOp[T comparable] struct {
cap int // expected Cap() after the operation
opType int // opEnq, opDeq or opRst
value T // value to enqueue or value expected for dequeue; ignored for opRst
items []T // items expected to be left in the ring, oldest first
}

// testRing applies the given operations, in order, to the same ring. Every
// operation runs as its own subtest named after its index, and the sequence
// is abandoned at the first failing subtest because later expectations build
// on the state left by the earlier operations.
func testRing[T comparable](t *testing.T, r *Ring[T], ops ...ringOp[T]) {
	for idx := range ops {
		step := ops[idx]
		name := fmt.Sprintf("opIndex=%v", idx)
		passed := t.Run(name, func(t *testing.T) {
			testRingOp(t, r, step)
		})
		if !passed {
			break
		}
	}
}

// testRingOp applies a single operation to the ring and then verifies the
// resulting state: the capacity, the length, and every stored item (read with
// Nth, oldest first) must match what op declares.
func testRingOp[T comparable](t *testing.T, r *Ring[T], op ringOp[T]) {
	var zero T
	switch op.opType {
	case opEnq:
		r.Enqueue(op.value)
	case opDeq:
		// A dequeue can only succeed if there was something in the ring.
		shouldSucceed := r.Len() > 0
		v, ok := r.Dequeue()
		switch {
		case ok != shouldSucceed:
			t.Fatalf("should have succeeded: %v", shouldSucceed)
		case ok && v != op.value:
			t.Fatalf("expected value: %v; got: %v", op.value, v)
		case !ok && v != zero:
			t.Fatalf("expected zero value; got: %v", v)
		}
	case opRst:
		r.Reset()
	}
	if c := r.Cap(); c != op.cap {
		t.Fatalf("expected cap: %v; got: %v", op.cap, c)
	}
	if l := r.Len(); l != len(op.items) {
		t.Errorf("expected Len(): %v; got: %v", len(op.items), l)
	}
	// Collect everything the ring exposes through Nth.
	var got []T
	for i := 0; ; i++ {
		v, ok := r.Nth(i)
		if !ok {
			break
		}
		got = append(got, v)
	}
	// This must be fatal: the comparison below indexes got with the indexes
	// of op.items, which would panic if got were shorter.
	if l := len(got); l != len(op.items) {
		t.Fatalf("expected items: %v\ngot items: %v", op.items, got)
	}
	for i := range op.items {
		if op.items[i] != got[i] {
			t.Fatalf("expected items: %v\ngot items: %v", op.items, got)
		}
	}
	if v, ok := r.Nth(len(op.items)); ok || v != zero {
		t.Fatalf("expected no more items, got: v=%v; ok=%v", v, ok)
	}
}
20 changes: 20 additions & 0 deletions parser/bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package parser

import "testing"

func BenchmarkParser(b *testing.B) {
const source = `
/*
Showing worst case scenario
*/
let value = trim("contains escapes \n\"\\ \U0001F600 and non ASCII ñ"); // inline comment
len(value) == 0x2A
// let's introduce an error too
whatever
`
b.ReportAllocs()
p := new(Parser)
for i := 0; i < b.N; i++ {
p.Parse(source, nil)
}
Comment on lines +16 to +19
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the previous code does not have a reusable parser, the code that I ran to benchmark the old code was:

	for i := 0; i < b.N; i++ {
		Parse(source)
	}

}
Loading
Loading