diff --git a/buffer.go b/buffer.go new file mode 100644 index 0000000..609d876 --- /dev/null +++ b/buffer.go @@ -0,0 +1,84 @@ +package lexmachine + +import ( + "fmt" + + "github.com/timtadh/lexmachine/stream" +) + +// Buffer is a abstracts to implementations of "text". The first is a []byte with a +type Buffer interface { + Byte(i int) byte + HasByte(i int) bool + TC() int + SetTC(i int) +} + +type SliceBuffer struct { + Text []byte + TextCounter int +} + +func sliceBuffer(text []byte, tc int) *SliceBuffer { + return &SliceBuffer{ + Text: text, + TextCounter: tc, + } +} + +func (s *SliceBuffer) Byte(i int) byte { + return s.Text[i] +} + +func (s *SliceBuffer) HasByte(i int) bool { + return i >= 0 && i < len(s.Text) +} + +func (s *SliceBuffer) TC() int { + return s.TextCounter +} + +func (s *SliceBuffer) SetTC(tc int) { + s.TextCounter = tc +} + +func (s *SliceBuffer) finalize() int { + return s.TextCounter +} + +type StreamBuffer struct { + Text stream.Stream + Lookahead int +} + +func streamBuffer(text stream.Stream) *StreamBuffer { + return &StreamBuffer{ + Text: text, + Lookahead: 0, + } +} + +func (s *StreamBuffer) Byte(i int) byte { + c, has := s.Text.Peek(i) + if !has { + panic(fmt.Errorf("read past the end of the buffer")) + } + return c.Byte +} + +func (s *StreamBuffer) HasByte(i int) bool { + _, has := s.Text.Peek(i) + return has +} + +func (s *StreamBuffer) TC() int { + return s.Lookahead +} + +func (s *StreamBuffer) SetTC(tc int) { + s.Lookahead = tc +} + +func (s *StreamBuffer) finalize() { + s.Text.Advance(s.Lookahead) +} diff --git a/lexer.go b/lexer.go index ab6b219..9dedc43 100644 --- a/lexer.go +++ b/lexer.go @@ -1,61 +1,19 @@ package lexmachine import ( - "bytes" "fmt" -) -import ( dfapkg "github.com/timtadh/lexmachine/dfa" "github.com/timtadh/lexmachine/frontend" "github.com/timtadh/lexmachine/inst" "github.com/timtadh/lexmachine/machines" + "github.com/timtadh/lexmachine/stream" + "github.com/timtadh/lexmachine/stream_machines" ) -// Token is an 
optional token representation you could use to represent the -// tokens produced by a lexer built with lexmachine. -// -// Here is an example for constructing a lexer Action which turns a -// machines.Match struct into a token using the scanners Token helper function. -// -// func token(name string, tokenIds map[string]int) lex.Action { -// return func(s *lex.Scanner, m *machines.Match) (interface{}, error) { -// return s.Token(tokenIds[name], string(m.Bytes), m), nil -// } -// } -// -type Token struct { - Type int - Value interface{} - Lexeme []byte - TC int - StartLine int - StartColumn int - EndLine int - EndColumn int -} - -// Equals checks the equality of two tokens ignoring the Value field. -func (t *Token) Equals(other *Token) bool { - if t == nil && other == nil { - return true - } else if t == nil { - return false - } else if other == nil { - return false - } - return t.TC == other.TC && - t.StartLine == other.StartLine && - t.StartColumn == other.StartColumn && - t.EndLine == other.EndLine && - t.EndColumn == other.EndColumn && - bytes.Equal(t.Lexeme, other.Lexeme) && - t.Type == other.Type -} - -// String formats the token in a human readable form. -func (t *Token) String() string { - return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn) +type pattern struct { + regex []byte + action Action } // An Action is a function which get called when the Scanner finds a match @@ -64,12 +22,7 @@ func (t *Token) String() string { // have different needs Actions merely return an interface{}. This allows you // to represent a token in anyway you wish. An example Token struct is provided // above. 
-type Action func(scan *Scanner, match *machines.Match) (interface{}, error) - -type pattern struct { - regex []byte - action Action -} +type Action func(scan Scanner, match *machines.Match) (interface{}, error) // Lexer is a "builder" object which lets you construct a Scanner type which // does the actual work of tokenizing (splitting up and categorizing) a byte @@ -84,111 +37,13 @@ type Lexer struct { dfa *dfapkg.DFA } -// Scanner tokenizes a byte string based on the patterns provided to the lexer -// object which constructed the scanner. This object works as functional -// iterator using the Next method. -// -// Example -// -// lexer, err := CreateLexer() -// if err != nil { -// return err -// } -// scanner, err := lexer.Scanner(someBytes) -// if err != nil { -// return err -// } -// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() { -// if err != nil { -// return err -// } -// fmt.Println(tok) -// } -// -type Scanner struct { - lexer *Lexer - matches map[int]int - scan machines.Scanner - Text []byte - TC int - pTC int - sLine int - sColumn int - eLine int - eColumn int -} - -// Next iterates through the string being scanned returning one token at a time -// until either an error is encountered or the end of the string is reached. -// The token is returned by the tok value. An error is indicated by err. -// Finally, eos (a bool) indicates the End Of String when it returns as true. -// -// Example -// -// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() { -// if err != nil { -// // handle the error and exit the loop. For example: -// return err -// } -// // do some processing on tok or store it somewhere. eg. -// fmt.Println(tok) -// } -// -// One useful error type which could be returned by Next() is a -// match.UnconsumedInput which provides the position information for where in -// the text the scanning failed. 
-// -// For more information on functional iterators see: -// http://hackthology.com/functional-iteration-in-go.html -func (s *Scanner) Next() (tok interface{}, err error, eos bool) { - var token interface{} - for token == nil { - tc, match, err, scan := s.scan(s.TC) - if scan == nil { - return nil, nil, true - } else if err != nil { - return nil, err, false - } else if match == nil { - return nil, fmt.Errorf("No match but no error"), false - } - s.scan = scan - s.pTC = s.TC - s.TC = tc - s.sLine = match.StartLine - s.sColumn = match.StartColumn - s.eLine = match.EndLine - s.eColumn = match.EndColumn - - pattern := s.lexer.patterns[s.matches[match.PC]] - token, err = pattern.action(s, match) - if err != nil { - return nil, err, false - } - } - return token, nil, false -} - -// Token is a helper function for constructing a Token type inside of a Action. -func (s *Scanner) Token(typ int, value interface{}, m *machines.Match) *Token { - return &Token{ - Type: typ, - Value: value, - Lexeme: m.Bytes, - TC: m.TC, - StartLine: m.StartLine, - StartColumn: m.StartColumn, - EndLine: m.EndLine, - EndColumn: m.EndColumn, - } -} - // NewLexer constructs a new lexer object. func NewLexer() *Lexer { return &Lexer{} } -// Scanner creates a scanner for a particular byte string from the lexer. -func (l *Lexer) Scanner(text []byte) (*Scanner, error) { +// TextScanner creates a scanner for a particular byte string from the lexer. 
+func (l *Lexer) TextScanner(text []byte) (*TextScanner, error) { if l.program == nil && l.dfa == nil { err := l.Compile() if err != nil { @@ -200,9 +55,9 @@ func (l *Lexer) Scanner(text []byte) (*Scanner, error) { textCopy := make([]byte, len(text)) copy(textCopy, text) - var s *Scanner + var s *TextScanner if l.dfa != nil { - s = &Scanner{ + s = &TextScanner{ lexer: l, matches: l.dfaMatches, scan: machines.DFALexerEngine(l.dfa.Start, l.dfa.Error, l.dfa.Trans, l.dfa.Accepting, textCopy), @@ -210,7 +65,7 @@ func (l *Lexer) Scanner(text []byte) (*Scanner, error) { TC: 0, } } else { - s = &Scanner{ + s = &TextScanner{ lexer: l, matches: l.nfaMatches, scan: machines.LexerEngine(l.program, textCopy), @@ -221,6 +76,29 @@ func (l *Lexer) Scanner(text []byte) (*Scanner, error) { return s, nil } +// StreamScanner creates a scanner for a particular stream from the lexer. +func (l *Lexer) StreamScanner(text stream.Stream) (*StreamScanner, error) { + if l.program == nil && l.dfa == nil { + err := l.Compile() + if err != nil { + return nil, err + } + } + + var s *StreamScanner + if l.dfa != nil { + s = &StreamScanner{ + lexer: l, + matches: l.dfaMatches, + scan: stream_machines.DFALexerEngine(l.dfa.Start, l.dfa.Error, l.dfa.Trans, l.dfa.Accepting, text), + Text: text, + } + } else { + panic("not implemented") + } + return s, nil +} + // Add pattern to match on. When a match occurs during scanning the action // function will be called by the Scanner to turn the low level machines.Match // struct into a token. 
@@ -325,7 +203,7 @@ func (l *Lexer) CompileDFA() error { } func (l *Lexer) matchesEmptyString() (bool, error) { - s, err := l.Scanner([]byte("")) + s, err := l.TextScanner([]byte("")) if err != nil { return false, err } diff --git a/lexer_test.go b/lexer_test.go index 76749ae..7f3ab5f 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -1,6 +1,7 @@ package lexmachine import ( + "bytes" "fmt" "strconv" "strings" @@ -8,6 +9,7 @@ import ( "github.com/timtadh/data-structures/test" "github.com/timtadh/lexmachine/machines" + "github.com/timtadh/lexmachine/stream" ) func TestSimple(x *testing.T) { @@ -22,25 +24,25 @@ func TestSimple(x *testing.T) { lexer.Add( []byte("print"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { return scan.Token(PRINT, nil, match), nil }, ) lexer.Add( []byte("([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { return scan.Token(NAME, string(match.Bytes), match), nil }, ) lexer.Add( []byte("="), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { return scan.Token(EQUALS, nil, match), nil }, ) lexer.Add( []byte("[0-9]+"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { i, err := strconv.Atoi(string(match.Bytes)) if err != nil { return nil, err @@ -50,35 +52,82 @@ func TestSimple(x *testing.T) { ) lexer.Add( []byte("( |\t|\n)"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { // skip white space return nil, nil }, ) lexer.Add( []byte("//[^\n]*\n"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { + func(scan Scanner, match *machines.Match) (interface{}, error) { // skip 
white space return nil, nil }, ) lexer.Add( []byte("/\\*"), - func(scan *Scanner, match *machines.Match) (interface{}, error) { - for tc := scan.TC; tc < len(scan.Text); tc++ { - if scan.Text[tc] == '\\' { + //func(s Scanner, match *machines.Match) (interface{}, error) { + // scan := s.(*TextScanner) + // for tc := scan.TC; tc < len(scan.Text); tc++ { + // if scan.Text[tc] == '\\' { + // // the next character is skipped + // tc++ + // } else if scan.Text[tc] == '*' && tc+1 < len(scan.Text) { + // if scan.Text[tc+1] == '/' { + // scan.TC = tc + 2 + // return nil, nil + // } + // } + // } + // return nil, + // fmt.Errorf("unclosed comment starting at %d, (%d, %d)", + // match.TC, match.StartLine, match.StartColumn) + //}, + func(s Scanner, match *machines.Match) (interface{}, error) { + text := s.Buffer() + buf := make([]byte, 0, 10) + buf = append(buf, match.Bytes...) + open := 1 + tc := text.TC() + for ; open > 0; tc++ { + if !text.HasByte(tc) { + return nil, + fmt.Errorf("unclosed comment starting at %d, (%d, %d) containing %q", + match.TC, match.StartLine, match.StartColumn, buf) + } + char := text.Byte(tc) + buf = append(buf, char) + if char == '\\' { // the next character is skipped tc++ - } else if scan.Text[tc] == '*' && tc+1 < len(scan.Text) { - if scan.Text[tc+1] == '/' { - scan.TC = tc + 2 - return nil, nil + if text.HasByte(tc) { + buf = append(buf, text.Byte(tc)) + } + } else if char == '/' { + if text.HasByte(tc + 1) { + next := text.Byte(tc + 1) + if next == '*' { + buf = append(buf, next) + tc++ + open++ + continue + } + } + } else if char == '*' { + if text.HasByte(tc + 1) { + next := text.Byte(tc + 1) + if next == '/' { + buf = append(buf, next) + tc++ + open-- + continue + } } } } - return nil, - fmt.Errorf("unclosed comment starting at %d, (%d, %d)", - match.TC, match.StartLine, match.StartColumn) + fmt.Printf("%q\n", buf) + text.SetTC(tc) + return nil, nil }, ) @@ -88,10 +137,10 @@ func TestSimple(x *testing.T) { print fred name =12 // asdf 
comment - /*awef oiwe + /**//*awef oiwe ooiwje \*/ weoi weoi*/ printname = 13 - print printname + print printname/*/**/*/ `) expected := []*Token{ @@ -105,20 +154,14 @@ func TestSimple(x *testing.T) { {NAME, "name", []byte("name"), 41, 5, 3, 5, 6}, {EQUALS, nil, []byte("="), 46, 5, 8, 5, 8}, {NUMBER, 12, []byte("12"), 47, 5, 9, 5, 10}, - {NAME, "printname", []byte("printname"), 112, 9, 11, 9, 19}, - {EQUALS, nil, []byte("="), 122, 9, 21, 9, 21}, - {NUMBER, 13, []byte("13"), 124, 9, 23, 9, 24}, - {PRINT, nil, []byte("print"), 129, 10, 3, 10, 7}, - {NAME, "printname", []byte("printname"), 135, 10, 9, 10, 17}, + {NAME, "printname", []byte("printname"), 116, 9, 11, 9, 19}, + {EQUALS, nil, []byte("="), 126, 9, 21, 9, 21}, + {NUMBER, 13, []byte("13"), 128, 9, 23, 9, 24}, + {PRINT, nil, []byte("print"), 133, 10, 3, 10, 7}, + {NAME, "printname", []byte("printname"), 139, 10, 9, 10, 17}, } - scan := func(lexer *Lexer) { - scanner, err := lexer.Scanner(text) - if err != nil { - t.Error(err) - t.Log(lexer.program.Serialize()) - } - + scan := func(scanner Scanner) { i := 0 for tk, err, eof := scanner.Next(); !eof; tk, err, eof = scanner.Next() { if err != nil { @@ -132,15 +175,34 @@ func TestSimple(x *testing.T) { } } - // first do the test with the NFA + // // first do the test with the NFA t.AssertNil(lexer.CompileNFA()) - scan(lexer) + { + scanner, err := lexer.TextScanner(text) + if err != nil { + t.Fatal(err) + } + scan(scanner) + } // then do the test with the DFA lexer.program = nil lexer.nfaMatches = nil t.AssertNil(lexer.CompileDFA()) - scan(lexer) + { + scanner, err := lexer.TextScanner(text) + if err != nil { + t.Fatal(err) + } + scan(scanner) + } + { + scanner, err := lexer.StreamScanner(stream.BufferedStream(bytes.NewBuffer(text))) + if err != nil { + t.Fatal(err) + } + scan(scanner) + } } func TestPartialLexer(x *testing.T) { @@ -216,7 +278,7 @@ func TestPartialLexer(x *testing.T) { } getToken := func(tokenType int) Action { - return func(s *Scanner, m 
*machines.Match) (interface{}, error) { + return func(s Scanner, m *machines.Match) (interface{}, error) { return s.Token(tokenType, string(m.Bytes), m), nil } } @@ -229,7 +291,7 @@ func TestPartialLexer(x *testing.T) { lexer.Add([]byte("[A-Za-z$][A-Za-z0-9$]+"), getToken(tokmap["IDENT"])) lexer.Add([]byte(">=|<=|=|>|<|\\|\\||&&"), getToken(tokmap["OP"])) scan := func(lexer *Lexer) { - scanner, err := lexer.Scanner([]byte(text)) + scanner, err := lexer.TextScanner([]byte(text)) t.AssertNil(err) i := 0 for tk, err, eof := scanner.Next(); !eof; tk, err, eof = scanner.Next() { @@ -256,7 +318,7 @@ func TestPartialLexer(x *testing.T) { func TestRegression(t *testing.T) { token := func(name string) Action { - return func(s *Scanner, m *machines.Match) (interface{}, error) { + return func(s Scanner, m *machines.Match) (interface{}, error) { return fmt.Sprintf("%v:%q", name, string(m.Bytes)), nil } } @@ -278,7 +340,7 @@ func TestRegression(t *testing.T) { runTest := func(lexer *Lexer) { for _, test := range tests { - scanner, err := lexer.Scanner([]byte(test.text)) + scanner, err := lexer.TextScanner([]byte(test.text)) if err != nil { t.Fatal(err) } @@ -356,11 +418,11 @@ ddns-update-style none; newLexer := func() *Lexer { lex := NewLexer() - skip := func(*Scanner, *machines.Match) (interface{}, error) { + skip := func(Scanner, *machines.Match) (interface{}, error) { return nil, nil } token := func(name string) Action { - return func(s *Scanner, m *machines.Match) (interface{}, error) { + return func(s Scanner, m *machines.Match) (interface{}, error) { return s.Token(tokenIds[name], string(m.Bytes), m), nil } } @@ -376,7 +438,7 @@ ddns-update-style none; } runTest := func(lexer *Lexer) { - scanner, err := lexer.Scanner([]byte(text)) + scanner, err := lexer.TextScanner([]byte(text)) if err != nil { return } @@ -425,11 +487,11 @@ func TestPythonStrings(t *testing.T) { for i, tok := range tokens { tokenIds[tok] = i } - skip := func(*Scanner, *machines.Match) (interface{}, 
error) { + skip := func(Scanner, *machines.Match) (interface{}, error) { return nil, nil } token := func(name string) Action { - return func(s *Scanner, m *machines.Match) (interface{}, error) { + return func(s Scanner, m *machines.Match) (interface{}, error) { return s.Token(tokenIds[name], string(m.Bytes), m), nil } } @@ -468,7 +530,7 @@ func TestPythonStrings(t *testing.T) { runTest := func(lexer *Lexer) { for _, test := range tests { fmt.Printf("test %q\n", test.text) - scanner, err := lexer.Scanner([]byte(test.text)) + scanner, err := lexer.TextScanner([]byte(test.text)) if err != nil { t.Fatal(err) } @@ -516,7 +578,7 @@ func TestPythonStrings(t *testing.T) { } func TestNoEmptyStrings(t *testing.T) { - skip := func(*Scanner, *machines.Match) (interface{}, error) { + skip := func(Scanner, *machines.Match) (interface{}, error) { return nil, nil } lexer := NewLexer() diff --git a/scanner.go b/scanner.go new file mode 100644 index 0000000..e029ff4 --- /dev/null +++ b/scanner.go @@ -0,0 +1,112 @@ +package lexmachine + +import ( + "fmt" + + "github.com/timtadh/lexmachine/machines" +) + +type Scanner interface { + Next() (tok interface{}, err error, eos bool) + Token(typ int, value interface{}, m *machines.Match) *Token + Buffer() Buffer +} + +// Scanner tokenizes a byte string based on the patterns provided to the lexer +// object which constructed the scanner. This object works as functional +// iterator using the Next method. 
+// +// Example +// +// lexer, err := CreateLexer() +// if err != nil { +// return err +// } +// scanner, err := lexer.Scanner(someBytes) +// if err != nil { +// return err +// } +// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() { +// if err != nil { +// return err +// } +// fmt.Println(tok) +// } +// +type TextScanner struct { + lexer *Lexer + matches map[int]int + scan machines.Scanner + Text []byte + TC int + buf *SliceBuffer +} + +func (s *TextScanner) Buffer() Buffer { + if s.buf == nil { + panic(fmt.Errorf("Buffer called outside of an Action")) + } + return s.buf +} + +// Next iterates through the string being scanned returning one token at a time +// until either an error is encountered or the end of the string is reached. +// The token is returned by the tok value. An error is indicated by err. +// Finally, eos (a bool) indicates the End Of String when it returns as true. +// +// Example +// +// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() { +// if err != nil { +// // handle the error and exit the loop. For example: +// return err +// } +// // do some processing on tok or store it somewhere. eg. +// fmt.Println(tok) +// } +// +// One useful error type which could be returned by Next() is a +// match.UnconsumedInput which provides the position information for where in +// the text the scanning failed. 
+// +// For more information on functional iterators see: +// http://hackthology.com/functional-iteration-in-go.html +func (s *TextScanner) Next() (tok interface{}, err error, eos bool) { + var token interface{} + for token == nil { + tc, match, err, scan := s.scan(s.TC) + if scan == nil { + return nil, nil, true + } else if err != nil { + return nil, err, false + } else if match == nil { + return nil, fmt.Errorf("No match but no error"), false + } + s.scan = scan + s.TC = tc + + s.buf = sliceBuffer(s.Text, s.TC) + pattern := s.lexer.patterns[s.matches[match.PC]] + token, err = pattern.action(s, match) + s.TC = s.buf.finalize() + s.buf = nil + if err != nil { + return nil, err, false + } + } + return token, nil, false +} + +// Token is a helper function for constructing a Token type inside of a Action. +func (s *TextScanner) Token(typ int, value interface{}, m *machines.Match) *Token { + return &Token{ + Type: typ, + Value: value, + Lexeme: m.Bytes, + TC: m.TC, + StartLine: m.StartLine, + StartColumn: m.StartColumn, + EndLine: m.EndLine, + EndColumn: m.EndColumn, + } +} diff --git a/stream.go b/stream.go new file mode 100644 index 0000000..f3c31ce --- /dev/null +++ b/stream.go @@ -0,0 +1,91 @@ +package lexmachine + +import ( + "fmt" + + "github.com/timtadh/lexmachine/machines" + "github.com/timtadh/lexmachine/stream" + "github.com/timtadh/lexmachine/stream_machines" +) + +// StreamScanner tokenizes a stream of bytes (see stream.Stream) which can be +// constructed from an io.Reader. This object work analogously to the regular +// Scanner. Note: if the stream you are scanning fits in memory using the +// regular Scanner is likely more efficient. Finally, stream.Stream objects can +// only advance the text forwards so an Action cannot move the text counter +// backwards (as is possible with Scanner). 
+type StreamScanner struct { + lexer *Lexer + matches map[int]int + scan stream_machines.Scanner + Text stream.Stream + buf *StreamBuffer +} + +func (s *StreamScanner) Buffer() Buffer { + if s.buf == nil { + panic(fmt.Errorf("Buffer called outside of an Action")) + } + return s.buf +} + +// Next iterates through the string being scanned returning one token at a time +// until either an error is encountered or the end of the string is reached. +// The token is returned by the tok value. An error is indicated by err. +// Finally, eos (a bool) indicates the End Of String when it returns as true. +// +// Example +// +// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() { +// if err != nil { +// // handle the error and exit the loop. For example: +// return err +// } +// // do some processing on tok or store it somewhere. eg. +// fmt.Println(tok) +// } +// +// One useful error type which could be returned by Next() is a +// match.UnconsumedInput which provides the position information for where in +// the text the scanning failed. +// +// For more information on functional iterators see: +// http://hackthology.com/functional-iteration-in-go.html +func (s *StreamScanner) Next() (tok interface{}, err error, eos bool) { + var token interface{} + for token == nil { + match, err, scan := s.scan() + if scan == nil { + return nil, nil, true + } else if err != nil { + return nil, err, false + } else if match == nil { + return nil, fmt.Errorf("No match but no error"), false + } + s.scan = scan + + s.buf = streamBuffer(s.Text) + pattern := s.lexer.patterns[s.matches[match.PC]] + token, err = pattern.action(s, match) + s.buf.finalize() + s.buf = nil + if err != nil { + return nil, err, false + } + } + return token, nil, false +} + +// Token is a helper function for constructing a Token type inside of a Action. 
+func (s *StreamScanner) Token(typ int, value interface{}, m *machines.Match) *Token { + return &Token{ + Type: typ, + Value: value, + Lexeme: m.Bytes, + TC: m.TC, + StartLine: m.StartLine, + StartColumn: m.StartColumn, + EndLine: m.EndLine, + EndColumn: m.EndColumn, + } +} diff --git a/stream/buffered.go b/stream/buffered.go new file mode 100644 index 0000000..5f6a37a --- /dev/null +++ b/stream/buffered.go @@ -0,0 +1,191 @@ +package stream + +import ( + "fmt" + "io" + "sync" +) + +type bufferedStream struct { + lock sync.Mutex + r io.Reader + tc int + line int + column int + started bool + eos bool + buf []Character + err error +} + +// BufferedStream makes a Stream which is backed by an expandable buffer. +func BufferedStream(r io.Reader) Stream { + b := &bufferedStream{ + r: r, + tc: -1, + line: 1, + column: 0, + } + return b +} + +// Character returns the character at the cursor +func (b *bufferedStream) Character() Character { + b.lock.Lock() + defer b.lock.Unlock() + if !b.started { + panic(fmt.Errorf("Call to Byte() before first call to Advance")) + } else if b.eos { + panic(fmt.Errorf("Call to Byte() after first call to Advance returned false")) + } + return b.buf[0] +} + +// Peek gets the character at lookahead i +func (b *bufferedStream) Peek(i int) (char Character, has bool) { + b.lock.Lock() + defer b.lock.Unlock() + if !b.started { + panic(fmt.Errorf("Call to Peek() before first call to Advance")) + } else if b.eos { + panic(fmt.Errorf("Call to Peek() after first call to Advance returned false")) + } + if i < 0 { + panic(fmt.Errorf("Peek() must be called with lookahead >= 0 got %d", i)) + } + if len(b.buf) >= i+1 { + return b.buf[i], true + } + if !b.read(i) { + return Character{}, false + } + return b.buf[i], true +} + +// Started indicates if Advance has been called at least once. 
+func (b *bufferedStream) Started() bool { + b.lock.Lock() + defer b.lock.Unlock() + return b.started +} + +// EOS indicates whether the stream has reached End Of Stream +func (b *bufferedStream) EOS() bool { + b.lock.Lock() + defer b.lock.Unlock() + return b.eos +} + +// Err returns the error from the underlying io.Reader if io.Read() returned +// a non-EOF error. +func (b *bufferedStream) Err() error { + b.lock.Lock() + defer b.lock.Unlock() + if !b.started { + panic(fmt.Errorf("Call to Err() before first call to Advance")) + } else if !b.eos { + panic(fmt.Errorf("Call to Err() before call to Advance returned false")) + } + return b.err +} + +// Advance moves the cursor forward by i +func (b *bufferedStream) Advance(i int) bool { + b.lock.Lock() + defer b.lock.Unlock() + return b.advance(i) +} + +// advance moves the cursor forward by i +func (b *bufferedStream) advance(i int) bool { + if i == 0 { + return true + } + if i < 0 { + panic(fmt.Errorf("Advance() must be called with move >= 0 got %d", i)) + } + // the "cursor" technically starts at -1, this does that adjustment + if !b.started { + b.started = true + i-- + // ensures a read happens even if i==0 when the buf is empty + if len(b.buf) <= 0 && !b.read(1) { + b.eos = true + return false + } + } + i = i - b.trimBuffer(i) + if len(b.buf) <= i { + if !b.read(i) { + b.eos = true + return false + } + } + if i > 0 { + i = i - b.trimBuffer(i) + if i != 0 { + panic(fmt.Errorf("i != 0 (i = %d)", i)) + } + } + return true +} + +// trims the buffer by up i bytes and returns the number of bytes trimmed. +func (b *bufferedStream) trimBuffer(i int) int { + if len(b.buf) > i { + // we already recorded the position + // of b.buf[0]. 
we need to track all the chars + // we are dropping by the skip + copy(b.buf[:len(b.buf)-i], b.buf[i:]) + b.buf = b.buf[:len(b.buf)-i] + return i + } else { + trimmed := len(b.buf) + b.buf = b.buf[:0] + return trimmed + } + return 0 +} + +// updates the position information for the given character. only call once +// per character in the stream. +func (b *bufferedStream) trackPos(char byte) { + b.tc++ + if char == '\n' { + b.line++ + b.column = 0 + } else { + b.column++ + } +} + +// reads at least i bytes from the underlying reader into the buffer. +func (b *bufferedStream) read(i int) bool { + if b.eos { + return false + } + buf := make([]byte, 4096) + for { + n, err := b.r.Read(buf) + if err != nil { + if err != io.EOF { + // only set err if it is an unexpected error. + b.err = err + } + return false + } + for _, c := range buf[:n] { + b.trackPos(c) + b.buf = append(b.buf, Character{ + Byte: c, + TC: b.tc, + Line: b.line, + Column: b.column, + }) + } + if len(b.buf) >= i+1 { + break + } + } + return true +} diff --git a/stream/stream.go b/stream/stream.go new file mode 100644 index 0000000..a27770c --- /dev/null +++ b/stream/stream.go @@ -0,0 +1,67 @@ +package stream + +import "fmt" + +// Stream represents a stream of bytes. Its interface is analogous to +// bufio.Scanner. Here is an example for how to read all the bytes in a stream +// (and print them one by one): +// +// s := BufferedStream(reader) +// for s.Advance(1) { +// fmt.Println(s.Character().Byte) +// } +// if s.Err() != nil { +// return s.Err() +// } +// +type Stream interface { + + // Character returns the current byte in the stream. This method will panic + // if Advance has not been called before this method or Advance has + // returned false. + Character() Character + + // Peek returns byte at the current cursor + the lookahead in the stream if + // one exists. If lookahead == 0, it returns the same character Character() + // returns. If lookahead == 1, it returns the next byte, and so on. 
Peek + // does not advance the cursor. If there are no further bytes in the stream + // (or lookahead causes a read past the end of the stream) Peek returns has + // == false. If you call Peek() before Advance() has been called it will + // panic. + Peek(lookahead int) (char Character, has bool) + + // Advance moves the cursor i bytes forward in the stream. If there is a + // byte to read it returns true. If it reaches the end of the stream (EOS) + // it returns false. Advance with i > than number of bytes remaining moves + // the cursor to the end of stream (may be less than i) and returns false + // (as you cannot read past the end of the stream). Advance must be called + // with movement >= 0 otherwise it will panic. If Advance is called with + // i == 0 it does nothing (including setting the stream to started). + Advance(i int) bool + + // Started returns true the stream has been started (eg. a call to Advance + // has been made with a positive movement). + Started() bool + + // EOS returns true if the stream has reached the end of the stream. + EOS() bool + + // Err returns an error if there was an error reading from the underlying + // source of the bytes. Panics if called before Advance returns false. + // Err() will never return io.EOF (it will be nil in this case -- following + // the behavior of ioutil.ReadAll) + Err() error +} + +// Character represents one byte in a stream with position information. 
+type Character struct {
+	Byte   byte
+	TC     int
+	Line   int
+	Column int
+}
+
+// String humanizes the character
+func (c Character) String() string {
+	return fmt.Sprintf("<%q tc:%d @ %d:%d>", c.Byte, c.TC, c.Line, c.Column)
+}
diff --git a/stream/stream_test.go b/stream/stream_test.go
new file mode 100644
index 0000000..ba52835
--- /dev/null
+++ b/stream/stream_test.go
@@ -0,0 +1,345 @@
+package stream
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestReadFullStream(t *testing.T) {
+	text := "hello world"
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	for s.Advance(1) {
+		if err := buf.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != text {
+		t.Fatalf("expect %q got %q", text, buf.String())
+	}
+}
+
+func TestReadEveryOther(t *testing.T) {
+	text := "hello world"
+	expected := "el ol"
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	for s.Advance(2) {
+		if err := buf.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != expected {
+		t.Fatalf("expect %q got %q", expected, buf.String())
+	}
+}
+
+func TestReadEvery3(t *testing.T) {
+	text := "hello world"
+	expected := "l r"
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	for s.Advance(3) {
+		if err := buf.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != expected {
+		t.Fatalf("expect %q got %q", expected, buf.String())
+	}
+}
+
+func TestPeekTillW(t *testing.T) {
+	text := "hello world"
+	expected := "world"
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	if !s.Started() {
+		s.Advance(1)
+	}
+	for i := 0; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if b.Byte == 'w' {
+			s.Advance(i)
+			break
+		}
+	}
+	if s.Character().Byte != 'w' {
+		t.Fatalf("expected w got %v", s.Character().Byte)
+	}
+	for {
+		if err := buf.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+		if !s.Advance(1) {
+			break
+		}
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != expected {
+		t.Fatalf("expect %q got %q", expected, buf.String())
+	}
+}
+
+func TestPeekTillWThenL(t *testing.T) {
+	text := "hello world"
+	expected := "ld"
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	if !s.Started() {
+		s.Advance(1)
+	}
+	for i := 0; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if b.Byte == 'w' {
+			s.Advance(i)
+			break
+		}
+	}
+	if s.Character().Byte != 'w' {
+		t.Fatalf("expected w got %v", s.Character().Byte)
+	}
+	for i := 1; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if b.Byte == 'l' {
+			s.Advance(i)
+			break
+		}
+	}
+	if s.Character().Byte != 'l' {
+		t.Fatalf("expected l got %v", s.Character().Byte)
+	}
+	for {
+		if err := buf.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+		if !s.Advance(1) {
+			break
+		}
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != expected {
+		t.Fatalf("expect %q got %q", expected, buf.String())
+	}
+}
+
+func TestPeekTillWThenLThenEnd(t *testing.T) {
+	text := "hello world"
+	expected := ""
+	var buf bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	if !s.Started() {
+		s.Advance(1)
+	}
+	for i := 0; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if b.Byte == 'w' {
+			s.Advance(i)
+			break
+		}
+	}
+	if s.Character().Byte != 'w' {
+		t.Fatalf("expected w got %v", s.Character().Byte)
+	}
+	for i := 1; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if b.Byte == 'l' {
+			s.Advance(i)
+			break
+		}
+	}
+	if s.Character().Byte != 'l' {
+		t.Fatalf("expected l got %v", s.Character().Byte)
+	}
+	for i := 1; ; i++ {
+		_, has := s.Peek(i)
+		if !has {
+			s.Advance(i)
+			break
+		}
+	}
+	if !s.EOS() {
+		t.Fatalf("expected EOS")
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if buf.String() != expected {
+		t.Fatalf("expect %q got %q", expected, buf.String())
+	}
+}
+
+func TestPeekThenReadFullStream(t *testing.T) {
+	text := "hello world"
+	var peek bytes.Buffer
+	var read bytes.Buffer
+	s := BufferedStream(bytes.NewBufferString(text))
+	if !s.Started() {
+		s.Advance(1)
+	}
+	for i := 0; ; i++ {
+		b, has := s.Peek(i)
+		if !has {
+			break
+		}
+		if err := peek.WriteByte(b.Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+	}
+	for !s.EOS() {
+		if err := read.WriteByte(s.Character().Byte); err != nil {
+			if err != nil {
+				t.Fatalf("err writing %v", err)
+			}
+		}
+		s.Advance(1)
+	}
+	if s.Err() != nil {
+		t.Fatalf("stream err %v", s.Err())
+	}
+	if peek.String() != text {
+		t.Fatalf("expect %q got %q", text, peek.String())
+	}
+	if read.String() != text {
+		t.Fatalf("expect %q got %q", text, read.String())
+	}
+}
+
+func TestLineColumns(t *testing.T) {
+	text := `b
+	this
+	is
+	wizard
+`
+	var expected = []struct {
+		tc, line, column int
+		char             byte
+	}{
+		{0, 1, 1, 'b'},
+		{1, 2, 0, '\n'},
+		{2, 2, 1, '\t'},
+		{3, 2, 2, 't'},
+		{4, 2, 3, 'h'},
+		{5, 2, 4, 'i'},
+		{6, 2, 5, 's'},
+		{7, 3, 0, '\n'},
+		{8, 3, 1, '\t'},
+		{9, 3, 2, 'i'},
+		{10, 3, 3, 's'},
+		{11, 4, 0, '\n'},
+		{12, 4, 1, '\t'},
+		{13, 4, 2, 'w'},
+		{14, 4, 3, 'i'},
+		{15, 4, 4, 'z'},
+		{16, 4, 5, 'a'},
+		{17, 4, 6, 'r'},
+		{18, 4, 7, 'd'},
+		{19, 5, 0, '\n'},
+	}
+	s := BufferedStream(bytes.NewBufferString(text))
+	// pre-peek everything just to futz with the interior state
+	if !s.Started() {
+		s.Advance(1)
+	}
+	for i := 0; ; i++ {
+		_, has := s.Peek(i)
+		if !has {
+			break
+		}
+	}
+	for i := 0; !s.EOS(); i++ {
+		char := s.Character()
+		if char.Byte != expected[i].char {
+			t.Fatalf("got %v expected %v", char.Byte, expected[i].char)
+		}
+		if char.TC != expected[i].tc {
+			t.Fatalf("got %v expected %v", char.TC, expected[i].tc)
+		}
+		if char.Line != expected[i].line {
+			t.Fatalf("got %v expected %v", char.Line, expected[i].line)
+		}
+		if char.Column != expected[i].column {
+			t.Fatalf("got %v expected %v", char.Column, expected[i].column)
+		}
+		s.Advance(1)
+	}
+}
+
+func TestEveryOtherLineColumns(t *testing.T) {
+	text := `b
+	this
+	is
+	wizard
+`
+	var expected = []struct {
+		tc, line, column int
+		char             byte
+	}{
+		{1, 2, 0, '\n'},
+		{3, 2, 2, 't'},
+		{5, 2, 4, 'i'},
+		{7, 3, 0, '\n'},
+		{9, 3, 2, 'i'},
+		{11, 4, 0, '\n'},
+		{13, 4, 2, 'w'},
+		{15, 4, 4, 'z'},
+		{17, 4, 6, 'r'},
+		{19, 5, 0, '\n'},
+	}
+	s := BufferedStream(bytes.NewBufferString(text))
+	for i := 0; s.Advance(2); i++ {
+		c := s.Character()
+		if c.Byte != expected[i].char {
+			t.Fatalf("got %v expected %v", c.Byte, expected[i].char)
+		}
+		if c.TC != expected[i].tc {
+			t.Fatalf("got %v expected %v", c.TC, expected[i].tc)
+		}
+		if c.Line != expected[i].line {
+			t.Fatalf("got %v expected %v", c.Line, expected[i].line)
+		}
+		if c.Column != expected[i].column {
+			t.Fatalf("got %v expected %v", c.Column, expected[i].column)
+		}
+	}
+}
diff --git a/stream_machines/dfa_machine.go b/stream_machines/dfa_machine.go
new file mode 100644
index 0000000..2c845b6
--- /dev/null
+++ b/stream_machines/dfa_machine.go
@@ -0,0 +1,90 @@
+package stream_machines
+
+import (
+	"github.com/timtadh/lexmachine/machines"
+	"github.com/timtadh/lexmachine/stream"
+)
+
+type Scanner func() (*machines.Match, error, Scanner)
+
+// DFALexerEngine does the actual tokenization of the text stream using the
+// DFA state machine. If the lexing process fails the Scanner will return
+// an UnconsumedInput error.
+func DFALexerEngine(startState, errorState int, trans machines.DFATrans, accepting machines.DFAAccepting, text stream.Stream) Scanner {
+	var scan Scanner
+	scan = func() (*machines.Match, error, Scanner) {
+		if text.EOS() {
+			return nil, nil, nil
+		}
+		buf := make([]stream.Character, 0, 10)
+		matchID := -1
+		matchLH := -1
+		state := startState
+		if match, has := accepting[state]; has {
+			matchID = match
+			matchLH = -1
+		}
+		if !text.Started() {
+			if !text.Advance(1) {
+				return nil, nil, nil
+			}
+		}
+		for lh := 0; state != errorState; lh++ {
+			c, has := text.Peek(lh)
+			if !has {
+				break
+			}
+			buf = append(buf, c)
+			state = trans[state][c.Byte]
+			if match, has := accepting[state]; has {
+				matchID = match
+				matchLH = lh
+			}
+		}
+		if match, has := accepting[state]; has {
+			matchID = match
+			matchLH = len(buf) - 1
+		}
+		if matchLH == -1 && matchID > -1 {
+			err := &machines.EmptyMatchError{
+				MatchID: matchID,
+				TC:      buf[0].TC,
+				Line:    buf[0].Line,
+				Column:  buf[0].Column,
+			}
+			return nil, err, scan
+		} else if matchID > -1 && matchLH >= 0 {
+			lexeme := make([]byte, 0, matchLH+1)
+			for _, c := range buf[:matchLH+1] {
+				lexeme = append(lexeme, c.Byte)
+			}
+			match := &machines.Match{
+				PC:          matchID,
+				TC:          buf[0].TC,
+				StartLine:   buf[0].Line,
+				StartColumn: buf[0].Column,
+				EndLine:     buf[matchLH].Line,
+				EndColumn:   buf[matchLH].Column,
+				Bytes:       lexeme,
+			}
+			text.Advance(matchLH + 1)
+			return match, nil, scan
+		} else {
+			lexeme := make([]byte, 0, len(buf))
+			for _, c := range buf {
+				lexeme = append(lexeme, c.Byte)
+			}
+			err := &machines.UnconsumedInput{
+				StartTC:     buf[0].TC,
+				FailTC:      buf[len(buf)-1].TC,
+				StartLine:   buf[0].Line,
+				StartColumn: buf[0].Column,
+				FailLine:    buf[len(buf)-1].Line,
+				FailColumn:  buf[len(buf)-1].Column,
+				Text:        lexeme,
+			}
+			return nil, err, scan
+		}
+	}
+	return scan
+}
diff --git a/token.go b/token.go
new file mode 100644
index 0000000..d596b24
--- /dev/null
+++ b/token.go
@@ -0,0 +1,52 @@
+package lexmachine
+
+import (
+	"bytes"
+	"fmt"
+)
+
+// Token is an optional token representation you could use to represent the
+// tokens produced by a lexer built with lexmachine.
+//
+// Here is an example for constructing a lexer Action which turns a
+// machines.Match struct into a token using the scanners Token helper function.
+//
+//	func token(name string, tokenIds map[string]int) lex.Action {
+//		return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
+//			return s.Token(tokenIds[name], string(m.Bytes), m), nil
+//		}
+//	}
+//
+type Token struct {
+	Type        int
+	Value       interface{}
+	Lexeme      []byte
+	TC          int
+	StartLine   int
+	StartColumn int
+	EndLine     int
+	EndColumn   int
+}
+
+// Equals checks the equality of two tokens ignoring the Value field.
+func (t *Token) Equals(other *Token) bool {
+	if t == nil && other == nil {
+		return true
+	} else if t == nil {
+		return false
+	} else if other == nil {
+		return false
+	}
+	return t.TC == other.TC &&
+		t.StartLine == other.StartLine &&
+		t.StartColumn == other.StartColumn &&
+		t.EndLine == other.EndLine &&
+		t.EndColumn == other.EndColumn &&
+		bytes.Equal(t.Lexeme, other.Lexeme) &&
+		t.Type == other.Type
+}
+
+// String formats the token in a human readable form.
+func (t *Token) String() string {
+	return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
+}