diff --git a/doc/Tokenizer.md b/doc/Tokenizer.md
index 4b7b5b7..d8022d6 100644
--- a/doc/Tokenizer.md
+++ b/doc/Tokenizer.md
@@ -51,3 +51,43 @@ For some languages, like VB.NET, it has a context sensitive tokenizer. You could
 
 ## NOTE
 `buildLexer` only accepts regular expressions like this: `/^xxx/g`.
+
+## Stateful tokenization
+
+Internally, the lexer maintains a stack of states that you can grow. A state is
+defined as the set of rules that the lexer uses to tokenize the input. For
+instance, each of the examples shown above used `buildLexer` to create a lexer
+with a single state containing three rules. Stateful tokenization is useful
+when you want to apply different rules to the input depending on previously
+matched tokens.
+
+The following example shows a lexer that tokenizes nested block comments. Start
+by looking at the set of top-level rules passed to `buildLexer`. These rules
+look standard, except for the rule that produces `TokenKind.CommentBegin`. When
+a rule contains a fourth element and that rule is matched, the lexer switches
+to a different state. In this case, the fourth element tells the lexer to
+switch to the `blockComment` state by pushing it onto the internal stack. A
+state is defined almost exactly like the top-level rules passed to
+`buildLexer`. While the tokenizer is in a state, only the rules defined inside
+that state apply, until the tokenizer leaves the state again. To leave a state,
+set the fourth element of a rule to `'pop'`, which pops the state off the
+lexer's internal stack. To push another copy of the current state onto the
+stack, use the `'push'` directive. When the fourth element of a rule is
+omitted, the lexer remains in its current state.
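+
+Concretely, each rule is a `LexerRule<T>` and a state is just an array of
+rules. `LexerState` can be imported from `typescript-parsec` when you need to
+annotate a state that other rules refer to:
+
+```typescript
+type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
+type LexerState<T> = LexerRule<T>[];
+```
+
+The nested block comment example below exercises every form of the fourth
+element: an explicit state, `'push'`, `'pop'`, and omission.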
+
+```typescript
+const blockComment: LexerState<TokenKind> = [
+    [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
+    [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
+    [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
+];
+
+const tokenizer = buildLexer([
+    [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
+    [true, /^\d+/g, TokenKind.Number],
+    [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
+    [false, /^,/g, TokenKind.Comma],
+    [false, /^\s+/g, TokenKind.Space],
+]);
+```
diff --git a/packages/ts-parsec/src/Lexer.ts b/packages/ts-parsec/src/Lexer.ts
index 8f82ed4..b9decee 100644
--- a/packages/ts-parsec/src/Lexer.ts
+++ b/packages/ts-parsec/src/Lexer.ts
@@ -83,17 +83,38 @@ class TokenImpl<T> implements Token<T> {
     }
 }
 
-class LexerImpl<T> implements Lexer<T> {
-    constructor(public rules: [boolean, RegExp, T][]) {
-        for (const rule of this.rules) {
-            if (rule[1].source[0] !== '^') {
-                throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${rule[1].source}`);
-            }
-            if (!rule[1].global) {
-                throw new Error(`Regular expression patterns for a tokenizer should be global: ${rule[1].source}`);
-            }
-        }
-    }
+export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
+export type LexerState<T> = LexerRule<T>[];
+
+function analyzeLexerRules<T>(
+    rules: LexerState<T>,
+    memo: Set<LexerState<T>> = new Set(),
+): void {
+    memo.add(rules);
+    for (const [, regex, , state] of rules) {
+        if (regex.source[0] !== '^') {
+            throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
+        }
+        if (!regex.global) {
+            throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
+        }
+        if (state !== undefined) {
+            if (state !== 'pop' && state !== 'push') {
+                if (memo.has(state)) {
+                    continue;
+                }
+                analyzeLexerRules(state, memo);
+            }
+        }
+    }
+}
+
+class LexerImpl<T> implements Lexer<T> {
+    private states: LexerState<T>[] = [this.rules];
+
+    constructor(public rules: LexerState<T>) {
+        analyzeLexerRules(rules);
+    }
 
     public parse(input: string): TokenImpl<T> | undefined {
         return this.parseNextAvailable(input, 0, 1, 1);
@@ -106,7 +127,9 @@ class LexerImpl<T> implements Lexer<T> {
         const subString = input.substr(indexStart);
         let result: TokenImpl<T> | undefined;
 
-        for (const [keep, regexp, kind] of this.rules) {
+        const currentRuleset = this.states[this.states.length - 1];
+        let nextState: LexerState<T> | 'push' | 'pop' | undefined;
+        for (const [keep, regexp, kind, next] of currentRuleset) {
             regexp.lastIndex = 0;
             if (regexp.test(subString)) {
                 const text = subString.substr(0, regexp.lastIndex);
@@ -123,6 +146,7 @@ class LexerImpl<T> implements Lexer<T> {
                 const newResult = new TokenImpl<T>(this, input, kind, text, { index: indexStart, rowBegin, columnBegin, rowEnd, columnEnd }, keep);
                 if (result === undefined || result.text.length < newResult.text.length) {
                     result = newResult;
+                    nextState = next;
                 }
             }
         }
@@ -133,6 +157,13 @@ class LexerImpl<T> implements Lexer<T> {
                 `Unable to tokenize the rest of the input: ${input.substr(indexStart)}`
             );
         } else {
+            if (nextState === 'pop') {
+                this.states.pop();
+            } else if (nextState === 'push') {
+                this.states.push(currentRuleset);
+            } else if (nextState !== undefined) {
+                this.states.push(nextState);
+            }
             return result;
         }
     }
@@ -155,6 +186,21 @@ class LexerImpl<T> implements Lexer<T> {
     }
 }
 
-export function buildLexer<T>(rules: [boolean, RegExp, T][]): Lexer<T> {
+export function buildLexer<T>(rules: LexerState<T>): Lexer<T> {
     return new LexerImpl<T>(rules);
 }
+
+// TESTING
+
+const statements: LexerState<string> = [];
+const stringLiteral: LexerState<string> = [];
+
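+// The two pushes below populate states that intentionally reference each
+// other: a string literal can contain a `${ ... }` interpolation whose
+// statements can, in turn, contain further string literals. This cycle is
+// exactly what the `memo` set in `analyzeLexerRules` guards against.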
+statements.push(
+    [true, /^"/g, 'stringDelimiter', stringLiteral],
+);
+
+stringLiteral.push(
+    [true, /^\${/g, 'stringInterpolationDelimiter', statements],
+);
+
+buildLexer(statements);
diff --git a/packages/tspc-test/src/TestLexer.ts b/packages/tspc-test/src/TestLexer.ts
index 59f65ef..653e4d9 100644
--- a/packages/tspc-test/src/TestLexer.ts
+++ b/packages/tspc-test/src/TestLexer.ts
@@ -4,7 +4,7 @@
 // tslint:disable:trailing-comma
 
 import * as assert from 'assert';
-import { buildLexer } from 'typescript-parsec';
+import { buildLexer, LexerState } from 'typescript-parsec';
 
 function notUndefined<T>(t: T | undefined): T {
     assert.notStrictEqual(t, undefined);
@@ -133,3 +133,53 @@ test(`Lexer: identifiers and numbers with discardable commas and spaces`, () => {
 
     assert.strictEqual(token, undefined);
 });
+
+test(`Lexer: C-style nested block comments via lexer states`, () => {
+    enum TokenKind {
+        CommentBegin,
+        CommentEnd,
+        CommentContents,
+        Number,
+        Identifier,
+        Comma,
+        Space,
+    }
+
+    const blockComment: LexerState<TokenKind> = [
+        [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
+        [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
+        [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
+    ];
+
+    const lexer = buildLexer([
+        [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
+        [true, /^\d+/g, TokenKind.Number],
+        [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
+        [false, /^,/g, TokenKind.Comma],
+        [false, /^\s+/g, TokenKind.Space],
+    ]);
+
+    let token = lexer.parse(`123 /* abc /*456*/*/ def`);
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.Number);
+    assert.strictEqual(token.text, '123');
+    token = token.next;
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.CommentContents);
+    assert.strictEqual(token.text, ' abc ');
+    token = token.next;
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.CommentContents);
+    assert.strictEqual(token.text, '456');
+    token = token.next;
+
+    token = notUndefined(token);
+    assert.strictEqual(token.kind, TokenKind.Identifier);
+    assert.strictEqual(token.text, 'def');
+    token = token.next;
+
+    assert.strictEqual(token, undefined);
+});
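
Note (not part of this patch): the `// TESTING` block in `Lexer.ts` only sketches the mutually recursive states. Below is one way it could be fleshed out into a working string-interpolation lexer. The token kinds `identifier`, `space`, `interpolationEnd`, and `stringContents` and their rules are hypothetical additions for illustration, and the sketch assumes balanced delimiters (an unmatched `}` or `"` would pop the root state off the stack).

```typescript
import { buildLexer, LexerState } from 'typescript-parsec';

const statements: LexerState<string> = [];
const stringLiteral: LexerState<string> = [];

statements.push(
    // `"` opens a string literal: switch to the stringLiteral state.
    [true, /^"/g, 'stringDelimiter', stringLiteral],
    // Hypothetical rules for ordinary statement content.
    [true, /^[a-zA-Z]\w*/g, 'identifier'],
    [false, /^\s+/g, 'space'],
    // `}` closes an interpolation block: pop back into the enclosing string.
    [true, /^\}/g, 'interpolationEnd', 'pop'],
);

stringLiteral.push(
    // `${` re-enters statement context inside the string.
    [true, /^\${/g, 'stringInterpolationDelimiter', statements],
    // A closing `"` pops back to the surrounding statement context.
    [true, /^"/g, 'stringDelimiter', 'pop'],
    // Everything up to the next `${` or `"` is string content.
    [true, /^(?:(?!\$\{|").)+/g, 'stringContents'],
);

const lexer = buildLexer(statements);

// `"a ${ b "c" } d"` nests string -> interpolation -> string and back out.
let token = lexer.parse('"a ${ b "c" } d"');
while (token !== undefined) {
    console.log(token.kind, JSON.stringify(token.text));
    token = token.next;
}
```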