40 changes: 40 additions & 0 deletions doc/Tokenizer.md
@@ -51,3 +51,43 @@ For some languages, like VB.NET, it has a context-sensitive tokenizer. You could
## NOTE

`buildLexer` only accepts regular expressions of the form `/^xxx/g`: each pattern must be anchored with `^` and carry the `g` flag.
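
For instance, the first pattern below is accepted, while the second is rejected
at construction time (a minimal sketch; the token kind `K` is purely
illustrative):

```typescript
import { buildLexer } from 'typescript-parsec';

enum K { Number }

// Accepted: anchored with `^` and flagged `g`.
buildLexer([[true, /^\d+/g, K.Number]]);

// Rejected: the `^` anchor is missing, so the lexer throws an Error
// instead of silently scanning for matches mid-input.
try {
    buildLexer([[true, /\d+/g, K.Number]]);
} catch (e) {
    console.error((e as Error).message);
}
```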

## Stateful tokenization

Internally, the lexer maintains a stack of states that grows and shrinks as it
tokenizes. A state is defined as the set of rules that the lexer uses to
tokenize the input. For instance, in each of the examples shown above,
`buildLexer` created a lexer with a single state consisting of three rules.
Stateful tokenization is useful if you want to provide different rules to the
lexer based on previously matched tokens.

The following example shows a lexer that tokenizes nested block comments. Start
by looking at the set of top-level rules passed to `buildLexer`. These rules
look standard, except for the rule that recognizes a `TokenKind.CommentBegin`.
When a rule contains a fourth element and that rule is matched, the lexer
switches to a different state. In this case, the fourth element tells the lexer
to switch to the `blockComment` state by pushing that state onto its internal
stack. A state is defined analogously to the top-level state passed to
`buildLexer`. When the tokenizer switches to another state, only the rules
defined inside that state apply until the tokenizer leaves the state again. To
leave a state, set the fourth element of a rule to `'pop'`, which pops the
state off the lexer's internal stack. To push the state you are already in onto
the stack again, use the `'push'` directive. When the fourth element of a rule
is omitted, the lexer remains in its current state.

```typescript
const blockComment: LexerState<TokenKind> = [
    [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
    [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
    [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
];

const tokenizer = buildLexer([
    [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
    [true, /^\d+/g, TokenKind.Number],
    [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
    [false, /^,/g, TokenKind.Comma],
    [false, /^\s+/g, TokenKind.Space],
]);
```
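
For a quick sanity check, here is a minimal sketch of driving this tokenizer,
assuming the `TokenKind` enum members used above. Rules marked `false` are
discarded, so `CommentBegin`, `CommentEnd`, and `Space` tokens never surface:

```typescript
let token = tokenizer.parse(`123 /* abc /*456*/*/ def`);
while (token !== undefined) {
    console.log(TokenKind[token.kind], JSON.stringify(token.text));
    token = token.next;
}
// Number "123"
// CommentContents " abc "
// CommentContents "456"
// Identifier "def"
```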
66 changes: 56 additions & 10 deletions packages/ts-parsec/src/Lexer.ts
@@ -83,17 +83,38 @@ class TokenImpl<T> implements Token<T> {
    }
}

class LexerImpl<T> implements Lexer<T> {
    constructor(public rules: [boolean, RegExp, T][]) {
        for (const rule of this.rules) {
            if (rule[1].source[0] !== '^') {
                throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${rule[1].source}`);
            }
            if (!rule[1].global) {
                throw new Error(`Regular expression patterns for a tokenizer should be global: ${rule[1].source}`);
export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
export type LexerState<T> = LexerRule<T>[];

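// Validates every state reachable from the entry rules: each pattern must be
// anchored with '^' and carry the global flag. The memo set guards against
// cycles between mutually recursive states.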
function analyzeLexerRules<T>(
    rules: LexerState<T>,
    memo: Set<LexerState<T>> = new Set(),
): void {
    memo.add(rules);
    for (const [, regex, , state] of rules) {
        if (regex.source[0] !== '^') {
            throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
        }
        if (!regex.global) {
            throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
        }
        if (state !== undefined && state !== 'pop' && state !== 'push') {
            // Recurse into states not seen before; skipping visited states
            // (rather than returning) keeps the remaining rules of the
            // current state validated.
            if (!memo.has(state)) {
                analyzeLexerRules(state, memo);
            }
        }
    }
}

class LexerImpl<T> implements Lexer<T> {
    private states: LexerState<T>[] = [this.rules];

    constructor(public rules: LexerState<T>) {
        analyzeLexerRules(rules);
    }

    public parse(input: string): TokenImpl<T> | undefined {
        return this.parseNextAvailable(input, 0, 1, 1);
@@ -106,7 +127,9 @@ class LexerImpl<T> implements Lexer<T> {

        const subString = input.substr(indexStart);
        let result: TokenImpl<T> | undefined;
        for (const [keep, regexp, kind] of this.rules) {
        const currentRuleset = this.states[this.states.length - 1];
        let nextState: LexerState<T> | 'push' | 'pop' | undefined;
        for (const [keep, regexp, kind, next] of currentRuleset) {
            regexp.lastIndex = 0;
            if (regexp.test(subString)) {
                const text = subString.substr(0, regexp.lastIndex);
@@ -123,6 +146,7 @@ class LexerImpl<T> implements Lexer<T> {
                const newResult = new TokenImpl<T>(this, input, kind, text, { index: indexStart, rowBegin, columnBegin, rowEnd, columnEnd }, keep);
                if (result === undefined || result.text.length < newResult.text.length) {
                    result = newResult;
                    nextState = next;
                }
            }
        }
@@ -133,6 +157,13 @@ class LexerImpl<T> implements Lexer<T> {
                `Unable to tokenize the rest of the input: ${input.substr(indexStart)}`
            );
        } else {
            if (nextState === 'pop') {
                this.states.pop();
            } else if (nextState === 'push') {
                this.states.push(currentRuleset);
            } else if (nextState !== undefined) {
                this.states.push(nextState);
            }
            return result;
        }
    }
@@ -155,6 +186,21 @@ class LexerImpl<T> implements Lexer<T> {
    }
}

export function buildLexer<T>(rules: [boolean, RegExp, T][]): Lexer<T> {
export function buildLexer<T>(rules: LexerState<T>): Lexer<T> {
    return new LexerImpl<T>(rules);
}

// TESTING

const statements: LexerState<string> = [];
const stringLiteral: LexerState<string> = [];

statements.push(
    [true, /^"/g, "stringDelimiter", stringLiteral],
);

stringLiteral.push(
    [true, /^\${/g, "stringInterpolationDelimiter", statements],
);

buildLexer(statements);
52 changes: 51 additions & 1 deletion packages/tspc-test/src/TestLexer.ts
@@ -4,7 +4,7 @@
// tslint:disable:trailing-comma

import * as assert from 'assert';
import { buildLexer } from 'typescript-parsec';
import { buildLexer, LexerState } from 'typescript-parsec';

function notUndefined<T>(t: T | undefined): T {
    assert.notStrictEqual(t, undefined);
@@ -133,3 +133,53 @@ test(`Lexer: identifiers and numbers with discardable commas and spaces`, () =>

    assert.strictEqual(token, undefined);
});

test(`Lexer: C-style nested block comments via lexer states`, () => {
    enum TokenKind {
        CommentBegin,
        CommentEnd,
        CommentContents,
        Number,
        Identifier,
        Comma,
        Space,
    }

    const blockComment: LexerState<TokenKind> = [
        [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
        [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
        [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
    ];

    const lexer = buildLexer([
        [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
        [true, /^\d+/g, TokenKind.Number],
        [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
        [false, /^,/g, TokenKind.Comma],
        [false, /^\s+/g, TokenKind.Space],
    ]);

    let token = lexer.parse(`123 /* abc /*456*/*/ def`);

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.Number);
    assert.strictEqual(token.text, '123');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.CommentContents);
    assert.strictEqual(token.text, ' abc ');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.CommentContents);
    assert.strictEqual(token.text, '456');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.Identifier);
    assert.strictEqual(token.text, 'def');
    token = token.next;

    assert.strictEqual(token, undefined);
});