40 changes: 40 additions & 0 deletions doc/Tokenizer.md
@@ -51,3 +51,43 @@ For some languages, like VB.NET, it has a context-sensitive tokenizer. You could
## NOTE

`buildLexer` only accepts regular expressions of the form `/^xxx/g`: each pattern must be anchored with `^` and carry the `g` flag.
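
For instance, the first pattern below is accepted, while the second is rejected
at construction time (a minimal sketch; the token kind `K` is purely
illustrative):

```typescript
import { buildLexer } from 'typescript-parsec';

enum K { Number }

// Accepted: anchored with `^` and flagged `g`.
buildLexer([[true, /^\d+/g, K.Number]]);

// Rejected: the `^` anchor is missing, so the lexer throws an Error
// instead of silently scanning for matches mid-input.
try {
    buildLexer([[true, /\d+/g, K.Number]]);
} catch (e) {
    console.error((e as Error).message);
}
```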

## Stateful tokenization

Internally, the lexer maintains a stack of states that grows and shrinks as it
tokenizes. A state is defined as the set of rules that the lexer uses to
tokenize the input. For instance, in each of the examples shown above,
`buildLexer` created a lexer with a single state consisting of three rules.
Stateful tokenization is useful if you want to provide different rules to the
lexer based on previously matched tokens.

The following example shows a lexer that tokenizes nested block comments. Start
by looking at the set of top-level rules passed to `buildLexer`. These rules
look standard, except for the rule that recognizes a `TokenKind.CommentBegin`.
When a rule contains a fourth element and that rule is matched, the lexer
switches to a different state. In this case, the fourth element tells the lexer
to switch to the `blockComment` state by pushing that state onto its internal
stack. A state is defined analogously to the top-level state passed to
`buildLexer`. When the tokenizer switches to another state, only the rules
defined inside that state apply until the tokenizer leaves the state again. To
leave a state, set the fourth element of a rule to `'pop'`, which pops the
state off the lexer's internal stack. To push the state you are already in onto
the stack again, use the `'push'` directive. When the fourth element of a rule
is omitted, the lexer remains in its current state.

```typescript
const blockComment: LexerState<TokenKind> = [
    [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
    [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
    [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
];

const tokenizer = buildLexer([
    [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
    [true, /^\d+/g, TokenKind.Number],
    [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
    [false, /^,/g, TokenKind.Comma],
    [false, /^\s+/g, TokenKind.Space],
]);
```
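
For a quick sanity check, here is a minimal sketch of driving this tokenizer,
assuming the `TokenKind` enum members used above. Rules marked `false` are
discarded, so `CommentBegin`, `CommentEnd`, and `Space` tokens never surface:

```typescript
let token = tokenizer.parse(`123 /* abc /*456*/*/ def`);
while (token !== undefined) {
    console.log(TokenKind[token.kind], JSON.stringify(token.text));
    token = token.next;
}
// Number "123"
// CommentContents " abc "
// CommentContents "456"
// Identifier "def"
```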
66 changes: 56 additions & 10 deletions packages/ts-parsec/src/Lexer.ts
@@ -83,17 +83,38 @@ class TokenImpl<T> implements Token<T> {
    }
}

class LexerImpl<T> implements Lexer<T> {
    constructor(public rules: [boolean, RegExp, T][]) {
        for (const rule of this.rules) {
            if (rule[1].source[0] !== '^') {
                throw new Error(`Regular expression patterns for a tokenizer should start with "^": ${rule[1].source}`);
            }
            if (!rule[1].global) {
                throw new Error(`Regular expression patterns for a tokenizer should be global: ${rule[1].source}`);
export type LexerRule<T> = [boolean, RegExp, T, (LexerRule<T>[] | 'push' | 'pop')?];
export type LexerState<T> = LexerRule<T>[];

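// Validates every state reachable from the entry rules: each pattern must be
// anchored with '^' and carry the global flag. The memo set guards against
// cycles between mutually recursive states.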
function analyzeLexerRules<T>(
    rules: LexerState<T>,
    memo: Set<LexerState<T>> = new Set(),
): void {
    memo.add(rules);
    for (const [, regex, , state] of rules) {
        if (regex.source[0] !== '^') {
            throw new Error(`Regular expression patterns for a tokenizer should start with '^': ${regex.source}`);
        }
        if (!regex.global) {
            throw new Error(`Regular expression patterns for a tokenizer should be global: ${regex.source}`);
        }
        if (state !== undefined && state !== 'pop' && state !== 'push') {
            // Recurse into states not seen before; skipping visited states
            // (rather than returning) keeps the remaining rules of the
            // current state validated.
            if (!memo.has(state)) {
                analyzeLexerRules(state, memo);
            }
        }
    }
}

class LexerImpl<T> implements Lexer<T> {
    private states: LexerState<T>[] = [this.rules];

    constructor(public rules: LexerState<T>) {
        analyzeLexerRules(rules);
    }

    public parse(input: string): TokenImpl<T> | undefined {
        return this.parseNextAvailable(input, 0, 1, 1);
@@ -106,7 +127,9 @@ class LexerImpl<T> implements Lexer<T> {

        const subString = input.substr(indexStart);
        let result: TokenImpl<T> | undefined;
        for (const [keep, regexp, kind] of this.rules) {
        const currentRuleset = this.states[this.states.length - 1];
        let nextState: LexerState<T> | 'push' | 'pop' | undefined;
        for (const [keep, regexp, kind, next] of currentRuleset) {
            regexp.lastIndex = 0;
            if (regexp.test(subString)) {
                const text = subString.substr(0, regexp.lastIndex);
@@ -123,6 +146,7 @@ class LexerImpl<T> implements Lexer<T> {
                const newResult = new TokenImpl<T>(this, input, kind, text, { index: indexStart, rowBegin, columnBegin, rowEnd, columnEnd }, keep);
                if (result === undefined || result.text.length < newResult.text.length) {
                    result = newResult;
                    nextState = next;
                }
            }
        }
@@ -133,6 +157,13 @@ class LexerImpl<T> implements Lexer<T> {
                `Unable to tokenize the rest of the input: ${input.substr(indexStart)}`
            );
        } else {
            if (nextState === 'pop') {
                this.states.pop();
            } else if (nextState === 'push') {
                this.states.push(currentRuleset);
            } else if (nextState !== undefined) {
                this.states.push(nextState);
            }
            return result;
        }
    }
@@ -155,6 +186,21 @@ class LexerImpl<T> implements Lexer<T> {
    }
}

export function buildLexer<T>(rules: [boolean, RegExp, T][]): Lexer<T> {
export function buildLexer<T>(rules: LexerState<T>): Lexer<T> {
    return new LexerImpl<T>(rules);
}

// TESTING

const statements: LexerState<string> = [];
const stringLiteral: LexerState<string> = [];

statements.push(
    [true, /^"/g, "stringDelimiter", stringLiteral],
);

stringLiteral.push(
    [true, /^\${/g, "stringInterpolationDelimiter", statements],
);

buildLexer(statements);
52 changes: 51 additions & 1 deletion packages/tspc-test/src/TestLexer.ts
@@ -4,7 +4,7 @@
// tslint:disable:trailing-comma

import * as assert from 'assert';
import { buildLexer } from 'typescript-parsec';
import { buildLexer, LexerState } from 'typescript-parsec';

function notUndefined<T>(t: T | undefined): T {
    assert.notStrictEqual(t, undefined);
@@ -133,3 +133,53 @@ test(`Lexer: identifiers and numbers with discardable commas and spaces`, () =>

    assert.strictEqual(token, undefined);
});

test(`Lexer: C-style nested block comments via lexer states`, () => {
    enum TokenKind {
        CommentBegin,
        CommentEnd,
        CommentContents,
        Number,
        Identifier,
        Comma,
        Space,
    }

    const blockComment: LexerState<TokenKind> = [
        [false, /^\/\*/g, TokenKind.CommentBegin, 'push'], // nested comment
        [false, /^\*\//g, TokenKind.CommentEnd, 'pop'],
        [true, /^(?:(?!\/\*|\*\/).)+/g, TokenKind.CommentContents],
    ];

    const lexer = buildLexer([
        [false, /^\/\*/g, TokenKind.CommentBegin, blockComment],
        [true, /^\d+/g, TokenKind.Number],
        [true, /^[a-zA-Z]\w*/g, TokenKind.Identifier],
        [false, /^,/g, TokenKind.Comma],
        [false, /^\s+/g, TokenKind.Space],
    ]);

    let token = lexer.parse(`123 /* abc /*456*/*/ def`);

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.Number);
    assert.strictEqual(token.text, '123');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.CommentContents);
    assert.strictEqual(token.text, ' abc ');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.CommentContents);
    assert.strictEqual(token.text, '456');
    token = token.next;

    token = notUndefined(token);
    assert.strictEqual(token.kind, TokenKind.Identifier);
    assert.strictEqual(token.text, 'def');
    token = token.next;

    assert.strictEqual(token, undefined);
});