diff --git a/.gitignore b/.gitignore index 4fc9b28f88a5..3e7b0fbc6782 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,6 @@ serena/ .beads/ AGENTS.md +# Generated benchmark output +packages/pyright-internal/src/tests/benchmarks/.generated/ + diff --git a/packages/pyright-internal/package.json b/packages/pyright-internal/package.json index 990232d58b82..aa4d7a82006a 100644 --- a/packages/pyright-internal/package.json +++ b/packages/pyright-internal/package.json @@ -13,9 +13,10 @@ "clean": "shx rm -rf ./dist ./out", "webpack:testserver": "webpack --config ./src/tests/lsp/webpack.testserver.config.js --mode=development", "webpack:testserver:watch": "npm run clean && webpack --config ./src/tests/lsp/webpack.testserver.config.js --mode development --watch --progress", - "test": "npm run webpack:testserver && node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit", - "test:norebuild": "node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit", - "test:coverage": "node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit --reporters=jest-junit --reporters=default --coverage --coverageReporters=cobertura --coverageReporters=html --coverageReporters=json", + "test": "npm run webpack:testserver && node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit --testPathIgnorePatterns src/tests/benchmarks", + "test:norebuild": "node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit --testPathIgnorePatterns src/tests/benchmarks", + "test:benchmark": "cross-env PYRIGHT_RUN_BENCHMARKS=1 node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit --testTimeout=300000 --runInBand --detectOpenHandles src/tests/benchmarks", + "test:coverage": "node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest --forceExit --testPathIgnorePatterns src/tests/benchmarks --reporters=jest-junit --reporters=default 
--coverage --coverageReporters=cobertura --coverageReporters=html --coverageReporters=json", "test:imports": "node --max-old-space-size=8192 --expose-gc ./node_modules/jest/bin/jest importResolver.test --forceExit --runInBand" }, "dependencies": { diff --git a/packages/pyright-internal/src/analyzer/sourceFile.ts b/packages/pyright-internal/src/analyzer/sourceFile.ts index 776801ecabca..feb92ad716c1 100644 --- a/packages/pyright-internal/src/analyzer/sourceFile.ts +++ b/packages/pyright-internal/src/analyzer/sourceFile.ts @@ -774,7 +774,7 @@ export class SourceFile { this._writableData.taskListDiagnostics = []; this._addTaskListDiagnostics( configOptions.taskListTokens, - parseFileResults.tokenizerOutput, + parseFileResults, this._writableData.taskListDiagnostics ); }); @@ -1327,13 +1327,16 @@ export class SourceFile { // to the specified diagnostic list. private _addTaskListDiagnostics( taskListTokens: TaskListToken[] | undefined, - tokenizerOutput: TokenizerOutput, + parseFileResults: ParseFileResults, diagList: Diagnostic[] ) { if (!taskListTokens || taskListTokens.length === 0 || !diagList) { return; } + const tokenizerOutput = parseFileResults.tokenizerOutput; + const fileContents = parseFileResults.text; + for (let i = 0; i < tokenizerOutput.tokens.count; i++) { const token = tokenizerOutput.tokens.getItemAt(i); @@ -1343,36 +1346,65 @@ export class SourceFile { } for (const comment of token.comments) { - for (const token of taskListTokens) { - // Check if the comment matches the task list token. - // The comment must start with zero or more whitespace characters, - // followed by the taskListToken (case insensitive), - // followed by (0+ whitespace + EOL) OR (1+ NON-alphanumeric characters) - const regexStr = '^[\\s]*' + token.text + '([\\s]*$|[\\W]+)'; - const regex = RegExp(regexStr, 'i'); // case insensitive - - // If the comment doesn't match, skip it. 
- if (!regex.test(comment.value)) { + for (const taskToken of taskListTokens) { + // Match: optional leading whitespace, then taskToken.text (case-insensitive), + // then either (whitespace to end) or (non-alphanumeric char). + const commentStart = comment.start; + const commentEnd = commentStart + comment.length; + const taskText = taskToken.text; + const taskLen = taskText.length; + + // Skip leading whitespace within the source text range. + let pos = commentStart; + while (pos < commentEnd) { + const ch = fileContents.charCodeAt(pos); + if (ch === 0x20 || ch === 0x09 || ch === 0x0a || ch === 0x0d || ch === 0x0c || ch === 0x0b) { + pos++; + } else { + break; + } + } + + // Check if the task token text matches (case-insensitive). + if (pos + taskLen > commentEnd) { continue; } - // Calculate the range for the diagnostic. This allows navigation - // to the comment via double clicking the item in the task list pane. - let rangeStart = comment.start; + let matched = true; + for (let k = 0; k < taskLen; k++) { + const a = fileContents.charCodeAt(pos + k); + const b = taskText.charCodeAt(k); + if (a !== b && (a | 0x20) !== (b | 0x20)) { + matched = false; + break; + } + } + if (!matched) { + continue; + } - // The comment technically starts right after the comment identifier(#), - // but we want the caret right before the task list token (since there - // might be whitespace before it). - const indexOfToken = comment.value.toLowerCase().indexOf(token.text.toLowerCase()); - rangeStart += indexOfToken; + // After the token, require whitespace-to-end or a non-word character. + const afterPos = pos + taskLen; + if (afterPos < commentEnd) { + const ch = fileContents.charCodeAt(afterPos); + // Check if ch is a word character [a-zA-Z0-9_] + const isWord = + (ch >= 0x61 && ch <= 0x7a) || + (ch >= 0x41 && ch <= 0x5a) || + (ch >= 0x30 && ch <= 0x39) || + ch === 0x5f; + if (isWord) { + continue; + } + } + // Match succeeded. 
pos is the offset of the task token in the source text. const rangeEnd = TextRange.getEnd(comment); - const range = convertOffsetsToRange(rangeStart, rangeEnd, tokenizerOutput.lines!); + const range = convertOffsetsToRange(pos, rangeEnd, tokenizerOutput.lines!); - // Add the diagnostic to the list and trim whitespace from the comment so - // it's easier to read in the task list. + const commentValue = comment.value; diagList.push( - new Diagnostic(DiagnosticCategory.TaskItem, comment.value.trim(), range, token.priority) + new Diagnostic(DiagnosticCategory.TaskItem, commentValue.trim(), range, taskToken.priority) ); } } diff --git a/packages/pyright-internal/src/parser/characterStream.ts b/packages/pyright-internal/src/parser/characterStream.ts index a7065bffd1b9..4960c552edce 100644 --- a/packages/pyright-internal/src/parser/characterStream.ts +++ b/packages/pyright-internal/src/parser/characterStream.ts @@ -108,8 +108,30 @@ export class CharacterStream { } skipWhitespace(): void { - while (!this.isEndOfStream() && this.isAtWhiteSpace()) { - this.moveNext(); + // Tight loop: advance _position/_currentChar directly while the + // current char is a space/tab/form-feed. Avoids the method-call + // overhead of moveNext() + isAtWhiteSpace() + isWhiteSpace() per + // iteration, which is one of the hottest paths in tokenization. 
+ const text = this._text; + const len = text.length; + let pos = this._position; + while (pos < len) { + const ch = text.charCodeAt(pos); + if (ch === Char.Space || ch === Char.Tab || ch === Char.FormFeed) { + pos++; + } else { + break; + } + } + if (pos !== this._position) { + this._position = pos; + if (pos >= len) { + this._isEndOfStream = true; + this._position = len; + this._currentChar = 0; + } else { + this._currentChar = text.charCodeAt(pos); + } } } diff --git a/packages/pyright-internal/src/parser/parser.ts b/packages/pyright-internal/src/parser/parser.ts index 4e6e86f63de8..212e7a41928d 100644 --- a/packages/pyright-internal/src/parser/parser.ts +++ b/packages/pyright-internal/src/parser/parser.ts @@ -232,6 +232,8 @@ const maxChildNodeDepth = 256; export class Parser { private _fileContents?: string; private _tokenizerOutput?: TokenizerOutput; + private _tokens?: TextRangeCollection; + private _tokenCount = 0; private _tokenIndex = 0; private _areErrorsSuppressed = false; private _parseOptions: ParseOptions = new ParseOptions(); @@ -406,6 +408,8 @@ export class Parser { initialParenDepth, this._parseOptions.useNotebookMode ); + this._tokens = this._tokenizerOutput.tokens; + this._tokenCount = this._tokens.count; this._tokenIndex = 0; } @@ -5259,7 +5263,7 @@ export class Parser { } private _getNextToken(): Token { - const token = this._tokenizerOutput!.tokens.getItemAt(this._tokenIndex); + const token = this._tokens!.getItemAt(this._tokenIndex); if (!this._atEof()) { this._tokenIndex++; } @@ -5270,19 +5274,20 @@ export class Parser { private _atEof(): boolean { // Are we pointing at the last token in the stream (which is // assumed to be an end-of-stream token)? 
- return this._tokenIndex >= this._tokenizerOutput!.tokens.count - 1; + return this._tokenIndex >= this._tokenCount - 1; } private _peekToken(count = 0): Token { - if (this._tokenIndex + count < 0) { - return this._tokenizerOutput!.tokens.getItemAt(0); + const targetIndex = this._tokenIndex + count; + if (targetIndex < 0) { + return this._tokens!.getItemAt(0); } - if (this._tokenIndex + count >= this._tokenizerOutput!.tokens.count) { - return this._tokenizerOutput!.tokens.getItemAt(this._tokenizerOutput!.tokens.count - 1); + if (targetIndex >= this._tokenCount) { + return this._tokens!.getItemAt(this._tokenCount - 1); } - return this._tokenizerOutput!.tokens.getItemAt(this._tokenIndex + count); + return this._tokens!.getItemAt(targetIndex); } private _peekTokenType(): TokenType { diff --git a/packages/pyright-internal/src/parser/tokenizer.ts b/packages/pyright-internal/src/parser/tokenizer.ts index 416532935708..2e75bb5351d8 100644 --- a/packages/pyright-internal/src/parser/tokenizer.ts +++ b/packages/pyright-internal/src/parser/tokenizer.ts @@ -92,6 +92,66 @@ const _keywords: Map = new Map([ const _softKeywords = new Set(['match', 'case', 'type']); +// Fast-reject table: keywords are 2–9 chars long and only start with these +// character codes. A 128-entry boolean table indexed by charCodeAt(0) rejects +// most identifiers without touching the _keywords Map. +const _keywordFirstCharTable: boolean[] = (() => { + const table = new Array(128).fill(false); + for (const kw of _keywords.keys()) { + const code = kw.charCodeAt(0); + if (code < 128) { + table[code] = true; + } + } + return table; +})(); + +const _keywordMinLen = 2; +const _keywordMaxLen = 9; // __debug__ + +interface KeywordEntry { + text: string; + type: KeywordType; +} + +// For keyword-like identifiers, compare directly against the source text slice +// to avoid creating temporary substring objects on the keyword path. 
+const _keywordEntriesByFirstChar: Array = (() => { + const entriesByFirstChar: Array = new Array(128); + for (const [text, type] of _keywords.entries()) { + const firstCharCode = text.charCodeAt(0); + if (firstCharCode < 128) { + const entries = entriesByFirstChar[firstCharCode] ?? (entriesByFirstChar[firstCharCode] = []); + entries.push({ text, type }); + } + } + return entriesByFirstChar; +})(); + +function getKeywordTypeFromTextSlice(text: string, start: number, length: number): KeywordType | undefined { + if (length < _keywordMinLen || length > _keywordMaxLen) { + return undefined; + } + + const firstCharCode = text.charCodeAt(start); + if (firstCharCode >= 128 || !_keywordFirstCharTable[firstCharCode]) { + return undefined; + } + + const candidates = _keywordEntriesByFirstChar[firstCharCode]; + if (!candidates) { + return undefined; + } + + for (const candidate of candidates) { + if (candidate.text.length === length && text.startsWith(candidate.text, start)) { + return candidate.type; + } + } + + return undefined; +} + const _operatorInfo: { [key: number]: OperatorFlags } = { [OperatorType.Add]: OperatorFlags.Unary | OperatorFlags.Binary, [OperatorType.AddEqual]: OperatorFlags.Assignment, @@ -138,17 +198,377 @@ const _operatorInfo: { [key: number]: OperatorFlags } = { [OperatorType.NotIn]: OperatorFlags.Binary, }; +const _unsetSingleCharOperatorType = -1; +const _singleCharOperatorTypeTable: Int16Array = (() => { + const table = new Int16Array(128); + table.fill(_unsetSingleCharOperatorType); + table[Char.Equal] = OperatorType.Assign; + table[Char.Plus] = OperatorType.Add; + table[Char.Hyphen] = OperatorType.Subtract; + table[Char.Asterisk] = OperatorType.Multiply; + table[Char.Slash] = OperatorType.Divide; + table[Char.Ampersand] = OperatorType.BitwiseAnd; + table[Char.Bar] = OperatorType.BitwiseOr; + table[Char.Caret] = OperatorType.BitwiseXor; + table[Char.Percent] = OperatorType.Mod; + table[Char.Tilde] = OperatorType.BitwiseInvert; + table[Char.At] = 
OperatorType.MatrixMultiply; + table[Char.Less] = OperatorType.LessThan; + table[Char.Greater] = OperatorType.GreaterThan; + return table; +})(); + +const _singleCharEqualOperatorTypeTable: Int16Array = (() => { + const table = new Int16Array(128); + table.fill(_unsetSingleCharOperatorType); + table[Char.Plus] = OperatorType.AddEqual; + table[Char.Hyphen] = OperatorType.SubtractEqual; + table[Char.Asterisk] = OperatorType.MultiplyEqual; + table[Char.Slash] = OperatorType.DivideEqual; + table[Char.Ampersand] = OperatorType.BitwiseAndEqual; + table[Char.Bar] = OperatorType.BitwiseOrEqual; + table[Char.Caret] = OperatorType.BitwiseXorEqual; + table[Char.Percent] = OperatorType.ModEqual; + table[Char.At] = OperatorType.MatrixMultiplyEqual; + return table; +})(); + +function getTwoCharKey(char1: number, char2: number): number { + return (char1 << 8) | char2; +} + +// Two-char operator/token tables: use Map instead of Int16Array(65536). +// With only 5+1 entries, a Map uses ~200 bytes vs 256KB for two Int16Arrays. 
+const _twoCharOperatorTypeMap = new Map([ + [getTwoCharKey(Char.Equal, Char.Equal), OperatorType.Equals], + [getTwoCharKey(Char.ExclamationMark, Char.Equal), OperatorType.NotEquals], + [getTwoCharKey(Char.Less, Char.Equal), OperatorType.LessThanOrEqual], + [getTwoCharKey(Char.Greater, Char.Equal), OperatorType.GreaterThanOrEqual], + [getTwoCharKey(Char.Less, Char.Greater), OperatorType.LessOrGreaterThan], +]); + +const _twoCharSpecialTokenTypeMap = new Map([ + [getTwoCharKey(Char.Hyphen, Char.Greater), TokenType.Arrow], +]); + +const _repeatedCharOperatorTypeTable: Int16Array = (() => { + const table = new Int16Array(128); + table.fill(_unsetSingleCharOperatorType); + table[Char.Asterisk] = OperatorType.Power; + table[Char.Slash] = OperatorType.FloorDivide; + table[Char.Less] = OperatorType.LeftShift; + table[Char.Greater] = OperatorType.RightShift; + return table; +})(); + +const _repeatedCharEqualOperatorTypeTable: Int16Array = (() => { + const table = new Int16Array(128); + table.fill(_unsetSingleCharOperatorType); + table[Char.Asterisk] = OperatorType.PowerEqual; + table[Char.Slash] = OperatorType.FloorDivideEqual; + table[Char.Less] = OperatorType.LeftShiftEqual; + table[Char.Greater] = OperatorType.RightShiftEqual; + return table; +})(); + const _byteOrderMarker = 0xfeff; const defaultTabSize = 8; -const magicsRegEx = /\\\s*$/; -// The character class for type: ignore rule codes includes ':' so that -// tool-namespaced codes such as "ty:unresolved-reference" are accepted. -// pyright: ignore uses the original class since tool-namespaced codes -// are not expected there. -const typeIgnoreCommentRegEx = /((^|#)\s*)type:\s*ignore(\s*\[([\s\w:,-]*)\]|\s|$)/; -const pyrightIgnoreCommentRegEx = /((^|#)\s*)pyright:\s*ignore(\s*\[([\s\w-,]*)\]|\s|$)/; -const underscoreRegEx = /_/g; + +// Fast-reject table: only these ASCII chars can begin a string literal +// (quote chars or valid string prefix chars f/r/b/u/t and their uppercase). 
+// Checking this table first avoids calling _getStringPrefixLength() for the +// vast majority of tokens (identifiers, numbers, operators, etc.). +const _canStartString: boolean[] = (() => { + const table = new Array(128).fill(false); + table[Char.SingleQuote] = true; + table[Char.DoubleQuote] = true; + for (const ch of [Char.f, Char.F, Char.r, Char.R, Char.b, Char.B, Char.u, Char.U, Char.t, Char.T]) { + table[ch] = true; + } + return table; +})(); + +// ASCII identifier-continue table. Indexed by char code < 128; true if the +// char can appear inside an identifier (letter, digit, underscore). +// Building this at module load by querying isIdentifierChar lets the tight +// identifier-swallow loop avoid function-call overhead entirely on the common +// ASCII path. Non-ASCII chars fall back to the generic path. +const _asciiIdentifierContinue: boolean[] = (() => { + const table = new Array(128).fill(false); + for (let i = 0; i < 128; i++) { + if (isIdentifierChar(i)) { + table[i] = true; + } + } + return table; +})(); + +const _asciiIdentifierStart: boolean[] = (() => { + const table = new Array(128).fill(false); + for (let i = 0; i < 128; i++) { + if (isIdentifierStartChar(i)) { + table[i] = true; + } + } + return table; +})(); + +// Create a detached copy of a source text range without going through Buffer. +// Each charAt() for ASCII returns a V8-cached single-char string that does not +// reference the parent. The concatenation chain becomes a ConsString independent +// of the source text, avoiding V8 SlicedString memory pinning. +// ~4-9x faster than Buffer.from(str,'utf8').toString('utf8') for typical +// Python identifier lengths (5-20 chars). +function detachSubstring(text: string, start: number, end: number): string { + let result = ''; + for (let i = start; i < end; i++) { + result += text.charAt(i); + } + return result; +} + +// Strip underscore characters from a source text range without first creating +// an intermediate substring. 
+function removeUnderscoresFromRange(text: string, start: number, end: number): string { + let firstUnderscoreIndex = -1; + for (let i = start; i < end; i++) { + if (text.charCodeAt(i) === Char.Underscore) { + firstUnderscoreIndex = i; + break; + } + } + + if (firstUnderscoreIndex < 0) { + return text.slice(start, end); + } + + let result = text.slice(start, firstUnderscoreIndex); + for (let i = firstUnderscoreIndex + 1; i < end; i++) { + if (text.charCodeAt(i) !== Char.Underscore) { + result += text[i]; + } + } + return result; +} + +// Manual replacement for magicsRegEx = /\\\s*$/ +// Check if a range [start, end) within `text` ends with a backslash followed +// by optional whitespace. +function endsWithBackslashContinuation(text: string, start: number, end: number): boolean { + let i = end - 1; + // Skip trailing whitespace + while (i >= start) { + const ch = text.charCodeAt(i); + if (ch === Char.Space || ch === Char.Tab || ch === Char.FormFeed) { + i--; + } else { + break; + } + } + return i >= start && text.charCodeAt(i) === Char.Backslash; +} + +// Result structure matching the shape previously extracted from regex match groups. +interface IgnoreDirectiveMatch { + fullMatch: string; // group 0: full matched text + prefix: string; // group 1: prefix before directive keyword + bracketContent?: string; // group 4: content inside [...] if present + index: number; // match position within the input string +} + +// Parses a bracketed rule list starting at `pos` (which must point at '['). +// Returns the bracket content (without brackets) and the position just past ']', +// or undefined if the bracket is malformed (e.g. unclosed, or contains invalid chars +// before a closing bracket is found). 
+function parseIgnoreBracketContent( + text: string, + pos: number, + rangeEnd: number, + allowColon: boolean +): { content: string; newPos: number } | undefined { + pos++; // skip '[' + const bracketStart = pos; + while (pos < rangeEnd && text.charCodeAt(pos) !== Char.CloseBracket) { + // Only allow valid bracket content chars: \s, \w, -, , + // (plus ':' for type: ignore to support tool-namespaced codes) + const bc = text.charCodeAt(pos); + if ( + (bc >= Char.a && bc <= Char.z) || + (bc >= Char.A && bc <= Char.Z) || + (bc >= Char._0 && bc <= Char._9) || + bc === Char.Underscore || + bc === Char.Hyphen || + bc === Char.Comma || + bc === Char.Space || + bc === Char.Tab || + (allowColon && bc === Char.Colon) + ) { + pos++; + } else { + break; + } + } + if (pos < rangeEnd && text.charCodeAt(pos) === Char.CloseBracket) { + return { content: text.slice(bracketStart, pos), newPos: pos + 1 }; + } + return undefined; +} + +// Manual replacement for typeIgnoreCommentRegEx / pyrightIgnoreCommentRegEx. +// Scans `text` within [rangeStart, rangeEnd) for `: ignore [rules]` +// where directive is 'type' or 'pyright'. +// Returns a match object or undefined. Returned `index` is absolute within `text`. +function matchIgnoreDirective( + text: string, + rangeStart: number, + rangeEnd: number, + directive: string +): IgnoreDirectiveMatch | undefined { + // The directive can be preceded by optional `#` and whitespace, or + // appear at the start of the range with optional whitespace. + // type: ignore allows tool-namespaced codes (e.g. "ty:rule-name") in brackets; + // pyright: ignore does not. + const allowColonInBracket = directive === 'type'; + let searchFrom = rangeStart; + + while (searchFrom < rangeEnd) { + // Find the next occurrence of the directive keyword, bounded by + // rangeEnd. 
A bounded hand-rolled scan is important here: native + // String.prototype.indexOf has no end bound and, when the keyword is + // absent from the current comment but present elsewhere in the file, + // can scan well past rangeEnd — producing O(n) behavior per comment + // and O(n^2) overall on comment-heavy files. + const firstCharCode = directive.charCodeAt(0); + let directiveIdx = -1; + const scanLimit = rangeEnd - directive.length; + for (let i = searchFrom; i <= scanLimit; i++) { + if (text.charCodeAt(i) === firstCharCode) { + let found = true; + for (let d = 1; d < directive.length; d++) { + if (text.charCodeAt(i + d) !== directive.charCodeAt(d)) { + found = false; + break; + } + } + if (found) { + directiveIdx = i; + break; + } + } + } + if (directiveIdx < 0) { + return undefined; + } + + // Determine the prefix: scan backward from directiveIdx to find + // the `#` or start-of-range, collecting whitespace. + let prefixStart = directiveIdx; + let foundAnchor = false; + + // Walk backward over spaces/tabs + let j = directiveIdx - 1; + while (j >= rangeStart && (text.charCodeAt(j) === Char.Space || text.charCodeAt(j) === Char.Tab)) { + j--; + } + + if (j < rangeStart) { + // At start of range + prefixStart = rangeStart; + foundAnchor = true; + } else if (text.charCodeAt(j) === Char.Hash) { + prefixStart = j; + foundAnchor = true; + } + + if (!foundAnchor) { + searchFrom = directiveIdx + 1; + continue; + } + + // After directive keyword, expect ':' + let pos = directiveIdx + directive.length; + if (pos >= rangeEnd || text.charCodeAt(pos) !== Char.Colon) { + searchFrom = directiveIdx + 1; + continue; + } + pos++; // skip ':' + + // Skip optional whitespace after ':' + while (pos < rangeEnd && (text.charCodeAt(pos) === Char.Space || text.charCodeAt(pos) === Char.Tab)) { + pos++; + } + + // Expect 'ignore' + const ignoreStr = 'ignore'; + if (pos + ignoreStr.length > rangeEnd) { + searchFrom = directiveIdx + 1; + continue; + } + + let matched = true; + for (let k = 
0; k < ignoreStr.length; k++) { + if (text.charCodeAt(pos + k) !== ignoreStr.charCodeAt(k)) { + matched = false; + break; + } + } + if (!matched) { + searchFrom = directiveIdx + 1; + continue; + } + pos += ignoreStr.length; + + // After 'ignore', expect whitespace, '[', or end-of-range + let bracketContent: string | undefined; + + if (pos >= rangeEnd) { + // End of range — valid + } else { + const ch = text.charCodeAt(pos); + if (ch === Char.Space || ch === Char.Tab) { + // Skip whitespace to check for optional bracket + while (pos < rangeEnd && (text.charCodeAt(pos) === Char.Space || text.charCodeAt(pos) === Char.Tab)) { + pos++; + } + if (pos < rangeEnd && text.charCodeAt(pos) === Char.OpenBracket) { + const parsed = parseIgnoreBracketContent(text, pos, rangeEnd, allowColonInBracket); + if (parsed === undefined) { + searchFrom = directiveIdx + 1; + continue; + } + bracketContent = parsed.content; + pos = parsed.newPos; + } + } else if (ch === Char.OpenBracket) { + // Bracket immediately after 'ignore' + const parsed = parseIgnoreBracketContent(text, pos, rangeEnd, allowColonInBracket); + if (parsed === undefined) { + searchFrom = directiveIdx + 1; + continue; + } + bracketContent = parsed.content; + pos = parsed.newPos; + } else { + // No space, no bracket — not a valid match + searchFrom = directiveIdx + 1; + continue; + } + } + + const prefix = text.slice(prefixStart, directiveIdx); + const fullMatch = text.slice(prefixStart, pos); + + return { + fullMatch, + prefix, + bracketContent, + index: prefixStart, + }; + } + + return undefined; +} export interface TokenizerOutput { // List of all tokens. @@ -228,6 +648,10 @@ export class Tokenizer { private _lineRanges: TextRange[] = []; private _indentAmounts: IndentInfo[] = []; private _typeIgnoreAll: IgnoreComment | undefined; + // Cached answer to "are there any non-trivial tokens yet?" 
Once true it + // stays true, so the O(n) scan in _handleComment only runs while the token + // stream consists purely of NewLine / Indent tokens. + private _hasTokenBeforeIgnoreAll = false; private _typeIgnoreLines = new Map(); private _pyrightIgnoreLines = new Map(); private _comments: Comment[] | undefined; @@ -259,10 +683,15 @@ export class Tokenizer { // Assume Jupyter notebook tokenization rules? private _useNotebookMode = false; - // Intern identifier strings within a single tokenization pass. This reduces - // per-identifier allocations while still ensuring we don't retain substrings - // that reference the original source text. - private readonly _identifierInternedStrings = new Map(); + // Direct-mapped identifier intern cache. Indexed by a cheap hash of + // (firstChar, lastChar, length). On a hit (slot defined and string + // equals the current source range), reuse the cached string instead of + // re-allocating via detachSubstring. Collisions simply overwrite the + // slot — no chaining, O(1) lookup, no Map overhead. Sized as a power of + // two so the mask is a single AND. + private static readonly _identifierCacheSize = 2048; + private static readonly _identifierCacheMask = Tokenizer._identifierCacheSize - 1; + private _identifierCache: Array = new Array(Tokenizer._identifierCacheSize); tokenize( text: string, @@ -293,7 +722,8 @@ export class Tokenizer { this._lineRanges = []; this._indentAmounts = []; this._useNotebookMode = useNotebookMode; - this._identifierInternedStrings.clear(); + // Clear per-source identifier intern cache. + this._identifierCache.fill(undefined); const end = start + length; @@ -459,21 +889,24 @@ export class Tokenizer { // tokens onto the token list. Returns true if the caller should advance // to the next character. 
private _handleCharacter(): boolean { - // f-strings, b-strings, etc - const stringPrefixLength = this._getStringPrefixLength(); - - if (stringPrefixLength >= 0) { - let stringPrefix = ''; - if (stringPrefixLength > 0) { - stringPrefix = this._cs.getText().slice(this._cs.position, this._cs.position + stringPrefixLength); - // Indeed a string - this._cs.advance(stringPrefixLength); - } + // f-strings, b-strings, etc — only check if current char can start a string + const currentChar = this._cs.currentChar; + if (currentChar < 128 && _canStartString[currentChar]) { + const stringPrefixLength = this._getStringPrefixLength(); + + if (stringPrefixLength >= 0) { + let stringPrefix = ''; + if (stringPrefixLength > 0) { + stringPrefix = this._cs.getText().slice(this._cs.position, this._cs.position + stringPrefixLength); + // Indeed a string + this._cs.advance(stringPrefixLength); + } - const quoteTypeFlags = this._getQuoteTypeFlags(stringPrefix); - if (quoteTypeFlags !== StringTokenFlags.None) { - this._handleString(quoteTypeFlags, stringPrefixLength); - return true; + const quoteTypeFlags = this._getQuoteTypeFlags(stringPrefix); + if (quoteTypeFlags !== StringTokenFlags.None) { + this._handleString(quoteTypeFlags, stringPrefixLength); + return true; + } } } @@ -890,51 +1323,104 @@ export class Tokenizer { } private _tryIdentifier(): boolean { - const swallowRemainingChars = () => { - while (true) { - if (isIdentifierChar(this._cs.currentChar)) { - this._cs.moveNext(); - } else if (isIdentifierChar(this._cs.currentChar, this._cs.nextChar)) { - this._cs.moveNext(); - this._cs.moveNext(); + const cs = this._cs; + const text = cs.getText(); + const textLen = text.length; + const start = cs.position; + + // Fast path for ASCII identifier start. Avoids the function call and + // surrogate logic for the common case (Python source is overwhelmingly + // ASCII identifiers). 
+ const firstChar = cs.currentChar; + let pos = start; + if (firstChar < 128) { + if (!_asciiIdentifierStart[firstChar]) { + // Not an identifier start and not a surrogate candidate. + return false; + } + pos++; + + // Tight loop: advance while we're still in ASCII identifier chars. + while (pos < textLen) { + const ch = text.charCodeAt(pos); + if (ch < 128 && _asciiIdentifierContinue[ch]) { + pos++; } else { break; } } - }; - const start = this._cs.position; - if (isIdentifierStartChar(this._cs.currentChar)) { - this._cs.moveNext(); - swallowRemainingChars(); - } else if (isIdentifierStartChar(this._cs.currentChar, this._cs.nextChar)) { - this._cs.moveNext(); - this._cs.moveNext(); - swallowRemainingChars(); + // If we hit a non-ASCII char, fall back to the generic loop to + // handle possible unicode identifier continue / surrogate pairs. + if (pos < textLen && text.charCodeAt(pos) >= 128) { + cs.advance(pos - start); + this._swallowNonAsciiIdentifierChars(); + pos = cs.position; + } else { + cs.advance(pos - start); + } + } else { + // Non-ASCII start: use the generic path (supports surrogates). + if (isIdentifierStartChar(firstChar)) { + cs.moveNext(); + } else if (isIdentifierStartChar(firstChar, cs.nextChar)) { + cs.moveNext(); + cs.moveNext(); + } else { + return false; + } + this._swallowNonAsciiIdentifierChars(); + pos = cs.position; } - if (this._cs.position > start) { - const value = this._cs.getText().slice(start, this._cs.position); - const keywordType = _keywords.get(value); + if (pos > start) { + const end = pos; + const length = end - start; + const keywordType = getKeywordTypeFromTextSlice(text, start, length); + if (keywordType !== undefined) { - this._tokens.push( - KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments()) - ); + this._tokens.push(KeywordToken.create(start, length, keywordType, this._getComments())); } else { - const internedValue = this._identifierInternedStrings.get(value) ?? 
this._internIdentifierString(value); - this._tokens.push( - IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments()) - ); + const value = this._internIdentifier(text, start, end, length); + this._tokens.push(IdentifierToken.create(start, length, value, this._getComments())); } return true; } return false; } - private _internIdentifierString(value: string) { - const clonedValue = cloneStr(value); - this._identifierInternedStrings.set(clonedValue, clonedValue); - return clonedValue; + // Per-tokenize identifier intern cache. Direct-mapped, so collisions + // simply replace the slot. Common identifiers (self, cls, True, None, + // str, int, dict, etc.) get deduplicated to a single string object, + // avoiding repeated detachSubstring allocations for the same name. + private _internIdentifier(text: string, start: number, end: number, length: number): string { + const firstChar = text.charCodeAt(start); + const lastChar = text.charCodeAt(end - 1); + // Hash mixes length, first and last char; multiplier values chosen + // to spread hits for common short identifiers across the table. + const hash = (firstChar * 31 + lastChar * 7 + length) & Tokenizer._identifierCacheMask; + const cached = this._identifierCache[hash]; + if (cached !== undefined && cached.length === length && text.startsWith(cached, start)) { + return cached; + } + const value = detachSubstring(text, start, end); + this._identifierCache[hash] = value; + return value; + } + + // Generic identifier-continue loop that handles unicode + surrogate pairs. + // Falls back to this when the fast ASCII loop encounters a non-ASCII char. 
+ private _swallowNonAsciiIdentifierChars(): void { + while (true) { + if (isIdentifierChar(this._cs.currentChar)) { + this._cs.moveNext(); + } else if (isIdentifierChar(this._cs.currentChar, this._cs.nextChar)) { + this._cs.moveNext(); + this._cs.moveNext(); + } else { + break; + } + } } private _isPossibleNumber(): boolean { @@ -990,8 +1476,9 @@ export class Tokenizer { } if (radix > 0) { - const text = this._cs.getText().slice(start, this._cs.position); - const simpleIntText = text.replace(underscoreRegEx, ''); + const end = this._cs.position; + const text = this._cs.getText(); + const simpleIntText = removeUnderscoresFromRange(text, start, end); let intValue: number | bigint = parseInt(simpleIntText.slice(leadingChars), radix); if (!isNaN(intValue)) { @@ -1005,7 +1492,7 @@ export class Tokenizer { } this._tokens.push( - NumberToken.create(start, text.length, intValue, true, false, this._getComments()) + NumberToken.create(start, end - start, intValue, true, false, this._getComments()) ); return true; } @@ -1043,12 +1530,14 @@ export class Tokenizer { } if (isDecimalInteger) { - let text = this._cs.getText().slice(start, this._cs.position); - const simpleIntText = text.replace(underscoreRegEx, ''); + const textEnd = this._cs.position; + const sourceText = this._cs.getText(); + const simpleIntText = removeUnderscoresFromRange(sourceText, start, textEnd); let intValue: number | bigint = parseInt(simpleIntText, 10); if (!isNaN(intValue)) { let isImaginary = false; + let tokenLength = textEnd - start; const bigIntValue = BigInt(simpleIntText); if ( @@ -1061,12 +1550,12 @@ export class Tokenizer { if (this._cs.currentChar === Char.j || this._cs.currentChar === Char.J) { isImaginary = true; - text += String.fromCharCode(this._cs.currentChar); this._cs.moveNext(); + tokenLength += 1; } this._tokens.push( - NumberToken.create(start, text.length, intValue, true, isImaginary, this._getComments()) + NumberToken.create(start, tokenLength, intValue, true, isImaginary, 
this._getComments()) ); return true; } @@ -1079,24 +1568,19 @@ export class Tokenizer { (this._cs.currentChar === Char.Period && this._cs.nextChar >= Char._0 && this._cs.nextChar <= Char._9) ) { if (this._skipFloatingPointCandidate()) { - let text = this._cs.getText().slice(start, this._cs.position); - const value = parseFloat(text); + const floatEnd = this._cs.position; + const floatText = removeUnderscoresFromRange(this._cs.getText(), start, floatEnd); + const value = parseFloat(floatText); if (!isNaN(value)) { let isImaginary = false; + let tokenLength = floatEnd - start; if (this._cs.currentChar === Char.j || this._cs.currentChar === Char.J) { isImaginary = true; - text += String.fromCharCode(this._cs.currentChar); this._cs.moveNext(); + tokenLength += 1; } this._tokens.push( - NumberToken.create( - start, - this._cs.position - start, - value, - false, - isImaginary, - this._getComments() - ) + NumberToken.create(start, tokenLength, value, false, isImaginary, this._getComments()) ); return true; } @@ -1108,139 +1592,76 @@ export class Tokenizer { } private _tryOperator(): boolean { + const currentChar = this._cs.currentChar; let length = 0; const nextChar = this._cs.nextChar; let operatorType: OperatorType; - switch (this._cs.currentChar) { - case Char.Plus: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.AddEqual : OperatorType.Add; - break; - - case Char.Ampersand: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.BitwiseAndEqual : OperatorType.BitwiseAnd; - break; - - case Char.Bar: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.BitwiseOrEqual : OperatorType.BitwiseOr; - break; - - case Char.Caret: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? 
OperatorType.BitwiseXorEqual : OperatorType.BitwiseXor; - break; - - case Char.Equal: - if ( - this._activeFString?.activeReplacementField && - this._activeFString?.activeReplacementField.parenDepth === this._parenDepth && - !this._activeFString.activeReplacementField.inFormatSpecifier && - nextChar !== Char.Equal - ) { - length = 1; - operatorType = OperatorType.Assign; - break; - } - - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.Equals : OperatorType.Assign; - break; - - case Char.ExclamationMark: - if (nextChar !== Char.Equal) { - if (this._activeFString) { - // Handle the conversion separator (!) within an f-string. - this._tokens.push( - Token.create(TokenType.ExclamationMark, this._cs.position, 1, this._getComments()) - ); - this._cs.advance(1); - return true; - } - - return false; - } - length = 2; - operatorType = OperatorType.NotEquals; - break; - - case Char.Percent: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? 
OperatorType.ModEqual : OperatorType.Mod; - break; + if (currentChar < 128 && nextChar < 128) { + const twoCharKey = (currentChar << 8) | nextChar; + const specialTokenType = _twoCharSpecialTokenTypeMap.get(twoCharKey); + if (specialTokenType !== undefined) { + this._tokens.push(Token.create(specialTokenType, this._cs.position, 2, this._getComments())); + this._cs.advance(2); + return true; + } - case Char.Tilde: - length = 1; - operatorType = OperatorType.BitwiseInvert; - break; + const twoCharOperatorType = _twoCharOperatorTypeMap.get(twoCharKey); + if (twoCharOperatorType !== undefined) { + this._tokens.push(OperatorToken.create(this._cs.position, 2, twoCharOperatorType, this._getComments())); + this._cs.advance(2); + return true; + } - case Char.Hyphen: - if (nextChar === Char.Greater) { - this._tokens.push(Token.create(TokenType.Arrow, this._cs.position, 2, this._getComments())); - this._cs.advance(2); + if (currentChar === nextChar) { + const repeatedOperatorType = _repeatedCharOperatorTypeTable[currentChar]; + if (repeatedOperatorType !== _unsetSingleCharOperatorType) { + const hasTrailingEqual = this._cs.lookAhead(2) === Char.Equal; + const repeatedLength = hasTrailingEqual ? 3 : 2; + const operatorType = hasTrailingEqual + ? _repeatedCharEqualOperatorTypeTable[currentChar] + : repeatedOperatorType; + this._tokens.push( + OperatorToken.create( + this._cs.position, + repeatedLength, + operatorType as OperatorType, + this._getComments() + ) + ); + this._cs.advance(repeatedLength); return true; } + } + } - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.SubtractEqual : OperatorType.Subtract; - break; - - case Char.Asterisk: - if (nextChar === Char.Asterisk) { - length = this._cs.lookAhead(2) === Char.Equal ? 3 : 2; - operatorType = length === 3 ? OperatorType.PowerEqual : OperatorType.Power; - } else { - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? 
OperatorType.MultiplyEqual : OperatorType.Multiply; - } - break; - - case Char.Slash: - if (nextChar === Char.Slash) { - length = this._cs.lookAhead(2) === Char.Equal ? 3 : 2; - operatorType = length === 3 ? OperatorType.FloorDivideEqual : OperatorType.FloorDivide; - } else { - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.DivideEqual : OperatorType.Divide; - } - break; - - case Char.Less: - if (nextChar === Char.Less) { - length = this._cs.lookAhead(2) === Char.Equal ? 3 : 2; - operatorType = length === 3 ? OperatorType.LeftShiftEqual : OperatorType.LeftShift; - } else if (nextChar === Char.Greater) { + if (currentChar < 128) { + const singleCharOperatorType = _singleCharOperatorTypeTable[currentChar]; + if (singleCharOperatorType !== _unsetSingleCharOperatorType) { + const equalOperatorType = _singleCharEqualOperatorTypeTable[currentChar]; + if (nextChar === Char.Equal && equalOperatorType !== _unsetSingleCharOperatorType) { length = 2; - operatorType = OperatorType.LessOrGreaterThan; + operatorType = equalOperatorType as OperatorType; } else { - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.LessThanOrEqual : OperatorType.LessThan; - } - break; - - case Char.Greater: - if (nextChar === Char.Greater) { - length = this._cs.lookAhead(2) === Char.Equal ? 3 : 2; - operatorType = length === 3 ? OperatorType.RightShiftEqual : OperatorType.RightShift; - } else { - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? OperatorType.GreaterThanOrEqual : OperatorType.GreaterThan; + length = 1; + operatorType = singleCharOperatorType as OperatorType; } - break; - case Char.At: - length = nextChar === Char.Equal ? 2 : 1; - operatorType = length === 2 ? 
OperatorType.MatrixMultiplyEqual : OperatorType.MatrixMultiply; - break; + this._tokens.push(OperatorToken.create(this._cs.position, length, operatorType, this._getComments())); + this._cs.advance(length); + return true; + } + } - default: - return false; + // `!=` is handled by the 2-char fast path above. + if (currentChar === Char.ExclamationMark && this._activeFString) { + // Handle the conversion separator (!) within an f-string. + this._tokens.push(Token.create(TokenType.ExclamationMark, this._cs.position, 1, this._getComments())); + this._cs.advance(1); + return true; } - this._tokens.push(OperatorToken.create(this._cs.position, length, operatorType, this._getComments())); - this._cs.advance(length); - return length > 0; + + return false; } private _handleInvalid(): boolean { @@ -1298,19 +1719,17 @@ export class Tokenizer { private _handleIPythonMagics(type: CommentType): void { const start = this._cs.position + 1; + const sourceText = this._cs.getText(); let begin = start; while (true) { this._cs.skipToEol(); if (type === CommentType.IPythonMagic || type === CommentType.IPythonShellEscape) { - const length = this._cs.position - begin; - const value = this._cs.getText().slice(begin, begin + length); - // is it multiline magics? 
// %magic command \ // next arguments - if (!value.match(magicsRegEx)) { + if (!endsWithBackslashContinuation(sourceText, begin, this._cs.position)) { break; } } @@ -1324,7 +1743,7 @@ export class Tokenizer { } const length = this._cs.position - start; - const comment = Comment.create(start, length, this._cs.getText().slice(start, start + length), type); + const comment = Comment.create(start, length, sourceText.slice(start, start + length), type); this._addComments(comment); } @@ -1333,53 +1752,74 @@ export class Tokenizer { this._cs.skipToEol(); const length = this._cs.position - start; - const comment = Comment.create(start, length, this._cs.getText().slice(start, start + length)); - - const typeIgnoreRegexMatch = comment.value.match(typeIgnoreCommentRegEx); - if (typeIgnoreRegexMatch) { - const commentStart = start + (typeIgnoreRegexMatch.index ?? 0); - const textRange: TextRange = { - start: commentStart + typeIgnoreRegexMatch[1].length, - length: typeIgnoreRegexMatch[0].length - typeIgnoreRegexMatch[1].length, - }; - const ignoreComment: IgnoreComment = { - range: textRange, - rulesList: this._getIgnoreCommentRulesList(commentStart, typeIgnoreRegexMatch), - }; + const sourceText = this._cs.getText(); + const end = start + length; - if (this._tokens.findIndex((t) => t.type !== TokenType.NewLine && t && t.type !== TokenType.Indent) < 0) { - this._typeIgnoreAll = ignoreComment; - } else { - this._typeIgnoreLines.set(this._lineRanges.length, ignoreComment); + // Fast pre-filter: any ignore directive must contain the substring 'ignore'. + // indexOf is a highly-optimized native call and lets us skip the full + // directive scan for the vast majority of comments (which are free-form text). 
+ const ignoreIdx = sourceText.indexOf('ignore', start); + if (ignoreIdx >= 0 && ignoreIdx < end) { + const typeIgnoreMatch = matchIgnoreDirective(sourceText, start, end, 'type'); + if (typeIgnoreMatch) { + const commentStart = typeIgnoreMatch.index; + const textRange: TextRange = { + start: commentStart + typeIgnoreMatch.prefix.length, + length: typeIgnoreMatch.fullMatch.length - typeIgnoreMatch.prefix.length, + }; + const ignoreComment: IgnoreComment = { + range: textRange, + rulesList: this._getIgnoreCommentRulesList(commentStart, typeIgnoreMatch), + }; + + let isIgnoreAll = false; + if (!this._hasTokenBeforeIgnoreAll) { + // Are there any tokens other than NewLine / Indent yet? + const hasOther = this._tokens.some( + (t) => t && t.type !== TokenType.NewLine && t.type !== TokenType.Indent + ); + if (hasOther) { + this._hasTokenBeforeIgnoreAll = true; + } else { + isIgnoreAll = true; + } + } + + if (isIgnoreAll) { + this._typeIgnoreAll = ignoreComment; + } else { + this._typeIgnoreLines.set(this._lineRanges.length, ignoreComment); + } } - } - const pyrightIgnoreRegexMatch = comment.value.match(pyrightIgnoreCommentRegEx); - if (pyrightIgnoreRegexMatch) { - const commentStart = start + (pyrightIgnoreRegexMatch.index ?? 
0); - const textRange: TextRange = { - start: commentStart + pyrightIgnoreRegexMatch[1].length, - length: pyrightIgnoreRegexMatch[0].length - pyrightIgnoreRegexMatch[1].length, - }; - const ignoreComment: IgnoreComment = { - range: textRange, - rulesList: this._getIgnoreCommentRulesList(commentStart, pyrightIgnoreRegexMatch), - }; - this._pyrightIgnoreLines.set(this._lineRanges.length, ignoreComment); + const pyrightIgnoreMatch = matchIgnoreDirective(sourceText, start, end, 'pyright'); + if (pyrightIgnoreMatch) { + const commentStart = pyrightIgnoreMatch.index; + const textRange: TextRange = { + start: commentStart + pyrightIgnoreMatch.prefix.length, + length: pyrightIgnoreMatch.fullMatch.length - pyrightIgnoreMatch.prefix.length, + }; + const ignoreComment: IgnoreComment = { + range: textRange, + rulesList: this._getIgnoreCommentRulesList(commentStart, pyrightIgnoreMatch), + }; + this._pyrightIgnoreLines.set(this._lineRanges.length, ignoreComment); + } } + const comment = Comment.create(start, length, sourceText.slice(start, end)); this._addComments(comment); } // Extracts the individual rules within a "type: ignore [x, y, z]" comment. 
- private _getIgnoreCommentRulesList(start: number, match: RegExpMatchArray): IgnoreCommentRule[] | undefined { - if (match.length < 5 || match[4] === undefined) { + private _getIgnoreCommentRulesList(start: number, match: IgnoreDirectiveMatch): IgnoreCommentRule[] | undefined { + if (match.bracketContent === undefined) { return undefined; } - const splitElements = match[4].split(','); + const splitElements = match.bracketContent.split(','); const commentRules: IgnoreCommentRule[] = []; - let currentOffset = start + match[0].indexOf('[') + 1; + let currentOffset = start + match.fullMatch.indexOf('[') + 1; for (const element of splitElements) { const frontTrimmed = element.trimStart(); diff --git a/packages/pyright-internal/src/parser/tokenizerTypes.ts b/packages/pyright-internal/src/parser/tokenizerTypes.ts index 19dcae595d4e..8fa2ec988515 100644 --- a/packages/pyright-internal/src/parser/tokenizerTypes.ts +++ b/packages/pyright-internal/src/parser/tokenizerTypes.ts @@ -193,15 +193,8 @@ export interface Comment extends TextRange { } export namespace Comment { - export function create(start: number, length: number, value: string, type = CommentType.Regular) { - const comment: Comment = { - type, - start, - length, - value, - }; - - return comment; + export function create(start: number, length: number, value: string, type = CommentType.Regular): Comment { + return { type, start, length, value }; } } @@ -209,21 +202,23 @@ export interface TokenBase extends TextRange { readonly type: TokenType; // Comments prior to the token. + // Intentionally optional: most tokens have no comments, so omitting this + // property keeps V8 object size smaller for the common case. Each `create` + // factory returns a two-shape object (with vs. without `comments`) so that + // comment-free tokens skip the extra property slot entirely. 
readonly comments?: Comment[] | undefined; } export interface Token extends TokenBase {} export namespace Token { - export function create(type: TokenType, start: number, length: number, comments: Comment[] | undefined) { - const token: Token = { + export function create(type: TokenType, start: number, length: number, comments: Comment[] | undefined): Token { + return { start, length, type, comments, }; - - return token; } } @@ -240,17 +235,27 @@ export namespace IndentToken { indentAmount: number, isIndentAmbiguous: boolean, comments: Comment[] | undefined - ) { - const token: IndentToken = { + ): IndentToken { + // Two-shape pattern: omit `comments` slot when unused to reduce + // per-token allocation size. ~95% of tokens carry no comments. + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Indent, + isIndentAmbiguous, + comments, + indentAmount, + }; + } + + return { start, length, type: TokenType.Indent, isIndentAmbiguous, - comments, indentAmount, }; - - return token; } } @@ -269,18 +274,27 @@ export namespace DedentToken { matchesIndent: boolean, isDedentAmbiguous: boolean, comments: Comment[] | undefined - ) { - const token: DedentToken = { + ): DedentToken { + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Dedent, + comments, + indentAmount, + matchesIndent, + isDedentAmbiguous, + }; + } + + return { start, length, type: TokenType.Dedent, - comments, indentAmount, matchesIndent, isDedentAmbiguous, }; - - return token; } } @@ -290,16 +304,28 @@ export interface NewLineToken extends Token { } export namespace NewLineToken { - export function create(start: number, length: number, newLineType: NewLineType, comments: Comment[] | undefined) { - const token: NewLineToken = { + export function create( + start: number, + length: number, + newLineType: NewLineType, + comments: Comment[] | undefined + ): NewLineToken { + if (comments !== undefined) { + return { + start, + length, + type: TokenType.NewLine, + 
comments, + newLineType, + }; + } + + return { start, length, type: TokenType.NewLine, - comments, newLineType, }; - - return token; } } @@ -309,16 +335,28 @@ export interface KeywordToken extends Token { } export namespace KeywordToken { - export function create(start: number, length: number, keywordType: KeywordType, comments: Comment[] | undefined) { - const token: KeywordToken = { + export function create( + start: number, + length: number, + keywordType: KeywordType, + comments: Comment[] | undefined + ): KeywordToken { + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Keyword, + comments, + keywordType, + }; + } + + return { start, length, type: TokenType.Keyword, - comments, keywordType, }; - - return token; } export function isSoftKeyword(token: KeywordToken) { @@ -350,19 +388,30 @@ export namespace StringToken { escapedValue: string, prefixLength: number, comments: Comment[] | undefined - ) { - const token: StringToken = { + ): StringToken { + const quoteMarkLength = flags & StringTokenFlags.Triplicate ? 3 : 1; + if (comments !== undefined) { + return { + start, + length, + type: TokenType.String, + flags, + escapedValue, + prefixLength, + quoteMarkLength, + comments, + }; + } + + return { start, length, type: TokenType.String, flags, escapedValue, prefixLength, - quoteMarkLength: flags & StringTokenFlags.Triplicate ? 3 : 1, - comments, + quoteMarkLength, }; - - return token; } } @@ -386,18 +435,28 @@ export namespace FStringStartToken { flags: StringTokenFlags, prefixLength: number, comments: Comment[] | undefined - ) { - const token: FStringStartToken = { + ): FStringStartToken { + const quoteMarkLength = flags & StringTokenFlags.Triplicate ? 
3 : 1; + if (comments !== undefined) { + return { + start, + length, + type: TokenType.FStringStart, + flags, + prefixLength, + quoteMarkLength, + comments, + }; + } + + return { start, length, type: TokenType.FStringStart, flags, prefixLength, - quoteMarkLength: flags & StringTokenFlags.Triplicate ? 3 : 1, - comments, + quoteMarkLength, }; - - return token; } } @@ -456,18 +515,27 @@ export namespace NumberToken { isInteger: boolean, isImaginary: boolean, comments: Comment[] | undefined - ) { - const token: NumberToken = { + ): NumberToken { + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Number, + isInteger, + isImaginary, + value, + comments, + }; + } + + return { start, length, type: TokenType.Number, isInteger, isImaginary, value, - comments, }; - - return token; } } @@ -477,16 +545,28 @@ export interface OperatorToken extends Token { } export namespace OperatorToken { - export function create(start: number, length: number, operatorType: OperatorType, comments: Comment[] | undefined) { - const token: OperatorToken = { + export function create( + start: number, + length: number, + operatorType: OperatorType, + comments: Comment[] | undefined + ): OperatorToken { + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Operator, + operatorType, + comments, + }; + } + + return { start, length, type: TokenType.Operator, operatorType, - comments, }; - - return token; } } @@ -496,18 +576,36 @@ export interface IdentifierToken extends Token { } export namespace IdentifierToken { - export function create(start: number, length: number, value: string, comments: Comment[] | undefined) { + export function create( + start: number, + length: number, + value: string, + comments: Comment[] | undefined + ): IdentifierToken { // Perform "NFKC normalization", as per the Python lexical spec. 
- const normalizedValue = value.normalize('NFKC'); - - const token: IdentifierToken = { + let normalizedValue = value; + for (let i = 0; i < value.length; i++) { + if (value.charCodeAt(i) > 0x7f) { + normalizedValue = value.normalize('NFKC'); + break; + } + } + + if (comments !== undefined) { + return { + start, + length, + type: TokenType.Identifier, + value: normalizedValue, + comments, + }; + } + + return { start, length, type: TokenType.Identifier, value: normalizedValue, - comments, }; - - return token; } } diff --git a/packages/pyright-internal/src/tests/benchmarkData/comment_heavy.py b/packages/pyright-internal/src/tests/benchmarkData/comment_heavy.py new file mode 100644 index 000000000000..a855a3f33dcb --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/comment_heavy.py @@ -0,0 +1,284 @@ +# comment_heavy.py — many type: ignore / pyright: ignore / noqa comments +# Stresses the tokenizer's comment directive scanning paths. + +from typing import Any, Dict, List, Optional, Tuple, Union + +# --- type: ignore variants --- + +x1: int = "not_int" # type: ignore +x2: str = 42 # type: ignore +x3: float = "nope" # type: ignore +x4: bool = 123 # type: ignore +x5: bytes = 456 # type: ignore + +x6: int = "a" # type: ignore[assignment] +x7: str = 1 # type: ignore[assignment] +x8: float = True # type: ignore[assignment] +x9: bool = None # type: ignore[assignment] +x10: bytes = [] # type: ignore[assignment] + +x11 = undefined_name # type: ignore[name-defined] +x12 = another_undefined # type: ignore[name-defined] +x13 = yet_another # type: ignore[name-defined] + +# --- pyright: ignore variants --- + +y1: int = "not_int" # pyright: ignore +y2: str = 42 # pyright: ignore +y3: float = "nope" # pyright: ignore +y4: bool = 123 # pyright: ignore +y5: bytes = 456 # pyright: ignore + +y6: int = "a" # pyright: ignore[reportAssignmentType] +y7: str = 1 # pyright: ignore[reportAssignmentType] +y8: float = True # pyright: ignore[reportAssignmentType] +y9: bool = None 
# pyright: ignore[reportAssignmentType] +y10: bytes = [] # pyright: ignore[reportAssignmentType] + +y11: int = "str" # pyright: ignore[reportAssignmentType, reportGeneralClassIssues] +y12: str = 42 # pyright: ignore[reportAssignmentType, reportGeneralClassIssues] + +# --- noqa comments --- + +import os # noqa: F401 +import sys # noqa: F401 +import re # noqa +import json # noqa: E302 +import csv # noqa: F401, E302 +import io # noqa + +# --- Mixed comments --- + +z1: int = "str" # type: ignore # noqa: F841 +z2: str = 42 # type: ignore[assignment] # noqa +z3 = undefined # type: ignore[name-defined] # noqa: F821 +z4: int = "nope" # pyright: ignore # noqa: F841 +z5: int = "nope" # pyright: ignore[reportAssignmentType] # noqa + +# --- Regular comments (should be fast-rejected by directive scanner) --- + +# This is a regular comment +# Another regular comment +# Yet another regular comment that is quite long and spans many characters to stress the scanner +# Regular comment with some keywords: def class import return if else +# Regular comment mentioning ignore but not as a directive +# A comment that says "type" but is not a type: ignore +# type: This looks similar but is not a valid directive +# pyright: This also looks similar but is not valid + +# --- Doc comments (hash-prefixed) --- + +# Module: comment_heavy +# Purpose: Stress test comment directive scanning +# Author: Benchmark generator +# Date: 2024-01-01 +# Version: 1.0.0 + +# --- Function with many ignored lines --- + + +def poorly_typed_function( + a, # type: ignore + b, # type: ignore + c, # type: ignore + d, # type: ignore + e, # type: ignore +) -> None: # type: ignore + result = a + b # type: ignore + result2 = c * d # type: ignore + result3 = e ** 2 # type: ignore + final = result + result2 + result3 # type: ignore + return final # type: ignore + + +def another_poorly_typed(x, y, z): # type: ignore + # type: ignore on every line + a: int = x # type: ignore + b: str = y # type: ignore + c: float = z # type: 
ignore + d: bool = a + b # type: ignore + e: bytes = c + d # type: ignore + f: list = e * 2 # type: ignore + g: dict = f + 1 # type: ignore + h: tuple = g - 1 # type: ignore + i: set = h / 2 # type: ignore + j: int = i + j # type: ignore # noqa: F821 + return (a, b, c, d, e, f, g, h, i, j) # type: ignore + + +# --- Class with pyright: ignore --- + + +class IgnoredClass: + x: int = "not_int" # pyright: ignore[reportAssignmentType] + y: str = 42 # pyright: ignore[reportAssignmentType] + + def __init__(self) -> None: + self.a: int = "str" # pyright: ignore[reportAssignmentType] + self.b: str = 42 # pyright: ignore[reportAssignmentType] + self.c: float = "3.14" # pyright: ignore[reportAssignmentType] + self.d: bool = "True" # pyright: ignore[reportAssignmentType] + + def method1(self) -> int: # type: ignore + return "not_int" # type: ignore + + def method2(self) -> str: # type: ignore + return 42 # type: ignore + + def method3(self) -> float: # type: ignore + return True # type: ignore + + def method4(self) -> bool: # type: ignore + return 3.14 # type: ignore + + def method5(self) -> bytes: # type: ignore + return "string" # type: ignore + + def method6(self) -> list: # type: ignore + return 123 # type: ignore + + def method7(self) -> dict: # type: ignore + return [1, 2, 3] # type: ignore + + def method8(self) -> tuple: # type: ignore + return {1: 2} # type: ignore + + def method9(self) -> set: # type: ignore + return (1, 2, 3) # type: ignore + + def method10(self) -> None: # type: ignore + pass # type: ignore + + +# --- Bulk ignore blocks (100 lines) --- + + +def bulk_ignores_1(): + v1 = undefined_1 # type: ignore[name-defined] + v2 = undefined_2 # type: ignore[name-defined] + v3 = undefined_3 # type: ignore[name-defined] + v4 = undefined_4 # type: ignore[name-defined] + v5 = undefined_5 # type: ignore[name-defined] + v6 = undefined_6 # type: ignore[name-defined] + v7 = undefined_7 # type: ignore[name-defined] + v8 = undefined_8 # type: ignore[name-defined] + v9 = 
undefined_9 # type: ignore[name-defined] + v10 = undefined_10 # type: ignore[name-defined] + v11 = undefined_11 # pyright: ignore[reportUndefinedVariable] + v12 = undefined_12 # pyright: ignore[reportUndefinedVariable] + v13 = undefined_13 # pyright: ignore[reportUndefinedVariable] + v14 = undefined_14 # pyright: ignore[reportUndefinedVariable] + v15 = undefined_15 # pyright: ignore[reportUndefinedVariable] + v16 = undefined_16 # pyright: ignore[reportUndefinedVariable] + v17 = undefined_17 # pyright: ignore[reportUndefinedVariable] + v18 = undefined_18 # pyright: ignore[reportUndefinedVariable] + v19 = undefined_19 # pyright: ignore[reportUndefinedVariable] + v20 = undefined_20 # pyright: ignore[reportUndefinedVariable] + return None + + +def bulk_ignores_2(): + # 20 more lines with mixed directives + a1: int = "wrong" # type: ignore[assignment] + a2: str = 42 # type: ignore[assignment] + a3: float = True # type: ignore[assignment] + a4: bool = 3.14 # type: ignore[assignment] + a5: bytes = None # type: ignore[assignment] + a6: list = 42 # type: ignore[assignment] + a7: dict = "str" # type: ignore[assignment] + a8: tuple = False # type: ignore[assignment] + a9: set = 3.14 # type: ignore[assignment] + a10: int = None # type: ignore[assignment] + b1: int = "wrong" # pyright: ignore[reportAssignmentType] + b2: str = 42 # pyright: ignore[reportAssignmentType] + b3: float = True # pyright: ignore[reportAssignmentType] + b4: bool = 3.14 # pyright: ignore[reportAssignmentType] + b5: bytes = None # pyright: ignore[reportAssignmentType] + b6: list = 42 # pyright: ignore[reportAssignmentType] + b7: dict = "str" # pyright: ignore[reportAssignmentType] + b8: tuple = False # pyright: ignore[reportAssignmentType] + b9: set = 3.14 # pyright: ignore[reportAssignmentType] + b10: int = None # pyright: ignore[reportAssignmentType] + return None + + +# --- Lines with NO comments at all (to test non-comment fast path) --- + + +def clean_function_1(a: int, b: str, c: float) -> 
Tuple[int, str, float]: + x = a + 1 + y = b + " world" + z = c * 2.0 + return (x, y, z) + + +def clean_function_2(items: List[int]) -> Dict[str, int]: + result: Dict[str, int] = {} + total = 0 + for i, item in enumerate(items): + key = f"item_{i}" + result[key] = item + total += item + result["total"] = total + result["count"] = len(items) + result["average"] = total // max(len(items), 1) + return result + + +def clean_function_3( + data: Dict[str, Any], + keys: List[str], + default: Any = None, +) -> List[Any]: + return [data.get(k, default) for k in keys] + + +def clean_function_4(matrix: List[List[int]]) -> List[List[int]]: + if not matrix: + return [] + rows = len(matrix) + cols = len(matrix[0]) + transposed: List[List[int]] = [] + for j in range(cols): + row: List[int] = [] + for i in range(rows): + row.append(matrix[i][j]) + transposed.append(row) + return transposed + + +def clean_function_5(text: str, width: int = 80) -> List[str]: + words = text.split() + lines: List[str] = [] + current_line: List[str] = [] + current_length = 0 + for word in words: + if current_length + len(word) + len(current_line) > width: + lines.append(" ".join(current_line)) + current_line = [word] + current_length = len(word) + else: + current_line.append(word) + current_length += len(word) + if current_line: + lines.append(" ".join(current_line)) + return lines + + +# --- Inline type comments (old-style annotations) --- + + +def old_style_annotations(): + a = 42 # type: int + b = "hello" # type: str + c = 3.14 # type: float + d = True # type: bool + e = None # type: Optional[int] + f = [1, 2, 3] # type: List[int] + g = {"a": 1} # type: Dict[str, int] + h = (1, "a") # type: Tuple[int, str] + i = {1, 2, 3} # type: Set[int] + return (a, b, c, d, e, f, g, h, i) + + +# End of comment_heavy.py diff --git a/packages/pyright-internal/src/tests/benchmarkData/fstring_heavy.py b/packages/pyright-internal/src/tests/benchmarkData/fstring_heavy.py new file mode 100644 index 
000000000000..9eb9ce8f2bf0 --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/fstring_heavy.py @@ -0,0 +1,273 @@ +# fstring_heavy.py — deeply nested f-strings for tokenizer stress-testing +# Tests the f-string context stack handling and expression scanning. + +from typing import Any, Dict, List, Optional, Tuple + +# Simple f-strings +name = "world" +greeting = f"Hello, {name}!" +multi = f"{'hello'.upper()} {'world'.lower()}" + +# Nested f-strings (depth 2) +value = 42 +nested_1 = f"result: {f'inner {value}'}" +nested_2 = f"outer {f'middle {f'{value}'}'}" + +# F-strings with format specs +pi = 3.14159265358979 +formatted_float = f"{pi:.4f}" +formatted_int = f"{value:05d}" +formatted_hex = f"{value:#010x}" +formatted_bin = f"{value:08b}" +formatted_exp = f"{pi:.2e}" +formatted_percent = f"{0.756:.1%}" + +# F-strings with expressions +data = [1, 2, 3, 4, 5] +expr_1 = f"sum={sum(data)}, len={len(data)}, avg={sum(data)/len(data):.2f}" +expr_2 = f"max={max(data)}, min={min(data)}, range={max(data)-min(data)}" + +# F-strings with conditionals +status = "ok" +cond_1 = f"Status: {'PASS' if status == 'ok' else 'FAIL'}" +cond_2 = f"Value: {value if value > 0 else -value} ({'positive' if value > 0 else 'negative'})" + +# F-strings with dictionary access +config: Dict[str, Any] = {"host": "localhost", "port": 8080, "debug": True} +dict_1 = f"Server: {config['host']}:{config['port']}" +dict_2 = f"Debug mode: {'ON' if config['debug'] else 'OFF'}" + +# F-strings with list comprehensions +comp_1 = f"squares: {[x**2 for x in range(10)]}" +comp_2 = f"evens: {[x for x in range(20) if x % 2 == 0]}" + +# F-strings with method calls +text = "hello world" +method_1 = f"{text.title()!r}" +method_2 = f"{text.replace('world', 'python').upper()}" +method_3 = f"{', '.join(str(x) for x in range(5))}" + +# Multiline f-strings +multiline_1 = f""" +Name: {name} +Value: {value} +Status: {status} +Config: {config} +""" + +multiline_2 = f""" +{'='*50} +Report Summary +{'='*50} 
+Total items: {len(data)} +Sum: {sum(data)} +Average: {sum(data)/len(data):.2f} +{'='*50} +""" + +# F-strings with walrus operator +walrus_1 = f"{(n := 10)} doubled is {n * 2}" + +# Deeply nested f-strings (depth 3) +deep_1 = f"L1:{f'L2:{f'L3:{value}'}'}" +deep_2 = f"a{f'b{f'c{f'd'}'}'}" + +# F-strings with escape characters +escape_1 = f"path: {'C:\\\\Users\\\\test'}" +escape_2 = f"newline: {'line1\\nline2'}" +escape_3 = f"tab: {'col1\\tcol2'}" + +# F-string with complex expressions +import_fstr = f"{'import ' + 'os'}" +lambda_fstr = f"{(lambda x: x * 2)(21)}" + +# Batch of similar f-strings (simulating template usage) +items: List[Dict[str, Any]] = [ + {"name": f"item_{i}", "price": i * 10.5, "qty": i + 1} + for i in range(50) +] + + +def format_item(item: Dict[str, Any]) -> str: + return f" {item['name']:<20s} ${item['price']:>8.2f} x{item['qty']:>4d} = ${item['price'] * item['qty']:>10.2f}" + + +def format_table(items: List[Dict[str, Any]], title: str = "Inventory") -> str: + header = f"{'Name':<20s} {'Price':>8s} {'Qty':>4s} {'Total':>10s}" + separator = f"{'-'*20} {'-'*8} {'-'*4} {'-'*10}" + rows = "\n".join(format_item(item) for item in items) + total = sum(item["price"] * item["qty"] for item in items) + return f""" +{title} +{f'=' * len(title)} +{header} +{separator} +{rows} +{separator} +{'TOTAL':>34s} ${total:>10.2f} +""" + + +# F-strings in class definitions +class FormattedRecord: + def __init__(self, id: int, name: str, value: float) -> None: + self.id = id + self.name = name + self.value = value + + def __str__(self) -> str: + return f"Record(id={self.id}, name={self.name!r}, value={self.value:.4f})" + + def __repr__(self) -> str: + return f"FormattedRecord({self.id!r}, {self.name!r}, {self.value!r})" + + def to_csv(self) -> str: + return f"{self.id},{self.name},{self.value:.2f}" + + def to_json(self) -> str: + return f'{{"id": {self.id}, "name": "{self.name}", "value": {self.value}}}' + + def to_xml(self) -> str: + return 
f"{self.name}{self.value:.2f}" + + def summary(self, verbose: bool = False) -> str: + base = f"#{self.id}: {self.name} = {self.value:.2f}" + if verbose: + return f"{base} (type={type(self.value).__name__}, len_name={len(self.name)})" + return base + + +# F-strings with nested data structures +matrix: List[List[int]] = [[i * 10 + j for j in range(10)] for i in range(10)] + + +def format_matrix(m: List[List[int]]) -> str: + rows = "\n".join( + f" [{', '.join(f'{cell:3d}' for cell in row)}]" + for row in m + ) + return f"Matrix {len(m)}x{len(m[0]) if m else 0}:\n[\n{rows}\n]" + + +def format_tree( + node: Dict[str, Any], indent: int = 0, prefix: str = "" +) -> str: + name = node.get("name", "?") + children = node.get("children", []) + result = f"{' ' * indent}{prefix}{name}" + for i, child in enumerate(children): + is_last = i == len(children) - 1 + child_prefix = f"{'└── ' if is_last else '├── '}" + result += f"\n{format_tree(child, indent + 4, child_prefix)}" + return result + + +# Many small f-strings to stress token emission +def generate_report_lines(count: int) -> List[str]: + lines: List[str] = [] + for i in range(count): + lines.append(f"Line {i:04d}: value={i * 3.14:.2f}, hex={i:#06x}, bin={i:08b}") + return lines + + +def format_log_entry( + timestamp: str, + level: str, + module: str, + message: str, + extra: Optional[Dict[str, Any]] = None, +) -> str: + base = f"[{timestamp}] {level:>8s} {module:<30s} {message}" + if extra: + pairs = " ".join(f"{k}={v!r}" for k, v in extra.items()) + return f"{base} | {pairs}" + return base + + +# F-strings with type annotations in strings (for older Python compat) +future_annotations_example = { + "field1": f"{'Optional[List[Dict[str, Any]]]'}", + "field2": f"{'Union[int, str, Tuple[int, ...]]'}", + "field3": f"{'Callable[[str, int], Optional[bool]]'}", +} + +# More deeply nested formatting +def deep_format(data: Dict[str, Any], depth: int = 0) -> str: + indent = " " * depth + parts: List[str] = [] + for key, val in 
data.items(): + if isinstance(val, dict): + inner = deep_format(val, depth + 1) + parts.append(f"{indent}{key}:\n{inner}") + elif isinstance(val, list): + items_str = f", ".join(f"{v!r}" for v in val) + parts.append(f"{indent}{key}: [{items_str}]") + else: + parts.append(f"{indent}{key}: {val!r}") + return "\n".join(parts) + + +# Batch f-string generation to reach ~500 lines of f-string-heavy code +class LogFormatter: + _format: str + _fields: List[str] + + def __init__(self, fmt: str, fields: Optional[List[str]] = None) -> None: + self._format = fmt + self._fields = fields or [] + + def format(self, **kwargs: Any) -> str: + return f"[{self._format}] " + " ".join( + f"{f}={kwargs.get(f, 'N/A')!r}" for f in self._fields + ) + + +class TemplateEngine: + _templates: Dict[str, str] + + def __init__(self) -> None: + self._templates = {} + + def register(self, name: str, template: str) -> None: + self._templates[name] = template + + def render(self, name: str, **ctx: Any) -> str: + tmpl = self._templates.get(name, "") + return f"[{name}] {tmpl}" + "".join( + f" {k}={v}" for k, v in ctx.items() + ) + + +class HtmlBuilder: + _parts: List[str] + + def __init__(self) -> None: + self._parts = [] + + def tag(self, name: str, content: str, **attrs: str) -> "HtmlBuilder": + attr_str = " ".join(f'{k}="{v}"' for k, v in attrs.items()) + if attr_str: + self._parts.append(f"<{name} {attr_str}>{content}") + else: + self._parts.append(f"<{name}>{content}") + return self + + def div(self, content: str, class_name: str = "") -> "HtmlBuilder": + if class_name: + self._parts.append(f'
{content}
') + else: + self._parts.append(f"
{content}
") + return self + + def span(self, content: str, style: str = "") -> "HtmlBuilder": + if style: + self._parts.append(f'{content}') + else: + self._parts.append(f"{content}") + return self + + def build(self) -> str: + return f"\n\n\n{''.join(self._parts)}\n\n" + + +# End of fstring_heavy.py diff --git a/packages/pyright-internal/src/tests/benchmarkData/import_heavy.py b/packages/pyright-internal/src/tests/benchmarkData/import_heavy.py new file mode 100644 index 000000000000..f48c4f89f942 --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/import_heavy.py @@ -0,0 +1,375 @@ +# import_heavy.py — many import statements for resolution benchmarking + +from __future__ import annotations + +# Standard library imports (varied styles) +import os +import sys +import io +import re +import json +import csv +import math +import time +import copy +import enum +import abc +import ast +import dis +import ssl +import xml +import html +import http +import uuid +import zlib +import gzip +import lzma +import bz2 +import base64 +import hashlib +import hmac +import secrets +import random +import struct +import array +import queue +import heapq +import bisect +import decimal +import fractions +import statistics +import string +import textwrap +import unicodedata +import difflib +import pprint +import reprlib +import warnings +import traceback +import linecache +import inspect +import dis +import code +import codeop +import compile +import compileall + +# From imports +from os import path, getcwd, listdir, makedirs, remove, rename +from os.path import ( + join, + exists, + isfile, + isdir, + basename, + dirname, + abspath, + relpath, + normpath, + splitext, + getsize, + getmtime, +) +from sys import argv, exit, stdin, stdout, stderr, platform, version +from io import BytesIO, StringIO, BufferedReader, TextIOWrapper +from re import compile, match, search, findall, sub, split, Pattern, Match +from json import dumps, loads, dump, load, JSONEncoder, JSONDecoder +from csv 
import reader, writer, DictReader, DictWriter +from math import ( + ceil, + floor, + sqrt, + pow, + log, + log2, + log10, + exp, + sin, + cos, + tan, + pi, + e, + inf, + nan, + isnan, + isinf, + isfinite, + gcd, + factorial, +) +from time import time as time_func, sleep, monotonic, perf_counter +from copy import copy as shallow_copy, deepcopy +from enum import Enum, IntEnum, Flag, IntFlag, auto, unique +from abc import ABC, ABCMeta, abstractmethod +from collections import ( + OrderedDict, + defaultdict, + deque, + Counter, + namedtuple, + ChainMap, +) +from collections.abc import ( + Iterable, + Iterator, + Generator, + Sequence, + MutableSequence, + Set, + MutableSet, + Mapping, + MutableMapping, + Callable, + Hashable, + Sized, + Container, + Reversible, + Collection, + Awaitable, + Coroutine, + AsyncIterator, + AsyncIterable, + AsyncGenerator, +) +from typing import ( + Any, + ClassVar, + Dict, + Final, + Generic, + List, + Literal, + Optional, + Protocol, + Set as TSet, + Tuple, + Type, + TypeVar, + Union, + cast, + overload, + runtime_checkable, + get_type_hints, + TYPE_CHECKING, + NamedTuple, + TypedDict, + Annotated, + TypeAlias, + TypeGuard, + Never, + Self, + Unpack, + ParamSpec, + Concatenate, + assert_type, + reveal_type, + dataclass_transform, + no_type_check, +) +from functools import ( + reduce, + partial, + lru_cache, + wraps, + total_ordering, + singledispatch, + cached_property, +) +from itertools import ( + chain, + combinations, + permutations, + product, + repeat, + count, + cycle, + islice, + groupby, + starmap, + accumulate, + zip_longest, + compress, + filterfalse, + takewhile, + dropwhile, + tee, +) +from contextlib import ( + contextmanager, + asynccontextmanager, + closing, + suppress, + redirect_stdout, + redirect_stderr, + nullcontext, + ExitStack, + AbstractContextManager, +) +from dataclasses import dataclass, field, fields, asdict, astuple, make_dataclass +from pathlib import Path, PurePath, PosixPath, WindowsPath, PurePosixPath +from 
datetime import datetime, date, time as dt_time, timedelta, timezone +from urllib.parse import ( + urlparse, + urlencode, + urljoin, + quote, + unquote, + parse_qs, + parse_qsl, + urlsplit, + urlunsplit, +) +from http import HTTPStatus +from http.client import HTTPConnection, HTTPSConnection, HTTPResponse +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from xml.etree import ElementTree +from xml.dom import minidom +from html.parser import HTMLParser +from concurrent.futures import ( + ThreadPoolExecutor, + ProcessPoolExecutor, + Future, + as_completed, + wait, + FIRST_COMPLETED, + ALL_COMPLETED, +) +from threading import Thread, Lock, RLock, Event, Condition, Semaphore, Timer +from multiprocessing import Process, Pool, Queue as MPQueue, Value, Array, Manager +from subprocess import run, Popen, PIPE, DEVNULL, CalledProcessError +from shutil import copy2, copytree, rmtree, move, which, disk_usage +from tempfile import ( + TemporaryFile, + NamedTemporaryFile, + mkdtemp, + mkstemp, + gettempdir, + SpooledTemporaryFile, +) +from unittest import TestCase, TestSuite, TestLoader, TextTestRunner, mock +from unittest.mock import Mock, MagicMock, patch, call, ANY, PropertyMock +from logging import ( + Logger, + getLogger, + StreamHandler, + FileHandler, + Formatter, + DEBUG, + INFO, + WARNING, + ERROR, + CRITICAL, + basicConfig, +) +from argparse import ArgumentParser, Namespace, FileType, Action, HelpFormatter +from configparser import ConfigParser, RawConfigParser +from socket import socket, AF_INET, AF_INET6, SOCK_STREAM, SOCK_DGRAM +from signal import signal, SIGINT, SIGTERM, SIG_DFL, SIG_IGN +from weakref import ref, WeakValueDictionary, WeakKeyDictionary, finalize +from operator import ( + add, + sub, + mul, + truediv, + floordiv, + mod, + pow as op_pow, + neg, + pos, + abs as op_abs, + eq, + ne, + lt, + le, + gt, + ge, + and_, + or_, + xor, + not_, + itemgetter, + attrgetter, + methodcaller, +) + +# Conditional imports +if 
TYPE_CHECKING: + from _typeshed import SupportsRead, SupportsWrite, StrPath + from typing_extensions import Buffer, ReadOnly + +# Aliased imports +import os.path as osp +import collections.abc as cabc +import xml.etree.ElementTree as ET + +# Try/except imports (common pattern) +try: + import numpy as np # type: ignore +except ImportError: + np = None # type: ignore + +try: + import pandas as pd # type: ignore +except ImportError: + pd = None # type: ignore + +try: + import requests # type: ignore +except ImportError: + requests = None # type: ignore + +try: + import yaml # type: ignore +except ImportError: + yaml = None # type: ignore + +try: + import toml # type: ignore +except ImportError: + toml = None # type: ignore + + +# Code that uses imported names to exercise resolution +def use_imports() -> None: + """Function that references many imported names.""" + p = Path(".") + files = list(p.iterdir()) + cwd = getcwd() + + data: Dict[str, Any] = {"key": "value"} + json_str = dumps(data) + parsed = loads(json_str) + + now = datetime.now() + delta = timedelta(days=1) + tomorrow = now + delta + + url = urlparse("https://example.com/path?key=value") + + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [executor.submit(lambda x: x * 2, i) for i in range(10)] + + parser = ArgumentParser(description="test") + parser.add_argument("--verbose", action="store_true") + + logger = getLogger(__name__) + logger.setLevel(DEBUG) + + tmp_dir = mkdtemp() + result = sqrt(144) + items = list(chain([1, 2], [3, 4], [5, 6])) + grouped = groupby(sorted(items), key=lambda x: x % 2) + + counter = Counter(items) + + # Type aliases using imported types + Config: TypeAlias = Dict[str, Union[str, int, float, bool, List[Any]]] + Handler: TypeAlias = Callable[[str, int], Optional[bool]] + DataRow: TypeAlias = Tuple[int, str, float, Optional[str]] + + _ = (p, files, cwd, data, json_str, parsed, now, delta, tomorrow, url, + parser, logger, tmp_dir, result, items, grouped, counter) + + 
+# End of import_heavy.py diff --git a/packages/pyright-internal/src/tests/benchmarkData/large_class.py b/packages/pyright-internal/src/tests/benchmarkData/large_class.py new file mode 100644 index 000000000000..8d4fb76e484f --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/large_class.py @@ -0,0 +1,853 @@ +# large_class.py — class with 200+ methods for member completion benchmarking + +from __future__ import annotations + +from typing import ( + Any, + ClassVar, + Dict, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Union, +) + +_T = TypeVar("_T") + + +class LargeClass: + """A class with many methods to stress member completion.""" + + # Class variables + VERSION: ClassVar[str] = "1.0.0" + MAX_SIZE: ClassVar[int] = 1000 + DEFAULT_NAME: ClassVar[str] = "unnamed" + + # Instance variables + _name: str + _data: List[Any] + _metadata: Dict[str, Any] + _flags: Set[str] + _parent: Optional[LargeClass] + _children: List[LargeClass] + _cache: Dict[str, Any] + _counter: int + + def __init__( + self, + name: str, + data: Optional[List[Any]] = None, + parent: Optional[LargeClass] = None, + ) -> None: + self._name = name + self._data = data or [] + self._metadata = {} + self._flags = set() + self._parent = parent + self._children = [] + self._cache = {} + self._counter = 0 + + # --- Properties (20) --- + + @property + def name(self) -> str: + return self._name + + @name.setter + def name(self, value: str) -> None: + self._name = value + + @property + def data(self) -> List[Any]: + return self._data + + @property + def metadata(self) -> Dict[str, Any]: + return self._metadata + + @property + def flags(self) -> Set[str]: + return self._flags + + @property + def parent(self) -> Optional[LargeClass]: + return self._parent + + @property + def children(self) -> List[LargeClass]: + return self._children + + @property + def size(self) -> int: + return len(self._data) + + @property + def is_empty(self) -> bool: + return len(self._data) == 0 
+ + @property + def is_root(self) -> bool: + return self._parent is None + + @property + def is_leaf(self) -> bool: + return len(self._children) == 0 + + @property + def depth(self) -> int: + d = 0 + node = self._parent + while node is not None: + d += 1 + node = node._parent + return d + + @property + def path(self) -> str: + parts: List[str] = [] + node: Optional[LargeClass] = self + while node is not None: + parts.append(node._name) + node = node._parent + parts.reverse() + return "/".join(parts) + + @property + def root(self) -> LargeClass: + node = self + while node._parent is not None: + node = node._parent + return node + + @property + def siblings(self) -> List[LargeClass]: + if self._parent is None: + return [] + return [c for c in self._parent._children if c is not self] + + @property + def descendant_count(self) -> int: + count = len(self._children) + for child in self._children: + count += child.descendant_count + return count + + @property + def total_data_size(self) -> int: + total = len(self._data) + for child in self._children: + total += child.total_data_size + return total + + @property + def counter(self) -> int: + return self._counter + + @property + def cache_size(self) -> int: + return len(self._cache) + + @property + def has_metadata(self) -> bool: + return len(self._metadata) > 0 + + # --- Data manipulation methods (40) --- + + def add_item(self, item: Any) -> None: + self._data.append(item) + + def add_items(self, items: Sequence[Any]) -> None: + self._data.extend(items) + + def insert_item(self, index: int, item: Any) -> None: + self._data.insert(index, item) + + def remove_item(self, item: Any) -> bool: + try: + self._data.remove(item) + return True + except ValueError: + return False + + def pop_item(self, index: int = -1) -> Any: + return self._data.pop(index) + + def clear_data(self) -> None: + self._data.clear() + + def sort_data(self, reverse: bool = False) -> None: + self._data.sort(reverse=reverse) + + def reverse_data(self) -> 
None: + self._data.reverse() + + def get_item(self, index: int) -> Any: + return self._data[index] + + def get_items(self, start: int, end: int) -> List[Any]: + return self._data[start:end] + + def set_item(self, index: int, value: Any) -> None: + self._data[index] = value + + def find_item(self, item: Any) -> int: + try: + return self._data.index(item) + except ValueError: + return -1 + + def contains_item(self, item: Any) -> bool: + return item in self._data + + def count_item(self, item: Any) -> int: + return self._data.count(item) + + def first_item(self) -> Optional[Any]: + return self._data[0] if self._data else None + + def last_item(self) -> Optional[Any]: + return self._data[-1] if self._data else None + + def unique_items(self) -> List[Any]: + seen: Set[Any] = set() + result: List[Any] = [] + for item in self._data: + if item not in seen: + seen.add(item) + result.append(item) + return result + + def filter_items(self, predicate: Any) -> List[Any]: + return [item for item in self._data if predicate(item)] + + def map_items(self, func: Any) -> List[Any]: + return [func(item) for item in self._data] + + def reduce_items(self, func: Any, initial: Any = None) -> Any: + result = initial + for item in self._data: + if result is None: + result = item + else: + result = func(result, item) + return result + + def zip_with(self, other: Sequence[Any]) -> List[Tuple[Any, Any]]: + return list(zip(self._data, other)) + + def enumerate_items(self) -> List[Tuple[int, Any]]: + return list(enumerate(self._data)) + + def chunk_data(self, size: int) -> List[List[Any]]: + return [self._data[i : i + size] for i in range(0, len(self._data), size)] + + def flatten_data(self) -> List[Any]: + result: List[Any] = [] + for item in self._data: + if isinstance(item, list): + result.extend(item) + else: + result.append(item) + return result + + def take_items(self, n: int) -> List[Any]: + return self._data[:n] + + def drop_items(self, n: int) -> List[Any]: + return self._data[n:] + + 
def sample_items(self, n: int) -> List[Any]: + import random + return random.sample(self._data, min(n, len(self._data))) + + def shuffle_data(self) -> None: + import random + random.shuffle(self._data) + + def min_item(self) -> Optional[Any]: + return min(self._data) if self._data else None + + def max_item(self) -> Optional[Any]: + return max(self._data) if self._data else None + + def sum_items(self) -> Any: + return sum(self._data) if self._data else 0 + + def average_items(self) -> Optional[float]: + if not self._data: + return None + return sum(self._data) / len(self._data) + + def group_by(self, key_func: Any) -> Dict[Any, List[Any]]: + groups: Dict[Any, List[Any]] = {} + for item in self._data: + k = key_func(item) + if k not in groups: + groups[k] = [] + groups[k].append(item) + return groups + + def partition(self, predicate: Any) -> Tuple[List[Any], List[Any]]: + true_items: List[Any] = [] + false_items: List[Any] = [] + for item in self._data: + if predicate(item): + true_items.append(item) + else: + false_items.append(item) + return (true_items, false_items) + + def all_match(self, predicate: Any) -> bool: + return all(predicate(item) for item in self._data) + + def any_match(self, predicate: Any) -> bool: + return any(predicate(item) for item in self._data) + + def none_match(self, predicate: Any) -> bool: + return not any(predicate(item) for item in self._data) + + def find_first(self, predicate: Any) -> Optional[Any]: + for item in self._data: + if predicate(item): + return item + return None + + def find_last(self, predicate: Any) -> Optional[Any]: + for item in reversed(self._data): + if predicate(item): + return item + return None + + def distinct_count(self) -> int: + return len(set(self._data)) + + # --- Metadata methods (20) --- + + def set_metadata(self, key: str, value: Any) -> None: + self._metadata[key] = value + + def get_metadata(self, key: str, default: Any = None) -> Any: + return self._metadata.get(key, default) + + def 
has_metadata_key(self, key: str) -> bool: + return key in self._metadata + + def remove_metadata(self, key: str) -> Optional[Any]: + return self._metadata.pop(key, None) + + def clear_metadata(self) -> None: + self._metadata.clear() + + def metadata_keys(self) -> List[str]: + return list(self._metadata.keys()) + + def metadata_values(self) -> List[Any]: + return list(self._metadata.values()) + + def metadata_items(self) -> List[Tuple[str, Any]]: + return list(self._metadata.items()) + + def merge_metadata(self, other: Dict[str, Any]) -> None: + self._metadata.update(other) + + def copy_metadata_from(self, source: LargeClass) -> None: + self._metadata.update(source._metadata) + + def filter_metadata(self, predicate: Any) -> Dict[str, Any]: + return {k: v for k, v in self._metadata.items() if predicate(k, v)} + + def transform_metadata_values(self, func: Any) -> Dict[str, Any]: + return {k: func(v) for k, v in self._metadata.items()} + + def metadata_to_json(self) -> str: + import json + return json.dumps(self._metadata) + + def metadata_from_json(self, json_str: str) -> None: + import json + self._metadata = json.loads(json_str) + + def validate_metadata(self, schema: Dict[str, type]) -> List[str]: + errors: List[str] = [] + for key, expected_type in schema.items(): + if key not in self._metadata: + errors.append(f"Missing key: {key}") + elif not isinstance(self._metadata[key], expected_type): + errors.append(f"Wrong type for {key}: expected {expected_type.__name__}") + return errors + + def metadata_diff(self, other: LargeClass) -> Dict[str, Tuple[Any, Any]]: + all_keys = set(self._metadata.keys()) | set(other._metadata.keys()) + diff: Dict[str, Tuple[Any, Any]] = {} + for key in all_keys: + v1 = self._metadata.get(key) + v2 = other._metadata.get(key) + if v1 != v2: + diff[key] = (v1, v2) + return diff + + def snapshot_metadata(self) -> Dict[str, Any]: + return dict(self._metadata) + + def restore_metadata(self, snapshot: Dict[str, Any]) -> None: + self._metadata = 
dict(snapshot) + + def metadata_size_bytes(self) -> int: + import sys + return sys.getsizeof(self._metadata) + + def metadata_summary(self) -> str: + return f"Metadata: {len(self._metadata)} keys" + + # --- Flag methods (15) --- + + def add_flag(self, flag: str) -> None: + self._flags.add(flag) + + def remove_flag(self, flag: str) -> None: + self._flags.discard(flag) + + def has_flag(self, flag: str) -> bool: + return flag in self._flags + + def toggle_flag(self, flag: str) -> bool: + if flag in self._flags: + self._flags.discard(flag) + return False + self._flags.add(flag) + return True + + def clear_flags(self) -> None: + self._flags.clear() + + def set_flags(self, flags: Set[str]) -> None: + self._flags = set(flags) + + def get_flags(self) -> Set[str]: + return set(self._flags) + + def flag_count(self) -> int: + return len(self._flags) + + def has_any_flag(self, flags: Set[str]) -> bool: + return bool(self._flags & flags) + + def has_all_flags(self, flags: Set[str]) -> bool: + return flags.issubset(self._flags) + + def common_flags(self, other: LargeClass) -> Set[str]: + return self._flags & other._flags + + def diff_flags(self, other: LargeClass) -> Set[str]: + return self._flags - other._flags + + def union_flags(self, other: LargeClass) -> Set[str]: + return self._flags | other._flags + + def flags_to_list(self) -> List[str]: + return sorted(self._flags) + + def flags_summary(self) -> str: + return f"Flags: {', '.join(sorted(self._flags))}" + + # --- Tree methods (25) --- + + def add_child(self, child: LargeClass) -> None: + child._parent = self + self._children.append(child) + + def remove_child(self, child: LargeClass) -> bool: + try: + self._children.remove(child) + child._parent = None + return True + except ValueError: + return False + + def detach(self) -> None: + if self._parent: + self._parent.remove_child(self) + + def move_to(self, new_parent: LargeClass) -> None: + self.detach() + new_parent.add_child(self) + + def get_child(self, index: int) -> 
LargeClass: + return self._children[index] + + def find_child(self, name: str) -> Optional[LargeClass]: + for child in self._children: + if child._name == name: + return child + return None + + def find_descendant(self, name: str) -> Optional[LargeClass]: + for child in self._children: + if child._name == name: + return child + found = child.find_descendant(name) + if found is not None: + return found + return None + + def walk_tree(self) -> Iterator[LargeClass]: + yield self + for child in self._children: + yield from child.walk_tree() + + def walk_leaves(self) -> Iterator[LargeClass]: + if self.is_leaf: + yield self + else: + for child in self._children: + yield from child.walk_leaves() + + def ancestors(self) -> List[LargeClass]: + result: List[LargeClass] = [] + node = self._parent + while node is not None: + result.append(node) + node = node._parent + return result + + def common_ancestor(self, other: LargeClass) -> Optional[LargeClass]: + my_ancestors = set(id(a) for a in self.ancestors()) + node: Optional[LargeClass] = other + while node is not None: + if id(node) in my_ancestors: + return node + node = node._parent + return None + + def subtree_size(self) -> int: + return 1 + sum(child.subtree_size() for child in self._children) + + def height(self) -> int: + if not self._children: + return 0 + return 1 + max(child.height() for child in self._children) + + def is_ancestor_of(self, other: LargeClass) -> bool: + node = other._parent + while node is not None: + if node is self: + return True + node = node._parent + return False + + def is_descendant_of(self, other: LargeClass) -> bool: + return other.is_ancestor_of(self) + + def child_count(self) -> int: + return len(self._children) + + def sort_children(self, key: Optional[Any] = None) -> None: + if key: + self._children.sort(key=key) + else: + self._children.sort(key=lambda c: c._name) + + def reverse_children(self) -> None: + self._children.reverse() + + def flatten_tree(self) -> List[LargeClass]: + return 
list(self.walk_tree()) + + def tree_depth_map(self) -> Dict[int, List[LargeClass]]: + result: Dict[int, List[LargeClass]] = {} + for node in self.walk_tree(): + d = node.depth + if d not in result: + result[d] = [] + result[d].append(node) + return result + + def prune(self, predicate: Any) -> int: + removed = 0 + keep: List[LargeClass] = [] + for child in self._children: + if predicate(child): + child._parent = None + removed += 1 + else: + keep.append(child) + removed += child.prune(predicate) + self._children = keep + return removed + + def clone(self) -> LargeClass: + new_node = LargeClass(self._name, list(self._data)) + new_node._metadata = dict(self._metadata) + new_node._flags = set(self._flags) + for child in self._children: + cloned_child = child.clone() + new_node.add_child(cloned_child) + return new_node + + def merge_with(self, other: LargeClass) -> None: + self._data.extend(other._data) + self._metadata.update(other._metadata) + self._flags.update(other._flags) + for child in other._children: + self.add_child(child) + + def tree_summary(self) -> str: + return f"Tree({self._name}, children={self.child_count()}, descendants={self.descendant_count})" + + # --- Cache methods (10) --- + + def cache_get(self, key: str) -> Optional[Any]: + return self._cache.get(key) + + def cache_set(self, key: str, value: Any) -> None: + self._cache[key] = value + + def cache_has(self, key: str) -> bool: + return key in self._cache + + def cache_remove(self, key: str) -> Optional[Any]: + return self._cache.pop(key, None) + + def cache_clear(self) -> None: + self._cache.clear() + + def cache_keys(self) -> List[str]: + return list(self._cache.keys()) + + def cache_values(self) -> List[Any]: + return list(self._cache.values()) + + def cache_items(self) -> List[Tuple[str, Any]]: + return list(self._cache.items()) + + def cache_update(self, data: Dict[str, Any]) -> None: + self._cache.update(data) + + def cache_get_or_set(self, key: str, factory: Any) -> Any: + if key not in 
self._cache: + self._cache[key] = factory() + return self._cache[key] + + # --- Counter methods (10) --- + + def increment(self, by: int = 1) -> int: + self._counter += by + return self._counter + + def decrement(self, by: int = 1) -> int: + self._counter -= by + return self._counter + + def reset_counter(self) -> None: + self._counter = 0 + + def set_counter(self, value: int) -> None: + self._counter = value + + def counter_is_zero(self) -> bool: + return self._counter == 0 + + def counter_is_positive(self) -> bool: + return self._counter > 0 + + def counter_is_negative(self) -> bool: + return self._counter < 0 + + def counter_abs(self) -> int: + return abs(self._counter) + + def counter_clamp(self, low: int, high: int) -> int: + self._counter = max(low, min(high, self._counter)) + return self._counter + + def counter_summary(self) -> str: + return f"Counter: {self._counter}" + + # --- Serialization methods (10) --- + + def to_dict(self) -> Dict[str, Any]: + return { + "name": self._name, + "data": self._data, + "metadata": self._metadata, + "flags": list(self._flags), + "counter": self._counter, + "children": [c.to_dict() for c in self._children], + } + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> LargeClass: + obj = cls(d["name"], d.get("data", [])) + obj._metadata = d.get("metadata", {}) + obj._flags = set(d.get("flags", [])) + obj._counter = d.get("counter", 0) + for child_dict in d.get("children", []): + child = cls.from_dict(child_dict) + obj.add_child(child) + return obj + + def to_json(self) -> str: + import json + return json.dumps(self.to_dict()) + + @classmethod + def from_json(cls, json_str: str) -> LargeClass: + import json + return cls.from_dict(json.loads(json_str)) + + def to_yaml_str(self) -> str: + lines: List[str] = [f"name: {self._name}"] + lines.append(f"counter: {self._counter}") + lines.append(f"flags: [{', '.join(sorted(self._flags))}]") + return "\n".join(lines) + + def copy(self) -> LargeClass: + return 
LargeClass.from_dict(self.to_dict()) + + def equals(self, other: LargeClass) -> bool: + return self.to_dict() == other.to_dict() + + def hash_value(self) -> int: + return hash((self._name, tuple(self._data), self._counter)) + + def size_bytes(self) -> int: + import sys + return sys.getsizeof(self) + + def describe(self) -> str: + return ( + f"LargeClass(name={self._name!r}, " + f"data_size={len(self._data)}, " + f"metadata_keys={len(self._metadata)}, " + f"flags={len(self._flags)}, " + f"children={len(self._children)}, " + f"counter={self._counter})" + ) + + # --- Dunder methods (20) --- + + def __repr__(self) -> str: + return f"LargeClass({self._name!r})" + + def __str__(self) -> str: + return self._name + + def __len__(self) -> int: + return len(self._data) + + def __bool__(self) -> bool: + return len(self._data) > 0 + + def __contains__(self, item: Any) -> bool: + return item in self._data + + def __iter__(self) -> Iterator[Any]: + return iter(self._data) + + def __getitem__(self, index: int) -> Any: + return self._data[index] + + def __setitem__(self, index: int, value: Any) -> None: + self._data[index] = value + + def __delitem__(self, index: int) -> None: + del self._data[index] + + def __eq__(self, other: object) -> bool: + if not isinstance(other, LargeClass): + return NotImplemented + return self._name == other._name and self._data == other._data + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + def __hash__(self) -> int: + return hash(self._name) + + def __lt__(self, other: LargeClass) -> bool: + return self._name < other._name + + def __le__(self, other: LargeClass) -> bool: + return self._name <= other._name + + def __gt__(self, other: LargeClass) -> bool: + return self._name > other._name + + def __ge__(self, other: LargeClass) -> bool: + return self._name >= other._name + + def __add__(self, other: LargeClass) -> LargeClass: + result = self.clone() + result._data.extend(other._data) + return result + + def __iadd__(self, 
other: LargeClass) -> LargeClass: + self._data.extend(other._data) + return self + + def __enter__(self) -> LargeClass: + return self + + def __exit__(self, *args: Any) -> None: + self.clear_data() + self.clear_metadata() + self.clear_flags() + self.cache_clear() + + +# --- Subclass to add more methods for completion ancestor chain --- + + +class ExtendedClass(LargeClass): + """Extension with additional domain methods.""" + + _tags: List[str] + _version: int + + def __init__(self, name: str, version: int = 1) -> None: + super().__init__(name) + self._tags = [] + self._version = version + + def add_tag(self, tag: str) -> None: + self._tags.append(tag) + + def remove_tag(self, tag: str) -> None: + if tag in self._tags: + self._tags.remove(tag) + + def has_tag(self, tag: str) -> bool: + return tag in self._tags + + def get_tags(self) -> List[str]: + return list(self._tags) + + def clear_tags(self) -> None: + self._tags.clear() + + def bump_version(self) -> int: + self._version += 1 + return self._version + + def get_version(self) -> int: + return self._version + + def set_version(self, version: int) -> None: + self._version = version + + def version_string(self) -> str: + return f"v{self._version}" + + def full_describe(self) -> str: + base = self.describe() + return f"{base}, tags={len(self._tags)}, version={self._version}" + + +# Marker for completion benchmark — trigger point +obj = ExtendedClass("test") +obj. # completion trigger point diff --git a/packages/pyright-internal/src/tests/benchmarkData/large_stdlib.py b/packages/pyright-internal/src/tests/benchmarkData/large_stdlib.py new file mode 100644 index 000000000000..9cc973aca832 --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/large_stdlib.py @@ -0,0 +1,1721 @@ +# large_stdlib.py — simulates a large stdlib-like module (~3000+ lines) +# Used for tokenizer/parser/type-evaluator benchmarking. 
+ +from __future__ import annotations + +import os +import sys +import typing +from typing import ( + Any, + ClassVar, + Dict, + Final, + Generic, + Iterator, + List, + Literal, + Optional, + Protocol, + Sequence, + Set, + Tuple, + TypeVar, + Union, + overload, + runtime_checkable, +) + +_T = TypeVar("_T") +_T_co = TypeVar("_T_co", covariant=True) +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_S = TypeVar("_S", bound="Sortable") + + +class Sortable(Protocol): + def __lt__(self, other: Any) -> bool: ... + def __le__(self, other: Any) -> bool: ... + + +# --- Large class hierarchy --- + + +class BaseNode: + """Base class for all AST nodes.""" + + kind: ClassVar[str] = "base" + _parent: Optional[BaseNode] = None + _children: List[BaseNode] + _line: int + _col: int + _end_line: int + _end_col: int + + def __init__( + self, + line: int = 0, + col: int = 0, + end_line: int = 0, + end_col: int = 0, + ) -> None: + self._children = [] + self._line = line + self._col = col + self._end_line = end_line + self._end_col = end_col + + @property + def parent(self) -> Optional[BaseNode]: + return self._parent + + @parent.setter + def parent(self, value: Optional[BaseNode]) -> None: + self._parent = value + + def add_child(self, child: BaseNode) -> None: + child._parent = self + self._children.append(child) + + def remove_child(self, child: BaseNode) -> None: + self._children.remove(child) + child._parent = None + + def walk(self) -> Iterator[BaseNode]: + yield self + for child in self._children: + yield from child.walk() + + def find_parent(self, kind: str) -> Optional[BaseNode]: + node = self._parent + while node is not None: + if node.kind == kind: + return node + node = node._parent + return None + + def depth(self) -> int: + d = 0 + node = self._parent + while node is not None: + d += 1 + node = node._parent + return d + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(line={self._line}, col={self._col})" + + def __eq__(self, other: object) -> bool: + if not 
isinstance(other, BaseNode): + return NotImplemented + return ( + self.kind == other.kind + and self._line == other._line + and self._col == other._col + ) + + def __hash__(self) -> int: + return hash((self.kind, self._line, self._col)) + + +class Expression(BaseNode): + kind: ClassVar[str] = "expression" + + def evaluate(self) -> Any: + raise NotImplementedError + + +class Statement(BaseNode): + kind: ClassVar[str] = "statement" + + def execute(self) -> None: + raise NotImplementedError + + +class Module(BaseNode): + kind: ClassVar[str] = "module" + name: str + docstring: Optional[str] + imports: List[ImportStatement] + body: List[Statement] + + def __init__(self, name: str, docstring: Optional[str] = None) -> None: + super().__init__() + self.name = name + self.docstring = docstring + self.imports = [] + self.body = [] + + +class ImportStatement(Statement): + kind: ClassVar[str] = "import" + module_name: str + alias: Optional[str] + names: List[Tuple[str, Optional[str]]] + + def __init__( + self, + module_name: str, + alias: Optional[str] = None, + names: Optional[List[Tuple[str, Optional[str]]]] = None, + ) -> None: + super().__init__() + self.module_name = module_name + self.alias = alias + self.names = names or [] + + def execute(self) -> None: + pass + + +class FunctionDef(Statement): + kind: ClassVar[str] = "funcdef" + name: str + args: List[Argument] + return_type: Optional[Expression] + body: List[Statement] + decorators: List[Expression] + is_async: bool + + def __init__( + self, + name: str, + args: Optional[List[Argument]] = None, + return_type: Optional[Expression] = None, + is_async: bool = False, + ) -> None: + super().__init__() + self.name = name + self.args = args or [] + self.return_type = return_type + self.body = [] + self.decorators = [] + self.is_async = is_async + + def execute(self) -> None: + pass + + +class ClassDef(Statement): + kind: ClassVar[str] = "classdef" + name: str + bases: List[Expression] + body: List[Statement] + decorators: 
List[Expression] + metaclass: Optional[Expression] + + def __init__( + self, + name: str, + bases: Optional[List[Expression]] = None, + metaclass: Optional[Expression] = None, + ) -> None: + super().__init__() + self.name = name + self.bases = bases or [] + self.body = [] + self.decorators = [] + self.metaclass = metaclass + + def execute(self) -> None: + pass + + +class Argument: + name: str + annotation: Optional[Expression] + default: Optional[Expression] + kind: str # "positional", "keyword", "*args", "**kwargs" + + def __init__( + self, + name: str, + annotation: Optional[Expression] = None, + default: Optional[Expression] = None, + kind: str = "positional", + ) -> None: + self.name = name + self.annotation = annotation + self.default = default + self.kind = kind + + +class AssignStatement(Statement): + kind: ClassVar[str] = "assign" + targets: List[Expression] + value: Expression + type_comment: Optional[str] + + def __init__( + self, + targets: List[Expression], + value: Expression, + type_comment: Optional[str] = None, + ) -> None: + super().__init__() + self.targets = targets + self.value = value + self.type_comment = type_comment + + def execute(self) -> None: + pass + + +class ReturnStatement(Statement): + kind: ClassVar[str] = "return" + value: Optional[Expression] + + def __init__(self, value: Optional[Expression] = None) -> None: + super().__init__() + self.value = value + + def execute(self) -> None: + pass + + +class IfStatement(Statement): + kind: ClassVar[str] = "if" + condition: Expression + body: List[Statement] + elif_clauses: List[Tuple[Expression, List[Statement]]] + else_body: Optional[List[Statement]] + + def __init__(self, condition: Expression) -> None: + super().__init__() + self.condition = condition + self.body = [] + self.elif_clauses = [] + self.else_body = None + + def execute(self) -> None: + pass + + +class ForStatement(Statement): + kind: ClassVar[str] = "for" + target: Expression + iterable: Expression + body: List[Statement] + 
else_body: Optional[List[Statement]] + is_async: bool + + def __init__( + self, + target: Expression, + iterable: Expression, + is_async: bool = False, + ) -> None: + super().__init__() + self.target = target + self.iterable = iterable + self.body = [] + self.else_body = None + self.is_async = is_async + + def execute(self) -> None: + pass + + +class WhileStatement(Statement): + kind: ClassVar[str] = "while" + condition: Expression + body: List[Statement] + else_body: Optional[List[Statement]] + + def __init__(self, condition: Expression) -> None: + super().__init__() + self.condition = condition + self.body = [] + self.else_body = None + + def execute(self) -> None: + pass + + +class TryStatement(Statement): + kind: ClassVar[str] = "try" + body: List[Statement] + handlers: List[ExceptHandler] + else_body: Optional[List[Statement]] + finally_body: Optional[List[Statement]] + + def __init__(self) -> None: + super().__init__() + self.body = [] + self.handlers = [] + self.else_body = None + self.finally_body = None + + def execute(self) -> None: + pass + + +class ExceptHandler(BaseNode): + kind: ClassVar[str] = "except_handler" + exception_type: Optional[Expression] + name: Optional[str] + body: List[Statement] + + def __init__( + self, + exception_type: Optional[Expression] = None, + name: Optional[str] = None, + ) -> None: + super().__init__() + self.exception_type = exception_type + self.name = name + self.body = [] + + +class WithStatement(Statement): + kind: ClassVar[str] = "with" + items: List[Tuple[Expression, Optional[Expression]]] + body: List[Statement] + is_async: bool + + def __init__(self, is_async: bool = False) -> None: + super().__init__() + self.items = [] + self.body = [] + self.is_async = is_async + + def execute(self) -> None: + pass + + +class RaiseStatement(Statement): + kind: ClassVar[str] = "raise" + exception: Optional[Expression] + cause: Optional[Expression] + + def __init__( + self, + exception: Optional[Expression] = None, + cause: 
Optional[Expression] = None, + ) -> None: + super().__init__() + self.exception = exception + self.cause = cause + + def execute(self) -> None: + pass + + +class AssertStatement(Statement): + kind: ClassVar[str] = "assert" + test: Expression + msg: Optional[Expression] + + def __init__( + self, + test: Expression, + msg: Optional[Expression] = None, + ) -> None: + super().__init__() + self.test = test + self.msg = msg + + def execute(self) -> None: + pass + + +# --- Expressions --- + + +class NameExpr(Expression): + kind: ClassVar[str] = "name" + id: str + + def __init__(self, id: str) -> None: + super().__init__() + self.id = id + + def evaluate(self) -> str: + return self.id + + +class NumberLiteral(Expression): + kind: ClassVar[str] = "number" + value: Union[int, float, complex] + + def __init__(self, value: Union[int, float, complex]) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> Union[int, float, complex]: + return self.value + + +class StringLiteral(Expression): + kind: ClassVar[str] = "string" + value: str + is_fstring: bool + is_bytes: bool + is_raw: bool + + def __init__( + self, + value: str, + is_fstring: bool = False, + is_bytes: bool = False, + is_raw: bool = False, + ) -> None: + super().__init__() + self.value = value + self.is_fstring = is_fstring + self.is_bytes = is_bytes + self.is_raw = is_raw + + def evaluate(self) -> str: + return self.value + + +class BoolLiteral(Expression): + kind: ClassVar[str] = "bool" + value: bool + + def __init__(self, value: bool) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> bool: + return self.value + + +class NoneLiteral(Expression): + kind: ClassVar[str] = "none" + + def evaluate(self) -> None: + return None + + +class EllipsisLiteral(Expression): + kind: ClassVar[str] = "ellipsis" + + def evaluate(self) -> Any: + return ... 
+ + +class BinaryOp(Expression): + kind: ClassVar[str] = "binop" + left: Expression + op: str + right: Expression + + def __init__(self, left: Expression, op: str, right: Expression) -> None: + super().__init__() + self.left = left + self.op = op + self.right = right + + def evaluate(self) -> Any: + raise NotImplementedError + + +class UnaryOp(Expression): + kind: ClassVar[str] = "unaryop" + op: str + operand: Expression + + def __init__(self, op: str, operand: Expression) -> None: + super().__init__() + self.op = op + self.operand = operand + + def evaluate(self) -> Any: + raise NotImplementedError + + +class CompareExpr(Expression): + kind: ClassVar[str] = "compare" + left: Expression + comparators: List[Tuple[str, Expression]] + + def __init__(self, left: Expression) -> None: + super().__init__() + self.left = left + self.comparators = [] + + def evaluate(self) -> bool: + raise NotImplementedError + + +class CallExpr(Expression): + kind: ClassVar[str] = "call" + func: Expression + args: List[Expression] + kwargs: Dict[str, Expression] + starargs: List[Expression] + starkwargs: List[Expression] + + def __init__(self, func: Expression) -> None: + super().__init__() + self.func = func + self.args = [] + self.kwargs = {} + self.starargs = [] + self.starkwargs = [] + + def evaluate(self) -> Any: + raise NotImplementedError + + +class AttributeExpr(Expression): + kind: ClassVar[str] = "attribute" + value: Expression + attr: str + + def __init__(self, value: Expression, attr: str) -> None: + super().__init__() + self.value = value + self.attr = attr + + def evaluate(self) -> Any: + raise NotImplementedError + + +class SubscriptExpr(Expression): + kind: ClassVar[str] = "subscript" + value: Expression + index: Expression + + def __init__(self, value: Expression, index: Expression) -> None: + super().__init__() + self.value = value + self.index = index + + def evaluate(self) -> Any: + raise NotImplementedError + + +class ListExpr(Expression): + kind: ClassVar[str] = 
"list" + elements: List[Expression] + + def __init__(self, elements: Optional[List[Expression]] = None) -> None: + super().__init__() + self.elements = elements or [] + + def evaluate(self) -> list: + raise NotImplementedError + + +class DictExpr(Expression): + kind: ClassVar[str] = "dict" + keys: List[Optional[Expression]] + values: List[Expression] + + def __init__(self) -> None: + super().__init__() + self.keys = [] + self.values = [] + + def evaluate(self) -> dict: + raise NotImplementedError + + +class SetExpr(Expression): + kind: ClassVar[str] = "set" + elements: List[Expression] + + def __init__(self, elements: Optional[List[Expression]] = None) -> None: + super().__init__() + self.elements = elements or [] + + def evaluate(self) -> set: + raise NotImplementedError + + +class TupleExpr(Expression): + kind: ClassVar[str] = "tuple" + elements: List[Expression] + + def __init__(self, elements: Optional[List[Expression]] = None) -> None: + super().__init__() + self.elements = elements or [] + + def evaluate(self) -> tuple: + raise NotImplementedError + + +class LambdaExpr(Expression): + kind: ClassVar[str] = "lambda" + args: List[Argument] + body: Expression + + def __init__(self, body: Expression) -> None: + super().__init__() + self.args = [] + self.body = body + + def evaluate(self) -> Any: + raise NotImplementedError + + +class ListCompExpr(Expression): + kind: ClassVar[str] = "listcomp" + element: Expression + generators: List[Tuple[Expression, Expression, List[Expression]]] + + def __init__(self, element: Expression) -> None: + super().__init__() + self.element = element + self.generators = [] + + def evaluate(self) -> list: + raise NotImplementedError + + +class DictCompExpr(Expression): + kind: ClassVar[str] = "dictcomp" + key: Expression + value: Expression + generators: List[Tuple[Expression, Expression, List[Expression]]] + + def __init__(self, key: Expression, value: Expression) -> None: + super().__init__() + self.key = key + self.value = value + 
self.generators = [] + + def evaluate(self) -> dict: + raise NotImplementedError + + +class SetCompExpr(Expression): + kind: ClassVar[str] = "setcomp" + element: Expression + generators: List[Tuple[Expression, Expression, List[Expression]]] + + def __init__(self, element: Expression) -> None: + super().__init__() + self.element = element + self.generators = [] + + def evaluate(self) -> set: + raise NotImplementedError + + +class GeneratorExpr(Expression): + kind: ClassVar[str] = "genexpr" + element: Expression + generators: List[Tuple[Expression, Expression, List[Expression]]] + + def __init__(self, element: Expression) -> None: + super().__init__() + self.element = element + self.generators = [] + + def evaluate(self) -> Any: + raise NotImplementedError + + +class ConditionalExpr(Expression): + kind: ClassVar[str] = "conditional" + body: Expression + test: Expression + orelse: Expression + + def __init__( + self, + body: Expression, + test: Expression, + orelse: Expression, + ) -> None: + super().__init__() + self.body = body + self.test = test + self.orelse = orelse + + def evaluate(self) -> Any: + raise NotImplementedError + + +class SliceExpr(Expression): + kind: ClassVar[str] = "slice" + lower: Optional[Expression] + upper: Optional[Expression] + step: Optional[Expression] + + def __init__( + self, + lower: Optional[Expression] = None, + upper: Optional[Expression] = None, + step: Optional[Expression] = None, + ) -> None: + super().__init__() + self.lower = lower + self.upper = upper + self.step = step + + def evaluate(self) -> slice: + raise NotImplementedError + + +class StarredExpr(Expression): + kind: ClassVar[str] = "starred" + value: Expression + + def __init__(self, value: Expression) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> Any: + raise NotImplementedError + + +class WalrusExpr(Expression): + kind: ClassVar[str] = "walrus" + target: NameExpr + value: Expression + + def __init__(self, target: NameExpr, value: 
Expression) -> None: + super().__init__() + self.target = target + self.value = value + + def evaluate(self) -> Any: + raise NotImplementedError + + +class MatchStatement(Statement): + kind: ClassVar[str] = "match" + subject: Expression + cases: List[MatchCase] + + def __init__(self, subject: Expression) -> None: + super().__init__() + self.subject = subject + self.cases = [] + + def execute(self) -> None: + pass + + +class MatchCase(BaseNode): + kind: ClassVar[str] = "match_case" + pattern: Expression + guard: Optional[Expression] + body: List[Statement] + + def __init__( + self, + pattern: Expression, + guard: Optional[Expression] = None, + ) -> None: + super().__init__() + self.pattern = pattern + self.guard = guard + self.body = [] + + +# --- Generic containers --- + + +class Container(Generic[_T]): + """A generic container with multiple operations.""" + + _items: List[_T] + _capacity: int + _name: str + + def __init__(self, name: str, capacity: int = 100) -> None: + self._items = [] + self._capacity = capacity + self._name = name + + def add(self, item: _T) -> bool: + if len(self._items) >= self._capacity: + return False + self._items.append(item) + return True + + def remove(self, item: _T) -> bool: + try: + self._items.remove(item) + return True + except ValueError: + return False + + def get(self, index: int) -> _T: + return self._items[index] + + def __len__(self) -> int: + return len(self._items) + + def __iter__(self) -> Iterator[_T]: + return iter(self._items) + + def __contains__(self, item: _T) -> bool: + return item in self._items + + def clear(self) -> None: + self._items.clear() + + def sort(self: Container[_S]) -> None: + self._items.sort() + + @property + def capacity(self) -> int: + return self._capacity + + @property + def is_full(self) -> bool: + return len(self._items) >= self._capacity + + @property + def is_empty(self) -> bool: + return len(self._items) == 0 + + +class OrderedContainer(Container[_T]): + """Container that maintains insertion 
order with index access.""" + + _index_map: Dict[int, _T] + + def __init__(self, name: str, capacity: int = 100) -> None: + super().__init__(name, capacity) + self._index_map = {} + + def add(self, item: _T) -> bool: + result = super().add(item) + if result: + self._index_map[len(self._items) - 1] = item + return result + + def get_by_index(self, index: int) -> Optional[_T]: + return self._index_map.get(index) + + +class MappedContainer(Generic[_KT, _VT]): + """A dictionary-like container.""" + + _store: Dict[_KT, _VT] + _max_size: int + + def __init__(self, max_size: int = 1000) -> None: + self._store = {} + self._max_size = max_size + + def put(self, key: _KT, value: _VT) -> bool: + if len(self._store) >= self._max_size and key not in self._store: + return False + self._store[key] = value + return True + + def get(self, key: _KT, default: Optional[_VT] = None) -> Optional[_VT]: + return self._store.get(key, default) + + def remove(self, key: _KT) -> Optional[_VT]: + return self._store.pop(key, None) + + def keys(self) -> Set[_KT]: + return set(self._store.keys()) + + def values(self) -> List[_VT]: + return list(self._store.values()) + + def items(self) -> List[Tuple[_KT, _VT]]: + return list(self._store.items()) + + def __len__(self) -> int: + return len(self._store) + + def __contains__(self, key: _KT) -> bool: + return key in self._store + + +# --- Overloaded functions --- + + +@overload +def process(value: int) -> str: ... +@overload +def process(value: str) -> int: ... +@overload +def process(value: bytes) -> List[int]: ... +@overload +def process(value: List[int]) -> bytes: ... +@overload +def process(value: Dict[str, Any]) -> List[Tuple[str, Any]]: ... 
+ + +def process( + value: Union[int, str, bytes, List[int], Dict[str, Any]], +) -> Union[str, int, List[int], bytes, List[Tuple[str, Any]]]: + if isinstance(value, int): + return str(value) + elif isinstance(value, str): + return len(value) + elif isinstance(value, bytes): + return list(value) + elif isinstance(value, list): + return bytes(value) + else: + return list(value.items()) + + +@overload +def convert(src: str, target: type[int]) -> int: ... +@overload +def convert(src: str, target: type[float]) -> float: ... +@overload +def convert(src: str, target: type[bool]) -> bool: ... +@overload +def convert(src: str, target: type[bytes]) -> bytes: ... + + +def convert( + src: str, + target: Union[type[int], type[float], type[bool], type[bytes]], +) -> Union[int, float, bool, bytes]: + return target(src) # type: ignore + + +# --- Protocol examples --- + + +@runtime_checkable +class Serializable(Protocol): + def serialize(self) -> bytes: ... + def deserialize(self, data: bytes) -> None: ... + + +@runtime_checkable +class Comparable(Protocol[_T_co]): + def compare_to(self, other: _T_co) -> int: ... + + +class Hashable(Protocol): + def __hash__(self) -> int: ... + def __eq__(self, other: object) -> bool: ... + + +class Sizeable(Protocol): + def __len__(self) -> int: ... + def __sizeof__(self) -> int: ... + + +class Printable(Protocol): + def __str__(self) -> str: ... + def __repr__(self) -> str: ... 
+ + +# --- Complex type annotations --- + + +ConfigValue = Union[str, int, float, bool, None, List["ConfigValue"], Dict[str, "ConfigValue"]] + +NestedDict = Dict[str, Union[str, int, Dict[str, Union[str, int, Dict[str, Any]]]]] + +CallbackType = typing.Callable[[str, int, Optional[Dict[str, Any]]], bool] + +EventHandler = typing.Callable[..., Optional[bool]] + +TreeNode = Union[ + "LeafNode", + "BranchNode", + Tuple["TreeNode", "TreeNode"], +] + + +class LeafNode: + value: Any + + def __init__(self, value: Any) -> None: + self.value = value + + +class BranchNode: + children: List[TreeNode] + label: str + + def __init__(self, label: str) -> None: + self.children = [] + self.label = label + + +# --- Large function set (simulating stdlib coverage) --- + + +def compute_checksum(data: bytes, algorithm: str = "crc32") -> int: + """Compute a checksum of the given data.""" + if algorithm == "crc32": + result = 0 + for byte in data: + result = (result >> 8) ^ byte + return result & 0xFFFFFFFF + elif algorithm == "simple": + return sum(data) & 0xFFFFFFFF + else: + raise ValueError(f"Unknown algorithm: {algorithm}") + + +def format_bytes(size: int) -> str: + """Format a byte count as a human-readable string.""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if abs(size) < 1024.0: + return f"{size:.1f} {unit}" + size = int(size / 1024) + return f"{size:.1f} PB" + + +def parse_version(version_str: str) -> Tuple[int, int, int]: + """Parse a version string like '1.2.3' into a tuple.""" + parts = version_str.split(".") + if len(parts) != 3: + raise ValueError(f"Invalid version: {version_str}") + return (int(parts[0]), int(parts[1]), int(parts[2])) + + +def merge_dicts( + *dicts: Dict[str, Any], + deep: bool = False, +) -> Dict[str, Any]: + """Merge multiple dictionaries.""" + result: Dict[str, Any] = {} + for d in dicts: + if deep: + for key, value in d.items(): + if ( + key in result + and isinstance(result[key], dict) + and isinstance(value, dict) + ): + result[key] = 
merge_dicts(result[key], value, deep=True) + else: + result[key] = value + else: + result.update(d) + return result + + +def flatten_list(nested: List[Any], max_depth: int = -1) -> List[Any]: + """Flatten a nested list up to max_depth levels.""" + result: List[Any] = [] + for item in nested: + if isinstance(item, list) and max_depth != 0: + result.extend(flatten_list(item, max_depth - 1)) + else: + result.append(item) + return result + + +def chunk_list(lst: List[_T], size: int) -> List[List[_T]]: + """Split a list into chunks of the given size.""" + return [lst[i : i + size] for i in range(0, len(lst), size)] + + +def deduplicate(items: Sequence[_T]) -> List[_T]: + """Remove duplicates while preserving order.""" + seen: Set[Any] = set() + result: List[_T] = [] + for item in items: + key = id(item) if not isinstance(item, (str, int, float, bool, bytes)) else item + if key not in seen: + seen.add(key) + result.append(item) + return result + + +def retry( + func: typing.Callable[[], _T], + max_attempts: int = 3, + delay: float = 1.0, + backoff: float = 2.0, + exceptions: Tuple[type, ...] 
= (Exception,), +) -> _T: + """Retry a function with exponential backoff.""" + last_exception: Optional[Exception] = None + current_delay = delay + for attempt in range(max_attempts): + try: + return func() + except exceptions as e: + last_exception = e + if attempt < max_attempts - 1: + current_delay *= backoff + raise last_exception # type: ignore + + +def memoize(func: typing.Callable[..., _T]) -> typing.Callable[..., _T]: + """Simple memoization decorator.""" + cache: Dict[str, _T] = {} + + def wrapper(*args: Any, **kwargs: Any) -> _T: + key = str((args, sorted(kwargs.items()))) + if key not in cache: + cache[key] = func(*args, **kwargs) + return cache[key] + + return wrapper + + +# --- More node types to add bulk --- + + +class YieldExpr(Expression): + kind: ClassVar[str] = "yield" + value: Optional[Expression] + + def __init__(self, value: Optional[Expression] = None) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> Any: + raise NotImplementedError + + +class YieldFromExpr(Expression): + kind: ClassVar[str] = "yield_from" + value: Expression + + def __init__(self, value: Expression) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> Any: + raise NotImplementedError + + +class AwaitExpr(Expression): + kind: ClassVar[str] = "await" + value: Expression + + def __init__(self, value: Expression) -> None: + super().__init__() + self.value = value + + def evaluate(self) -> Any: + raise NotImplementedError + + +class FormattedValue(Expression): + kind: ClassVar[str] = "formatted_value" + value: Expression + conversion: Optional[str] + format_spec: Optional[Expression] + + def __init__( + self, + value: Expression, + conversion: Optional[str] = None, + format_spec: Optional[Expression] = None, + ) -> None: + super().__init__() + self.value = value + self.conversion = conversion + self.format_spec = format_spec + + def evaluate(self) -> str: + raise NotImplementedError + + +class JoinedStr(Expression): + 
"""Represents an f-string.""" + + kind: ClassVar[str] = "fstring" + values: List[Expression] + + def __init__(self, values: Optional[List[Expression]] = None) -> None: + super().__init__() + self.values = values or [] + + def evaluate(self) -> str: + raise NotImplementedError + + +class TypeAlias(Statement): + kind: ClassVar[str] = "type_alias" + name: str + type_params: List[Expression] + value: Expression + + def __init__( + self, + name: str, + value: Expression, + type_params: Optional[List[Expression]] = None, + ) -> None: + super().__init__() + self.name = name + self.value = value + self.type_params = type_params or [] + + def execute(self) -> None: + pass + + +class GlobalStatement(Statement): + kind: ClassVar[str] = "global" + names: List[str] + + def __init__(self, names: List[str]) -> None: + super().__init__() + self.names = names + + def execute(self) -> None: + pass + + +class NonlocalStatement(Statement): + kind: ClassVar[str] = "nonlocal" + names: List[str] + + def __init__(self, names: List[str]) -> None: + super().__init__() + self.names = names + + def execute(self) -> None: + pass + + +class DeleteStatement(Statement): + kind: ClassVar[str] = "del" + targets: List[Expression] + + def __init__(self, targets: List[Expression]) -> None: + super().__init__() + self.targets = targets + + def execute(self) -> None: + pass + + +class PassStatement(Statement): + kind: ClassVar[str] = "pass" + + def execute(self) -> None: + pass + + +class BreakStatement(Statement): + kind: ClassVar[str] = "break" + + def execute(self) -> None: + pass + + +class ContinueStatement(Statement): + kind: ClassVar[str] = "continue" + + def execute(self) -> None: + pass + + +# --- Visitor pattern --- + + +class NodeVisitor(Generic[_T]): + """AST node visitor with generic return type.""" + + def visit(self, node: BaseNode) -> _T: + method_name = f"visit_{node.kind}" + visitor = getattr(self, method_name, self.generic_visit) + return visitor(node) + + def generic_visit(self, 
node: BaseNode) -> _T: + raise NotImplementedError(f"No visitor for {node.kind}") + + def visit_module(self, node: Module) -> _T: + return self.generic_visit(node) + + def visit_funcdef(self, node: FunctionDef) -> _T: + return self.generic_visit(node) + + def visit_classdef(self, node: ClassDef) -> _T: + return self.generic_visit(node) + + def visit_import(self, node: ImportStatement) -> _T: + return self.generic_visit(node) + + def visit_assign(self, node: AssignStatement) -> _T: + return self.generic_visit(node) + + def visit_return(self, node: ReturnStatement) -> _T: + return self.generic_visit(node) + + def visit_if(self, node: IfStatement) -> _T: + return self.generic_visit(node) + + def visit_for(self, node: ForStatement) -> _T: + return self.generic_visit(node) + + def visit_while(self, node: WhileStatement) -> _T: + return self.generic_visit(node) + + def visit_try(self, node: TryStatement) -> _T: + return self.generic_visit(node) + + def visit_with(self, node: WithStatement) -> _T: + return self.generic_visit(node) + + def visit_raise(self, node: RaiseStatement) -> _T: + return self.generic_visit(node) + + def visit_assert(self, node: AssertStatement) -> _T: + return self.generic_visit(node) + + def visit_expression(self, node: Expression) -> _T: + return self.generic_visit(node) + + def visit_name(self, node: NameExpr) -> _T: + return self.visit_expression(node) + + def visit_number(self, node: NumberLiteral) -> _T: + return self.visit_expression(node) + + def visit_string(self, node: StringLiteral) -> _T: + return self.visit_expression(node) + + def visit_bool(self, node: BoolLiteral) -> _T: + return self.visit_expression(node) + + def visit_none(self, node: NoneLiteral) -> _T: + return self.visit_expression(node) + + def visit_ellipsis(self, node: EllipsisLiteral) -> _T: + return self.visit_expression(node) + + def visit_binop(self, node: BinaryOp) -> _T: + return self.visit_expression(node) + + def visit_unaryop(self, node: UnaryOp) -> _T: + return 
self.visit_expression(node) + + def visit_compare(self, node: CompareExpr) -> _T: + return self.visit_expression(node) + + def visit_call(self, node: CallExpr) -> _T: + return self.visit_expression(node) + + def visit_attribute(self, node: AttributeExpr) -> _T: + return self.visit_expression(node) + + def visit_subscript(self, node: SubscriptExpr) -> _T: + return self.visit_expression(node) + + def visit_list(self, node: ListExpr) -> _T: + return self.visit_expression(node) + + def visit_dict(self, node: DictExpr) -> _T: + return self.visit_expression(node) + + def visit_set(self, node: SetExpr) -> _T: + return self.visit_expression(node) + + def visit_tuple(self, node: TupleExpr) -> _T: + return self.visit_expression(node) + + def visit_lambda(self, node: LambdaExpr) -> _T: + return self.visit_expression(node) + + def visit_listcomp(self, node: ListCompExpr) -> _T: + return self.visit_expression(node) + + def visit_dictcomp(self, node: DictCompExpr) -> _T: + return self.visit_expression(node) + + def visit_setcomp(self, node: SetCompExpr) -> _T: + return self.visit_expression(node) + + def visit_genexpr(self, node: GeneratorExpr) -> _T: + return self.visit_expression(node) + + def visit_conditional(self, node: ConditionalExpr) -> _T: + return self.visit_expression(node) + + def visit_slice(self, node: SliceExpr) -> _T: + return self.visit_expression(node) + + def visit_starred(self, node: StarredExpr) -> _T: + return self.visit_expression(node) + + def visit_walrus(self, node: WalrusExpr) -> _T: + return self.visit_expression(node) + + def visit_match(self, node: MatchStatement) -> _T: + return self.generic_visit(node) + + def visit_yield(self, node: YieldExpr) -> _T: + return self.visit_expression(node) + + def visit_yield_from(self, node: YieldFromExpr) -> _T: + return self.visit_expression(node) + + def visit_await(self, node: AwaitExpr) -> _T: + return self.visit_expression(node) + + def visit_fstring(self, node: JoinedStr) -> _T: + return 
self.visit_expression(node) + + +# --- Transformer subclass --- + + +class NodeTransformer(NodeVisitor[BaseNode]): + """Visitor that returns transformed nodes.""" + + def generic_visit(self, node: BaseNode) -> BaseNode: + return node + + +# --- Registry pattern --- + + +class NodeRegistry: + """Registry of node factories.""" + + _factories: Dict[str, typing.Callable[..., BaseNode]] + + def __init__(self) -> None: + self._factories = {} + + def register( + self, kind: str + ) -> typing.Callable[ + [typing.Callable[..., BaseNode]], typing.Callable[..., BaseNode] + ]: + def decorator( + factory: typing.Callable[..., BaseNode], + ) -> typing.Callable[..., BaseNode]: + self._factories[kind] = factory + return factory + + return decorator + + def create(self, kind: str, **kwargs: Any) -> BaseNode: + factory = self._factories.get(kind) + if factory is None: + raise KeyError(f"No factory registered for kind: {kind}") + return factory(**kwargs) + + def kinds(self) -> List[str]: + return list(self._factories.keys()) + + +# --- Utility constants --- + +MAX_RECURSION_DEPTH: Final[int] = 256 +DEFAULT_INDENT: Final[str] = " " +BUILTIN_TYPES: Final[Tuple[str, ...]] = ( + "int", + "float", + "complex", + "bool", + "str", + "bytes", + "bytearray", + "memoryview", + "list", + "tuple", + "dict", + "set", + "frozenset", + "range", + "slice", + "type", + "object", + "None", +) + +COMPARISON_OPS: Final[Tuple[str, ...]] = ( + "==", + "!=", + "<", + "<=", + ">", + ">=", + "is", + "is not", + "in", + "not in", +) + +BOOLEAN_OPS: Final[Tuple[str, ...]] = ("and", "or") + +UNARY_OPS: Final[Tuple[str, ...]] = ("+", "-", "~", "not") + +BINARY_OPS: Final[Tuple[str, ...]] = ( + "+", + "-", + "*", + "/", + "//", + "%", + "**", + "<<", + ">>", + "|", + "^", + "&", + "@", +) + +AUGMENTED_ASSIGN_OPS: Final[Tuple[str, ...]] = ( + "+=", + "-=", + "*=", + "/=", + "//=", + "%=", + "**=", + "<<=", + ">>=", + "|=", + "^=", + "&=", + "@=", +) + + +# --- Large function set to add line count --- + + +def 
validate_identifier(name: str) -> bool: + """Check if a string is a valid Python identifier.""" + if not name: + return False + if name[0].isdigit(): + return False + return all(c.isalnum() or c == "_" for c in name) + + +def escape_string(s: str, quote: str = '"') -> str: + """Escape a string for Python source output.""" + result = s.replace("\\", "\\\\") + result = result.replace(quote, "\\" + quote) + result = result.replace("\n", "\\n") + result = result.replace("\r", "\\r") + result = result.replace("\t", "\\t") + return f"{quote}{result}{quote}" + + +def indent_code(code: str, level: int = 1, indent: str = DEFAULT_INDENT) -> str: + """Indent each line of code by the given level.""" + prefix = indent * level + lines = code.split("\n") + return "\n".join(prefix + line if line.strip() else line for line in lines) + + +def strip_comments(source: str) -> str: + """Remove line comments from Python source code (naive).""" + lines = source.split("\n") + result: List[str] = [] + for line in lines: + in_string = False + quote_char = "" + comment_start = -1 + i = 0 + while i < len(line): + ch = line[i] + if in_string: + if ch == "\\" and i + 1 < len(line): + i += 2 + continue + if ch == quote_char: + in_string = False + else: + if ch in ('"', "'"): + in_string = True + quote_char = ch + elif ch == "#": + comment_start = i + break + i += 1 + if comment_start >= 0: + result.append(line[:comment_start].rstrip()) + else: + result.append(line) + return "\n".join(result) + + +def count_lines(source: str) -> Dict[str, int]: + """Count types of lines in a source file.""" + lines = source.split("\n") + total = len(lines) + blank = sum(1 for l in lines if not l.strip()) + comment = sum(1 for l in lines if l.strip().startswith("#")) + code = total - blank - comment + return { + "total": total, + "blank": blank, + "comment": comment, + "code": code, + } + + +def find_all_names(source: str) -> List[str]: + """Find all potential identifiers in source (naive regex-free scan).""" + 
names: List[str] = [] + current = "" + for ch in source: + if ch.isalnum() or ch == "_": + current += ch + else: + if current and not current[0].isdigit(): + names.append(current) + current = "" + if current and not current[0].isdigit(): + names.append(current) + return deduplicate(names) + + +def build_scope_chain(node: BaseNode) -> List[str]: + """Build a list of enclosing scope names for a given node.""" + chain: List[str] = [] + current: Optional[BaseNode] = node + while current is not None: + if isinstance(current, (FunctionDef, ClassDef)): + chain.append(current.name) + elif isinstance(current, Module): + chain.append(current.name) + current = current._parent + chain.reverse() + return chain + + +def compute_complexity(node: BaseNode) -> int: + """Compute a naive cyclomatic complexity for a node.""" + complexity = 1 + for child in node.walk(): + if isinstance(child, (IfStatement, ForStatement, WhileStatement)): + complexity += 1 + elif isinstance(child, TryStatement): + complexity += len(child.handlers) + elif isinstance(child, (BinaryOp,)) and child.op in BOOLEAN_OPS: + complexity += 1 + return complexity + + +# --- Type alias collection --- + +JsonPrimitive = Union[str, int, float, bool, None] +JsonArray = List["JsonValue"] +JsonObject = Dict[str, "JsonValue"] +JsonValue = Union[JsonPrimitive, JsonArray, JsonObject] + +HttpMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"] +StatusCode = Literal[200, 201, 204, 301, 302, 400, 401, 403, 404, 500, 502, 503] + +Color = Tuple[int, int, int] +ColorWithAlpha = Tuple[int, int, int, float] +AnyColor = Union[Color, ColorWithAlpha, str] + +Point2D = Tuple[float, float] +Point3D = Tuple[float, float, float] +BoundingBox = Tuple[Point2D, Point2D] +BoundingBox3D = Tuple[Point3D, Point3D] + +Matrix = List[List[float]] +SparseMatrix = Dict[Tuple[int, int], float] + +PathLike = Union[str, os.PathLike[str]] + +Callback = typing.Callable[[], None] +ErrorHandler = typing.Callable[[Exception], bool] 
+Predicate = typing.Callable[[Any], bool] +Comparator = typing.Callable[[Any, Any], int] +Transformer = typing.Callable[[_T], _T] + +# End of large_stdlib.py diff --git a/packages/pyright-internal/src/tests/benchmarkData/repetitive_identifiers.py b/packages/pyright-internal/src/tests/benchmarkData/repetitive_identifiers.py new file mode 100644 index 000000000000..36a22fbe4c16 --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/repetitive_identifiers.py @@ -0,0 +1,233 @@ +# repetitive_identifiers.py — stresses the tokenizer's identifier intern +# cache by using a small set of identifiers (self, cls, T, K, V, str, int, +# list, dict, None, True, False, etc.) thousands of times. Tokenizing this +# file should hit the identifier intern cache on the vast majority of +# identifier tokens. + +from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union + +T = TypeVar("T") +K = TypeVar("K") +V = TypeVar("V") + + +class C1(Generic[T, K, V]): + def __init__(self, x: T, y: K, z: V) -> None: + self.x = x + self.y = y + self.z = z + + def get_x(self) -> T: + return self.x + + def get_y(self) -> K: + return self.y + + def get_z(self) -> V: + return self.z + + def set_x(self, x: T) -> None: + self.x = x + + def set_y(self, y: K) -> None: + self.y = y + + def set_z(self, z: V) -> None: + self.z = z + + def swap(self, other: "C1[T, K, V]") -> None: + self.x, other.x = other.x, self.x + self.y, other.y = other.y, self.y + self.z, other.z = other.z, self.z + + @classmethod + def make(cls, x: T, y: K, z: V) -> "C1[T, K, V]": + return cls(x, y, z) + + @classmethod + def pair(cls, x: T, y: K, z: V) -> Tuple["C1[T, K, V]", "C1[T, K, V]"]: + return cls(x, y, z), cls(x, y, z) + + +class C2(Generic[T, K, V]): + def __init__(self, x: T, y: K, z: V) -> None: + self.x = x + self.y = y + self.z = z + + def get_x(self) -> T: + return self.x + + def get_y(self) -> K: + return self.y + + def get_z(self) -> V: + return self.z + + def set_x(self, x: T) -> None: + 
self.x = x + + def set_y(self, y: K) -> None: + self.y = y + + def set_z(self, z: V) -> None: + self.z = z + + @classmethod + def make(cls, x: T, y: K, z: V) -> "C2[T, K, V]": + return cls(x, y, z) + + +def f1(x: int, y: int, z: int) -> int: + return x + y + z + + +def f2(x: int, y: int, z: int) -> int: + return x + y + z + + +def f3(x: int, y: int, z: int) -> int: + return x + y + z + + +def f4(x: int, y: int, z: int) -> int: + return x + y + z + + +def f5(x: int, y: int, z: int) -> int: + return x + y + z + + +def build_list(x: int, y: int, z: int) -> List[int]: + return [x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y, z] + + +def build_dict(x: str, y: str, z: str) -> Dict[str, str]: + return {"x": x, "y": y, "z": z, "a": x, "b": y, "c": z, "d": x, "e": y, "f": z} + + +def build_tuple(x: int, y: int, z: int) -> Tuple[int, int, int, int, int, int]: + return (x, y, z, x, y, z) + + +def deep(x: int, y: int, z: int) -> Optional[int]: + if x is None: + return None + if y is None: + return None + if z is None: + return None + if x == 0: + return x + if y == 0: + return y + if z == 0: + return z + return x + y + z + + +def union_of(x: Union[int, str], y: Union[int, str], z: Union[int, str]) -> Union[int, str]: + if isinstance(x, int) and isinstance(y, int) and isinstance(z, int): + return x + y + z + return str(x) + str(y) + str(z) + + +def any_of(x: Any, y: Any, z: Any) -> Any: + return x or y or z or x or y or z or x or y or z + + +# Lots of calls, each one touches the same identifiers repeatedly. 
+_ = f1(1, 2, 3) +_ = f2(1, 2, 3) +_ = f3(1, 2, 3) +_ = f4(1, 2, 3) +_ = f5(1, 2, 3) +_ = f1(1, 2, 3) +_ = f2(1, 2, 3) +_ = f3(1, 2, 3) +_ = f4(1, 2, 3) +_ = f5(1, 2, 3) +_ = f1(1, 2, 3) +_ = f2(1, 2, 3) +_ = f3(1, 2, 3) +_ = f4(1, 2, 3) +_ = f5(1, 2, 3) + +_ = build_list(1, 2, 3) +_ = build_list(1, 2, 3) +_ = build_list(1, 2, 3) +_ = build_list(1, 2, 3) +_ = build_list(1, 2, 3) + +_ = build_dict("a", "b", "c") +_ = build_dict("a", "b", "c") +_ = build_dict("a", "b", "c") +_ = build_dict("a", "b", "c") +_ = build_dict("a", "b", "c") + +_ = build_tuple(1, 2, 3) +_ = build_tuple(1, 2, 3) +_ = build_tuple(1, 2, 3) +_ = build_tuple(1, 2, 3) +_ = build_tuple(1, 2, 3) + +_ = deep(1, 2, 3) +_ = deep(1, 2, 3) +_ = deep(1, 2, 3) +_ = deep(1, 2, 3) +_ = deep(1, 2, 3) + +_ = union_of(1, 2, 3) +_ = union_of(1, 2, 3) +_ = union_of(1, 2, 3) +_ = union_of(1, 2, 3) +_ = union_of(1, 2, 3) + +_ = any_of(1, 2, 3) +_ = any_of(1, 2, 3) +_ = any_of(1, 2, 3) +_ = any_of(1, 2, 3) +_ = any_of(1, 2, 3) + +c1 = C1(1, "a", [1, 2, 3]) +c2 = C1(1, "a", [1, 2, 3]) +c3 = C1(1, "a", [1, 2, 3]) +c4 = C1(1, "a", [1, 2, 3]) +c5 = C1(1, "a", [1, 2, 3]) +c6 = C2(1, "a", [1, 2, 3]) +c7 = C2(1, "a", [1, 2, 3]) +c8 = C2(1, "a", [1, 2, 3]) +c9 = C2(1, "a", [1, 2, 3]) +c10 = C2(1, "a", [1, 2, 3]) + +# Flat attribute-access cascade — each line references self-like receivers +# multiple times, producing many repeated identifier tokens per line. +r1 = c1.get_x() + c2.get_x() + c3.get_x() + c4.get_x() + c5.get_x() +r2 = c1.get_y() + c2.get_y() + c3.get_y() + c4.get_y() + c5.get_y() +r3 = c1.get_z() + c2.get_z() + c3.get_z() + c4.get_z() + c5.get_z() +r4 = c6.get_x() + c7.get_x() + c8.get_x() + c9.get_x() + c10.get_x() +r5 = c6.get_y() + c7.get_y() + c8.get_y() + c9.get_y() + c10.get_y() +r6 = c6.get_z() + c7.get_z() + c8.get_z() + c9.get_z() + c10.get_z() + +# A batch of nearly-identical lines to really hammer the intern cache. 
+v1 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v2 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v3 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v4 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v5 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v6 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v7 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v8 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v9 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] +v10 = [x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9] if x > 0 and x < 10 and x != 5] + +w1 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w2 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w3 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w4 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w5 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w6 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w7 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w8 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w9 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} +w10 = {k: v for k, v in [("a", 1), ("b", 2), ("c", 3)] if v > 0 and k != "x"} diff --git a/packages/pyright-internal/src/tests/benchmarkData/union_heavy.py b/packages/pyright-internal/src/tests/benchmarkData/union_heavy.py new file mode 100644 index 000000000000..0f5f0602e54e --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarkData/union_heavy.py @@ -0,0 +1,525 @@ +# union_heavy.py — complex union/intersection type scenarios +# Stresses the type evaluator's union handling, narrowing, and type 
guard paths. + +from __future__ import annotations + +from typing import ( + Any, + Dict, + Generic, + List, + Literal, + Never, + Optional, + Protocol, + Sequence, + Tuple, + TypeAlias, + TypeGuard, + TypeVar, + Union, + overload, + runtime_checkable, +) +from dataclasses import dataclass + +_T = TypeVar("_T") + +# --- Large literal unions --- + +HttpStatus: TypeAlias = Literal[ + 100, 101, 102, 103, + 200, 201, 202, 203, 204, 205, 206, 207, 208, 226, + 300, 301, 302, 303, 304, 305, 307, 308, + 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, + 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, + 422, 423, 424, 425, 426, 428, 429, 431, 451, + 500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511, +] + +Color: TypeAlias = Literal[ + "red", "green", "blue", "yellow", "cyan", "magenta", + "white", "black", "gray", "grey", "orange", "purple", + "pink", "brown", "gold", "silver", "navy", "teal", + "maroon", "olive", "lime", "aqua", "coral", "salmon", + "crimson", "indigo", "violet", "turquoise", "khaki", + "orchid", "plum", "sienna", "tomato", "wheat", +] + +Country: TypeAlias = Literal[ + "US", "UK", "CA", "AU", "NZ", "IE", "DE", "FR", "ES", "IT", + "PT", "NL", "BE", "CH", "AT", "SE", "NO", "DK", "FI", "PL", + "CZ", "SK", "HU", "RO", "BG", "HR", "SI", "EE", "LV", "LT", + "JP", "KR", "CN", "TW", "HK", "SG", "MY", "TH", "VN", "PH", + "IN", "PK", "BD", "LK", "NP", "ID", "BR", "AR", "CL", "CO", + "MX", "PE", "VE", "EC", "UY", "PY", "BO", "ZA", "NG", "KE", + "EG", "MA", "TN", "GH", "ET", "TZ", "UG", "RW", "SN", "CI", +] + +# --- Discriminated unions --- + +@dataclass +class Circle: + kind: Literal["circle"] = "circle" + radius: float = 1.0 + + +@dataclass +class Rectangle: + kind: Literal["rectangle"] = "rectangle" + width: float = 1.0 + height: float = 1.0 + + +@dataclass +class Triangle: + kind: Literal["triangle"] = "triangle" + base: float = 1.0 + height: float = 1.0 + + +@dataclass +class Polygon: + kind: Literal["polygon"] = "polygon" + sides: int = 3 + side_length: 
float = 1.0 + + +@dataclass +class Ellipse: + kind: Literal["ellipse"] = "ellipse" + semi_major: float = 2.0 + semi_minor: float = 1.0 + + +Shape = Union[Circle, Rectangle, Triangle, Polygon, Ellipse] + + +def area(shape: Shape) -> float: + if shape.kind == "circle": + return 3.14159 * shape.radius ** 2 + elif shape.kind == "rectangle": + return shape.width * shape.height + elif shape.kind == "triangle": + return 0.5 * shape.base * shape.height + elif shape.kind == "polygon": + import math + return (shape.sides * shape.side_length ** 2) / (4 * math.tan(math.pi / shape.sides)) + elif shape.kind == "ellipse": + return 3.14159 * shape.semi_major * shape.semi_minor + else: + _: Never = shape + raise ValueError(f"Unknown shape: {shape}") + + +def perimeter(shape: Shape) -> float: + if shape.kind == "circle": + return 2 * 3.14159 * shape.radius + elif shape.kind == "rectangle": + return 2 * (shape.width + shape.height) + elif shape.kind == "triangle": + return shape.base * 3 + elif shape.kind == "polygon": + return shape.sides * shape.side_length + elif shape.kind == "ellipse": + import math + a = shape.semi_major + b = shape.semi_minor + return 3.14159 * (3 * (a + b) - math.sqrt((3 * a + b) * (a + 3 * b))) + else: + _: Never = shape + raise ValueError + + +# --- Nested unions --- + +JsonPrimitive = Union[str, int, float, bool, None] +JsonArray = List["JsonValue"] +JsonObject = Dict[str, "JsonValue"] +JsonValue = Union[JsonPrimitive, JsonArray, JsonObject] + + +def json_depth(value: JsonValue) -> int: + if isinstance(value, dict): + if not value: + return 1 + return 1 + max(json_depth(v) for v in value.values()) + elif isinstance(value, list): + if not value: + return 1 + return 1 + max(json_depth(v) for v in value) + else: + return 0 + + +def json_size(value: JsonValue) -> int: + if isinstance(value, dict): + return sum(json_size(v) for v in value.values()) + len(value) + elif isinstance(value, list): + return sum(json_size(v) for v in value) + len(value) + elif 
isinstance(value, str): + return len(value) + elif value is None: + return 0 + else: + return 1 + + +# --- Union narrowing stress --- + +def narrow_union_1(x: Union[int, str, float, bool, bytes, None]) -> str: + if isinstance(x, int): + return f"int: {x}" + elif isinstance(x, str): + return f"str: {x}" + elif isinstance(x, float): + return f"float: {x}" + elif isinstance(x, bool): + return f"bool: {x}" + elif isinstance(x, bytes): + return f"bytes: {x!r}" + elif x is None: + return "none" + else: + _: Never = x + return "unreachable" + + +def narrow_union_2( + x: Union[int, str, List[int], Dict[str, int], Tuple[int, ...], set, frozenset], +) -> int: + if isinstance(x, int): + return x + elif isinstance(x, str): + return len(x) + elif isinstance(x, list): + return sum(x) + elif isinstance(x, dict): + return sum(x.values()) + elif isinstance(x, tuple): + return sum(x) + elif isinstance(x, set): + return len(x) + elif isinstance(x, frozenset): + return len(x) + else: + _: Never = x + raise ValueError + + +def narrow_union_chained( + x: Union[int, str, float, bytes, list, dict, tuple, set, frozenset, None], +) -> str: + if x is None: + return "None" + if isinstance(x, (int, float)): + return f"number: {x}" + if isinstance(x, (str, bytes)): + return f"text: {x!r}" + if isinstance(x, (list, tuple)): + return f"sequence: len={len(x)}" + if isinstance(x, (set, frozenset)): + return f"set: len={len(x)}" + if isinstance(x, dict): + return f"dict: keys={len(x)}" + _: Never = x + return "unreachable" + + +# --- Type guards --- + +def is_string_list(val: List[Any]) -> TypeGuard[List[str]]: + return all(isinstance(item, str) for item in val) + + +def is_int_dict(val: Dict[str, Any]) -> TypeGuard[Dict[str, int]]: + return all(isinstance(v, int) for v in val.values()) + + +def is_non_empty(val: Optional[List[_T]]) -> TypeGuard[List[_T]]: + return val is not None and len(val) > 0 + + +def is_positive_int(val: Union[int, str, None]) -> TypeGuard[int]: + return isinstance(val, int) 
and val > 0 + + +# --- Overloaded functions with union args --- + +@overload +def transform(value: int) -> str: ... +@overload +def transform(value: str) -> int: ... +@overload +def transform(value: float) -> bool: ... +@overload +def transform(value: bool) -> float: ... +@overload +def transform(value: bytes) -> List[int]: ... +@overload +def transform(value: List[int]) -> bytes: ... +@overload +def transform(value: None) -> Literal["none"]: ... + + +def transform( + value: Union[int, str, float, bool, bytes, List[int], None], +) -> Union[str, int, bool, float, List[int], bytes, Literal["none"]]: + if isinstance(value, bool): + return float(value) + elif isinstance(value, int): + return str(value) + elif isinstance(value, str): + return len(value) + elif isinstance(value, float): + return value > 0 + elif isinstance(value, bytes): + return list(value) + elif isinstance(value, list): + return bytes(value) + elif value is None: + return "none" + else: + raise TypeError + + +# --- Complex generic unions --- + +@dataclass +class Success(Generic[_T]): + value: _T + + +@dataclass +class Failure: + error: str + code: int = 0 + + +Result = Union[Success[_T], Failure] + + +def handle_result(r: Result[int]) -> str: + if isinstance(r, Success): + return f"OK: {r.value}" + else: + return f"ERR[{r.code}]: {r.error}" + + +def chain_results(results: List[Result[int]]) -> Result[List[int]]: + values: List[int] = [] + for r in results: + if isinstance(r, Failure): + return r + values.append(r.value) + return Success(values) + + +# --- Protocol unions --- + +@runtime_checkable +class Printable(Protocol): + def __str__(self) -> str: ... + +@runtime_checkable +class Measurable(Protocol): + def __len__(self) -> int: ... + +@runtime_checkable +class Numeric(Protocol): + def __add__(self, other: Any) -> Any: ... + def __mul__(self, other: Any) -> Any: ... 
+ + +def describe_value(val: Union[Printable, Measurable, Numeric]) -> str: + parts: List[str] = [] + if isinstance(val, Printable): + parts.append(f"str={val}") + if isinstance(val, Measurable): + parts.append(f"len={len(val)}") + return ", ".join(parts) if parts else "unknown" + + +# --- TypedDict unions --- + +from typing import TypedDict + + +class UserInfo(TypedDict): + name: str + age: int + email: str + + +class CompanyInfo(TypedDict): + name: str + employees: int + industry: str + + +class ProductInfo(TypedDict): + name: str + price: float + category: str + + +Entity = Union[UserInfo, CompanyInfo, ProductInfo] + + +def entity_name(entity: Entity) -> str: + return entity["name"] + + +def entity_summary(entity: Entity) -> str: + if "age" in entity: + e: UserInfo = entity # type: ignore + return f"User: {e['name']}, age {e['age']}" + elif "employees" in entity: + e2: CompanyInfo = entity # type: ignore + return f"Company: {e2['name']}, {e2['employees']} employees" + else: + e3: ProductInfo = entity # type: ignore + return f"Product: {e3['name']}, ${e3['price']}" + + +# --- Deep union chains --- + +Level0 = Union[int, str] +Level1 = Union[Level0, float, bool] +Level2 = Union[Level1, bytes, list] +Level3 = Union[Level2, dict, tuple] +Level4 = Union[Level3, set, frozenset] +Level5 = Union[Level4, complex, memoryview] + +DeepUnion = Level5 + + +def process_deep(val: DeepUnion) -> str: + if isinstance(val, int): + return "int" + elif isinstance(val, str): + return "str" + elif isinstance(val, float): + return "float" + elif isinstance(val, bool): + return "bool" + elif isinstance(val, bytes): + return "bytes" + elif isinstance(val, list): + return "list" + elif isinstance(val, dict): + return "dict" + elif isinstance(val, tuple): + return "tuple" + elif isinstance(val, set): + return "set" + elif isinstance(val, frozenset): + return "frozenset" + elif isinstance(val, complex): + return "complex" + elif isinstance(val, memoryview): + return "memoryview" + else: + 
return "unknown" + + +# --- Union of many dataclasses --- + +@dataclass +class EventA: + kind: Literal["a"] = "a" + payload: str = "" + +@dataclass +class EventB: + kind: Literal["b"] = "b" + count: int = 0 + +@dataclass +class EventC: + kind: Literal["c"] = "c" + flag: bool = False + +@dataclass +class EventD: + kind: Literal["d"] = "d" + value: float = 0.0 + +@dataclass +class EventE: + kind: Literal["e"] = "e" + items: List[str] = None # type: ignore + +@dataclass +class EventF: + kind: Literal["f"] = "f" + data: Dict[str, Any] = None # type: ignore + +@dataclass +class EventG: + kind: Literal["g"] = "g" + source: str = "" + +@dataclass +class EventH: + kind: Literal["h"] = "h" + target: str = "" + +@dataclass +class EventI: + kind: Literal["i"] = "i" + timestamp: float = 0.0 + +@dataclass +class EventJ: + kind: Literal["j"] = "j" + priority: int = 0 + +Event = Union[EventA, EventB, EventC, EventD, EventE, EventF, EventG, EventH, EventI, EventJ] + + +def dispatch_event(event: Event) -> str: + if event.kind == "a": + return f"A: {event.payload}" + elif event.kind == "b": + return f"B: {event.count}" + elif event.kind == "c": + return f"C: {event.flag}" + elif event.kind == "d": + return f"D: {event.value}" + elif event.kind == "e": + return f"E: {event.items}" + elif event.kind == "f": + return f"F: {event.data}" + elif event.kind == "g": + return f"G: {event.source}" + elif event.kind == "h": + return f"H: {event.target}" + elif event.kind == "i": + return f"I: {event.timestamp}" + elif event.kind == "j": + return f"J: {event.priority}" + else: + _: Never = event + raise ValueError + + +# --- Conditional types via overload --- + +@overload +def maybe_parse(raw: str, strict: Literal[True]) -> int: ... +@overload +def maybe_parse(raw: str, strict: Literal[False]) -> Optional[int]: ... +@overload +def maybe_parse(raw: str, strict: bool = ...) -> Optional[int]: ... 
+
+def maybe_parse(raw: str, strict: bool = False) -> Optional[int]:
+    try:
+        return int(raw)
+    except ValueError:
+        if strict:
+            raise
+        return None
+
+
+# End of union_heavy.py
diff --git a/packages/pyright-internal/src/tests/benchmarks/parserBenchmark.test.ts b/packages/pyright-internal/src/tests/benchmarks/parserBenchmark.test.ts
new file mode 100644
index 000000000000..2869777706cc
--- /dev/null
+++ b/packages/pyright-internal/src/tests/benchmarks/parserBenchmark.test.ts
@@ -0,0 +1,296 @@
+/*
+ * parserBenchmark.test.ts
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Microbenchmark for the Python parser.
+ * Measures nodes/sec, parse time, AST node count across representative corpora.
+ *
+ * Run with:
+ *   cd packages/pyright-internal
+ *   node node_modules\jest\bin\jest parserBenchmark.test --runInBand --detectOpenHandles --forceExit --testTimeout=300000
+ *
+ * Results are written as JSON to:
+ *   src/tests/benchmarks/.generated/benchmark-results/parser/
+ */
+
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+
+import { DiagnosticSink } from '../../common/diagnosticSink';
+import { ParseOptions, Parser } from '../../parser/parser';
+
+// --- Configuration ---
+
+const WARMUP_ITERATIONS = 3;
+const BENCHMARK_ITERATIONS = 10;
+
+const BENCHMARK_OUTPUT_DIR = path.join(__dirname, '.generated', 'benchmark-results', 'parser');
+
+// --- Types ---
+
+interface BenchmarkResult {
+    corpus: string;
+    fileSizeBytes: number;
+    iterations: number;
+    timesMs: number[];
+    medianMs: number;
+    p95Ms: number;
+    minMs: number;
+    maxMs: number;
+    avgMs: number;
+    nodeCount: number;
+    nodesPerSec: number;
+    statementCount: number;
+    errorCount: number;
+}
+
+interface BenchmarkReport {
+    timestamp: string;
+    system: {
+        platform: string;
+        arch: string;
+        cpus: string;
+        cpuCount: number;
+        totalMemoryMB: number;
+        nodeVersion: string;
+    };
+    config: {
+        warmupIterations: number;
+        benchmarkIterations: number;
+    };
+    results: BenchmarkResult[];
+}
+
+// --- Helpers ---
+
+function calculateStats(times: ReadonlyArray<number>): {
+    median: number;
+    p95: number;
+    min: number;
+    max: number;
+    avg: number;
+} {
+    const sorted = [...times].sort((a, b) => a - b);
+    const len = sorted.length;
+
+    const median = len % 2 === 0 ? (sorted[len / 2 - 1] + sorted[len / 2]) / 2 : sorted[Math.floor(len / 2)];
+    const p95Index = Math.ceil(len * 0.95) - 1;
+    const p95 = sorted[Math.min(p95Index, len - 1)];
+    const min = sorted[0];
+    const max = sorted[len - 1];
+    const avg = times.reduce((a, b) => a + b, 0) / len;
+
+    return { median, p95, min, max, avg };
+}
+
+function loadCorpus(filename: string): string {
+    const filePath = path.resolve(__dirname, '..', 'benchmarkData', filename);
+    return fs.readFileSync(filePath, 'utf-8');
+}
+
+function getSystemInfo(): BenchmarkReport['system'] {
+    const cpus = os.cpus();
+    return {
+        platform: os.platform(),
+        arch: os.arch(),
+        cpus: cpus[0]?.model ?? 'unknown',
+        cpuCount: cpus.length,
+        totalMemoryMB: Math.round(os.totalmem() / (1024 * 1024)),
+        nodeVersion: process.version,
+    };
+}
+
+function writeReport(report: BenchmarkReport): void {
+    fs.mkdirSync(BENCHMARK_OUTPUT_DIR, { recursive: true });
+    const filename = `parser-benchmark-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
+    const outputPath = path.join(BENCHMARK_OUTPUT_DIR, filename);
+    fs.writeFileSync(outputPath, JSON.stringify(report, undefined, 2), 'utf-8');
+    console.log(`\nBenchmark results written to: ${outputPath}`);
+}
+
+function printResultTable(results: ReadonlyArray<BenchmarkResult>): void {
+    console.log('\n=== Parser Benchmark Results ===\n');
+    console.log(
+        `${'Corpus'.padEnd(25)} ${'Size'.padStart(8)} ${'Nodes'.padStart(8)} ${'Stmts'.padStart(7)} ${'Errors'.padStart(
+            7
+        )} ${'Median'.padStart(10)} ${'Min'.padStart(10)} ${'Max'.padStart(10)} ${'Avg'.padStart(
+            10
+        )} ${'Nodes/s'.padStart(12)}`
+    );
+    console.log('-'.repeat(117));
+
+    for (const r of results) {
+        const sizeKB = 
`${(r.fileSizeBytes / 1024).toFixed(1)}KB`; + console.log( + `${r.corpus.padEnd(25)} ${sizeKB.padStart(8)} ${String(r.nodeCount).padStart(8)} ${String( + r.statementCount + ).padStart(7)} ${String(r.errorCount).padStart(7)} ${r.medianMs.toFixed(2).padStart(10)} ${r.minMs + .toFixed(2) + .padStart(10)} ${r.maxMs.toFixed(2).padStart(10)} ${r.avgMs.toFixed(2).padStart(10)} ${Math.round( + r.nodesPerSec + ) + .toLocaleString() + .padStart(12)}` + ); + } + console.log(''); +} + +/** + * Count all AST nodes by walking the tree recursively. + * Pyright parse nodes have: { nodeType, d: { ...children }, ... } + */ +function countNodes(node: any): number { + if (!node || typeof node !== 'object' || !('nodeType' in node)) { + return 0; + } + + let count = 1; + + // Walk the .d data bag where child nodes live + const data = node.d; + if (data && typeof data === 'object') { + for (const key of Object.keys(data)) { + const val = data[key]; + if (val && typeof val === 'object') { + if ('nodeType' in val) { + count += countNodes(val); + } else if (Array.isArray(val)) { + for (const item of val) { + if (item && typeof item === 'object' && 'nodeType' in item) { + count += countNodes(item); + } + } + } + } + } + } + + return count; +} + +function benchmarkParse(corpusName: string, code: string): BenchmarkResult { + const times: number[] = []; + let nodeCount = 0; + let statementCount = 0; + let errorCount = 0; + + const parseOptions = new ParseOptions(); + + // Warmup + for (let i = 0; i < WARMUP_ITERATIONS; i++) { + const parser = new Parser(); + const diagSink = new DiagnosticSink(); + parser.parseSourceFile(code, parseOptions, diagSink); + } + + // Benchmark + for (let i = 0; i < BENCHMARK_ITERATIONS; i++) { + const parser = new Parser(); + const diagSink = new DiagnosticSink(); + + const start = performance.now(); + const result = parser.parseSourceFile(code, parseOptions, diagSink); + const elapsed = performance.now() - start; + + times.push(elapsed); + statementCount = 
result.parserOutput.parseTree.d.statements.length;
+        errorCount = diagSink.getErrors().length;
+
+        // Count nodes on the last iteration only (it's expensive)
+        if (i === BENCHMARK_ITERATIONS - 1) {
+            nodeCount = countNodes(result.parserOutput.parseTree);
+        }
+    }
+
+    const stats = calculateStats(times);
+
+    return {
+        corpus: corpusName,
+        fileSizeBytes: Buffer.byteLength(code, 'utf-8'),
+        iterations: BENCHMARK_ITERATIONS,
+        timesMs: times,
+        medianMs: stats.median,
+        p95Ms: stats.p95,
+        minMs: stats.min,
+        maxMs: stats.max,
+        avgMs: stats.avg,
+        nodeCount,
+        nodesPerSec: nodeCount / (stats.median / 1000),
+        statementCount,
+        errorCount,
+    };
+}
+
+// --- Corpus definitions ---
+
+const corpora: { name: string; file: string }[] = [
+    { name: 'large_stdlib', file: 'large_stdlib.py' },
+    { name: 'fstring_heavy', file: 'fstring_heavy.py' },
+    { name: 'comment_heavy', file: 'comment_heavy.py' },
+    { name: 'large_class', file: 'large_class.py' },
+    { name: 'import_heavy', file: 'import_heavy.py' },
+    { name: 'union_heavy', file: 'union_heavy.py' },
+];
+
+// --- Tests (gated like tokenizerBenchmark.test.ts: only run when PYRIGHT_RUN_BENCHMARKS=1) ---
+
+(process.env.PYRIGHT_RUN_BENCHMARKS === '1' ? describe : describe.skip)('Parser Benchmark', () => {
+    const allResults: BenchmarkResult[] = [];
+
+    for (const { name, file } of corpora) {
+        test(`parse ${name}`, () => {
+            const code = loadCorpus(file);
+            const result = benchmarkParse(name, code);
+            allResults.push(result);
+
+            console.log(
+                `  ${name}: median=${result.medianMs.toFixed(2)}ms, nodes=${result.nodeCount}, stmts=${
+                    result.statementCount
+                }, nodes/sec=${Math.round(result.nodesPerSec).toLocaleString()}`
+            );
+
+            // Sanity: parser should produce statements
+            expect(result.statementCount).toBeGreaterThan(0);
+            // Sanity: should complete in reasonable time (< 10s per file)
+            expect(result.medianMs).toBeLessThan(10000);
+        });
+    }
+
+    test('scaled corpus (10x large_stdlib)', () => {
+        const base = loadCorpus('large_stdlib.py');
+        const scaled = Array(10).fill(base).join('\n');
+
+        const result = benchmarkParse('large_stdlib_10x', scaled);
+        
allResults.push(result); + + console.log( + ` large_stdlib_10x: median=${result.medianMs.toFixed(2)}ms, nodes=${ + result.nodeCount + }, nodes/sec=${Math.round(result.nodesPerSec).toLocaleString()}` + ); + + expect(result.statementCount).toBeGreaterThan(0); + }); + + afterAll(() => { + if (allResults.length === 0) { + return; + } + + printResultTable(allResults); + + const report: BenchmarkReport = { + timestamp: new Date().toISOString(), + system: getSystemInfo(), + config: { + warmupIterations: WARMUP_ITERATIONS, + benchmarkIterations: BENCHMARK_ITERATIONS, + }, + results: allResults, + }; + + writeReport(report); + }); +}); diff --git a/packages/pyright-internal/src/tests/benchmarks/tokenizerBenchmark.test.ts b/packages/pyright-internal/src/tests/benchmarks/tokenizerBenchmark.test.ts new file mode 100644 index 000000000000..48c1521badfb --- /dev/null +++ b/packages/pyright-internal/src/tests/benchmarks/tokenizerBenchmark.test.ts @@ -0,0 +1,316 @@ +/* + * tokenizerBenchmark.test.ts + * Copyright (c) Microsoft Corporation. + * + * Microbenchmark for the Python tokenizer. + * Measures tokens/sec and time-to-tokenize across representative corpora. 
+ *
+ * Run with:
+ *   cd packages/pyright-internal
+ *   node node_modules\jest\bin\jest tokenizerBenchmark.test --runInBand --detectOpenHandles --forceExit --testTimeout=300000
+ *
+ * Results are written as JSON to:
+ *   src/tests/benchmarks/.generated/benchmark-results/tokenizer/
+ */
+
+import { execFileSync } from 'child_process';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+
+import { Tokenizer } from '../../parser/tokenizer';
+
+// --- Configuration ---
+
+const WARMUP_ITERATIONS = 3;
+const BENCHMARK_ITERATIONS = 10;
+
+const BENCHMARK_OUTPUT_DIR = path.join(__dirname, '.generated', 'benchmark-results', 'tokenizer');
+const JEST_BIN_PATH = path.resolve(__dirname, '..', '..', '..', 'node_modules', 'jest', 'bin', 'jest.js');
+const CHILD_RESULT_PREFIX = '__TOKENIZER_BENCHMARK_RESULT__';
+const CHILD_MODE_ENV = 'PYRIGHT_TOKENIZER_BENCH_CHILD';
+const RUN_BENCHMARKS_ENV = 'PYRIGHT_RUN_BENCHMARKS';
+
+// --- Types ---
+
+interface BenchmarkResult {
+    corpus: string;
+    fileSizeBytes: number;
+    iterations: number;
+    timesMs: number[];
+    medianMs: number;
+    p95Ms: number;
+    minMs: number;
+    maxMs: number;
+    avgMs: number;
+    tokenCount: number;
+    tokensPerSec: number;
+}
+
+interface BenchmarkReport {
+    timestamp: string;
+    system: {
+        platform: string;
+        arch: string;
+        cpus: string;
+        cpuCount: number;
+        totalMemoryMB: number;
+        nodeVersion: string;
+    };
+    config: {
+        warmupIterations: number;
+        benchmarkIterations: number;
+    };
+    results: BenchmarkResult[];
+}
+
+// --- Helpers ---
+
+function calculateStats(times: ReadonlyArray<number>): {
+    median: number;
+    p95: number;
+    min: number;
+    max: number;
+    avg: number;
+} {
+    const sorted = [...times].sort((a, b) => a - b);
+    const len = sorted.length;
+
+    const median = len % 2 === 0 ? (sorted[len / 2 - 1] + sorted[len / 2]) / 2 : sorted[Math.floor(len / 2)];
+    const p95Index = Math.ceil(len * 0.95) - 1;
+    const p95 = sorted[Math.min(p95Index, len - 1)];
+    const min = sorted[0];
+    const max = sorted[len - 1];
+    const avg = times.reduce((a, b) => a + b, 0) / len;
+
+    return { median, p95, min, max, avg };
+}
+
+function loadCorpus(filename: string): string {
+    const filePath = path.resolve(__dirname, '..', 'benchmarkData', filename);
+    return fs.readFileSync(filePath, 'utf-8');
+}
+
+function getSystemInfo(): BenchmarkReport['system'] {
+    const cpus = os.cpus();
+    return {
+        platform: os.platform(),
+        arch: os.arch(),
+        cpus: cpus[0]?.model ?? 'unknown',
+        cpuCount: cpus.length,
+        totalMemoryMB: Math.round(os.totalmem() / (1024 * 1024)),
+        nodeVersion: process.version,
+    };
+}
+
+function writeReport(report: BenchmarkReport): void {
+    fs.mkdirSync(BENCHMARK_OUTPUT_DIR, { recursive: true });
+    const filename = `tokenizer-benchmark-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
+    const outputPath = path.join(BENCHMARK_OUTPUT_DIR, filename);
+    fs.writeFileSync(outputPath, JSON.stringify(report, undefined, 2), 'utf-8');
+    console.log(`\nBenchmark results written to: ${outputPath}`);
+}
+
+function printResultTable(results: ReadonlyArray<BenchmarkResult>): void {
+    console.log('\n=== Tokenizer Benchmark Results ===\n');
+    console.log(
+        `${'Corpus'.padEnd(25)} ${'Size'.padStart(8)} ${'Tokens'.padStart(8)} ${'Median'.padStart(10)} ${'Min'.padStart(
+            10
+        )} ${'Max'.padStart(10)} ${'Avg'.padStart(10)} ${'p95'.padStart(10)} ${'Tok/sec'.padStart(12)}`
+    );
+    console.log('-'.repeat(113));
+
+    for (const result of results) {
+        const sizeKB = `${(result.fileSizeBytes / 1024).toFixed(1)}KB`;
+        console.log(
+            `${result.corpus.padEnd(25)} ${sizeKB.padStart(8)} ${String(result.tokenCount).padStart(
+                8
+            )} ${result.medianMs.toFixed(2).padStart(10)} ${result.minMs.toFixed(2).padStart(10)} ${result.maxMs
+                .toFixed(2)
+                .padStart(10)} 
${result.avgMs.toFixed(2).padStart(10)} ${result.p95Ms + .toFixed(2) + .padStart(10)} ${Math.round(result.tokensPerSec).toLocaleString().padStart(12)}` + ); + } + console.log(''); +} + +function emitChildResult(result: BenchmarkResult): void { + process.stdout.write(`${CHILD_RESULT_PREFIX}${JSON.stringify(result)}\n`); +} + +function getChildOutput(error: unknown): string { + if (!(error instanceof Error)) { + return ''; + } + + const stdout = 'stdout' in error && typeof error.stdout === 'string' ? error.stdout : ''; + const stderr = 'stderr' in error && typeof error.stderr === 'string' ? error.stderr : ''; + return [stdout, stderr].filter((part) => part.length > 0).join('\n'); +} + +function escapeRegExp(text: string): string { + return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function runBenchmarkInFreshProcess(testName: string): BenchmarkResult { + try { + const output = execFileSync( + process.execPath, + [ + JEST_BIN_PATH, + __filename, + '--runInBand', + '--forceExit', + '--testTimeout=300000', + '--testNamePattern', + `^Tokenizer Benchmark ${escapeRegExp(testName)}$`, + ], + { + cwd: path.resolve(__dirname, '..', '..', '..'), + encoding: 'utf-8', + env: { + ...process.env, + [CHILD_MODE_ENV]: '1', + }, + } + ); + + const resultLine = output.split(/\r?\n/).find((line) => line.startsWith(CHILD_RESULT_PREFIX)); + + if (!resultLine) { + throw new Error(`Child benchmark for "${testName}" did not emit a result.\n${output}`); + } + + return JSON.parse(resultLine.slice(CHILD_RESULT_PREFIX.length)) as BenchmarkResult; + } catch (error) { + const output = getChildOutput(error); + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Child benchmark for "${testName}" failed.\n${message}${output ? 
`\n${output}` : ''}`); + } +} + +function benchmarkTokenize(corpusName: string, code: string): BenchmarkResult { + const times: number[] = []; + let tokenCount = 0; + + for (let i = 0; i < WARMUP_ITERATIONS; i++) { + const tokenizer = new Tokenizer(); + tokenizer.tokenize(code); + } + + for (let i = 0; i < BENCHMARK_ITERATIONS; i++) { + const tokenizer = new Tokenizer(); + + const start = performance.now(); + const results = tokenizer.tokenize(code); + const elapsed = performance.now() - start; + + times.push(elapsed); + tokenCount = results.tokens.count; + } + + const stats = calculateStats(times); + + return { + corpus: corpusName, + fileSizeBytes: Buffer.byteLength(code, 'utf-8'), + iterations: BENCHMARK_ITERATIONS, + timesMs: times, + medianMs: stats.median, + p95Ms: stats.p95, + minMs: stats.min, + maxMs: stats.max, + avgMs: stats.avg, + tokenCount, + tokensPerSec: tokenCount / (stats.median / 1000), + }; +} + +// --- Corpus definitions --- + +const corpora: { name: string; file: string }[] = [ + { name: 'large_stdlib', file: 'large_stdlib.py' }, + { name: 'fstring_heavy', file: 'fstring_heavy.py' }, + { name: 'comment_heavy', file: 'comment_heavy.py' }, + { name: 'large_class', file: 'large_class.py' }, + { name: 'import_heavy', file: 'import_heavy.py' }, + { name: 'union_heavy', file: 'union_heavy.py' }, + { name: 'repetitive_identifiers', file: 'repetitive_identifiers.py' }, +]; + +// --- Tests --- + +const benchmarkSuite = process.env[RUN_BENCHMARKS_ENV] === '1' ? describe : describe.skip; + +benchmarkSuite('Tokenizer Benchmark', () => { + const allResults: BenchmarkResult[] = []; + const isChildProcess = process.env[CHILD_MODE_ENV] === '1'; + + for (const { name, file } of corpora) { + test(`tokenize ${name}`, () => { + const result = isChildProcess + ? 
benchmarkTokenize(name, loadCorpus(file)) + : runBenchmarkInFreshProcess(`tokenize ${name}`); + + if (!isChildProcess) { + allResults.push(result); + } + + console.log( + ` ${name}: median=${result.medianMs.toFixed(2)}ms, tokens=${result.tokenCount}, tok/sec=${Math.round( + result.tokensPerSec + ).toLocaleString()}` + ); + + if (isChildProcess) { + emitChildResult(result); + } + + expect(result.tokenCount).toBeGreaterThan(0); + expect(result.medianMs).toBeLessThan(5000); + }); + } + + test('scaled corpus (10x large_stdlib)', () => { + const result = isChildProcess + ? benchmarkTokenize('large_stdlib_10x', Array(10).fill(loadCorpus('large_stdlib.py')).join('\n')) + : runBenchmarkInFreshProcess('scaled corpus (10x large_stdlib)'); + + if (!isChildProcess) { + allResults.push(result); + } + + console.log( + ` large_stdlib_10x: median=${result.medianMs.toFixed(2)}ms, tokens=${ + result.tokenCount + }, tok/sec=${Math.round(result.tokensPerSec).toLocaleString()}` + ); + + if (isChildProcess) { + emitChildResult(result); + } + + expect(result.tokenCount).toBeGreaterThan(0); + }); + + afterAll(() => { + if (isChildProcess || allResults.length === 0) { + return; + } + + printResultTable(allResults); + + const report: BenchmarkReport = { + timestamp: new Date().toISOString(), + system: getSystemInfo(), + config: { + warmupIterations: WARMUP_ITERATIONS, + benchmarkIterations: BENCHMARK_ITERATIONS, + }, + results: allResults, + }; + + writeReport(report); + }); +}); diff --git a/packages/pyright-internal/src/tests/tokenizer.test.ts b/packages/pyright-internal/src/tests/tokenizer.test.ts index 32f92009bbd1..9cec1fcbec68 100644 --- a/packages/pyright-internal/src/tests/tokenizer.test.ts +++ b/packages/pyright-internal/src/tests/tokenizer.test.ts @@ -1676,7 +1676,8 @@ test('Lines1', () => { test('Comments1', () => { const t = new Tokenizer(); - const results = t.tokenize('# hello\n# good bye\n\n\n""" test """ # another\n\n\npass'); + const text = '# hello\n# good bye\n\n\n""" 
test """ # another\n\n\npass'; + const results = t.tokenize(text); assert.equal(results.tokens.count, 4 + _implicitTokenCount); const token0 = results.tokens.getItemAt(0); @@ -1711,7 +1712,8 @@ test('Comments1', () => { test('Comments2', () => { const t = new Tokenizer(); - const results = t.tokenize('class A:\n def func(self):\n pass\n # comment\n '); + const text = 'class A:\n def func(self):\n pass\n # comment\n '; + const results = t.tokenize(text); assert.equal(results.tokens.count, 16 + _implicitTokenCount); const token17 = results.tokens.getItemAt(17); @@ -1815,6 +1817,21 @@ test('TypeIgnoreLine2', () => { assert.equal(results.tokens.contains(42), false); }); +test('TypeIgnoreLineMalformedBracket', () => { + const t = new Tokenizer(); + const results = t.tokenize('a = 3 # type: ignore[broken'); + assert.equal(results.typeIgnoreLines.size, 0); +}); + +// A space-separated unclosed bracket (e.g. `# type: ignore [broken`) is also +// rejected entirely. The tokenizer does not fall back to treating the +// directive as "ignore all" when the bracket list is present but malformed. +test('TypeIgnoreLineMalformedBracketWithSpace', () => { + const t = new Tokenizer(); + const results = t.tokenize('a = 3 # type: ignore [broken'); + assert.equal(results.typeIgnoreLines.size, 0); +}); + // Regression test for https://github.com/microsoft/pyright/issues/11345. // type: ignore comments containing tool-namespaced codes (e.g. "ty:rule-name") // must be recognised as type: ignore comments.