Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions src/yaml/lexer.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,35 @@ struct Lexer:
var tokens = List[Token]()

while self.pos < len(self.input):
# Handle indentation at line start
if self.at_line_start:
var indent_level = self.count_leading_spaces()

# Skip blank lines and comment-only lines for indentation tracking
var temp_pos = self.pos + indent_level
if temp_pos >= len(self.input) or String(self.input[temp_pos]) == "\n" or String(self.input[temp_pos]) == "#":
# Blank or comment line - don't change indentation
self.at_line_start = False
else:
# Real content - process indentation change
var current_indent = self.indent_stack[len(self.indent_stack) - 1]

if indent_level > current_indent:
# Increased indentation - emit INDENT
tokens.append(Token(TokenKind.INDENT(), "", Position(self.line, self.column)))
self.indent_stack.append(indent_level)
elif indent_level < current_indent:
# Decreased indentation - emit DEDENT(s)
while len(self.indent_stack) > 1 and self.indent_stack[len(self.indent_stack) - 1] > indent_level:
tokens.append(Token(TokenKind.DEDENT(), "", Position(self.line, self.column)))
_ = self.indent_stack.pop()

# Check for indentation mismatch
if len(self.indent_stack) > 0 and self.indent_stack[len(self.indent_stack) - 1] != indent_level:
raise Error("Indentation mismatch at line " + String(self.line))

self.at_line_start = False

var c = self.current()

# Skip whitespace (but track for indentation later)
Expand All @@ -429,6 +458,7 @@ struct Lexer:
if c == "\n":
tokens.append(Token(TokenKind.NEWLINE(), "\n", Position(self.line, self.column)))
_ = self.advance()
self.at_line_start = True
continue

# Comment
Expand Down Expand Up @@ -477,6 +507,11 @@ struct Lexer:
# Unknown character - skip for now (in real impl, should error)
_ = self.advance()

# Emit remaining DEDENT tokens at EOF
while len(self.indent_stack) > 1:
tokens.append(Token(TokenKind.DEDENT(), "", Position(self.line, self.column)))
_ = self.indent_stack.pop()

# Add EOF token
tokens.append(Token(TokenKind.EOF(), "", Position(self.line, self.column)))

Expand Down
184 changes: 184 additions & 0 deletions tests/test_lexer_indentation.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""Tests for YAML lexer indentation tracking.

Validates INDENT/DEDENT token emission based on indentation changes.
"""

from testing import assert_equal, assert_true, TestSuite
from yaml.lexer import Lexer, TokenKind


def test_simple_indent():
    """Check that one nested line yields exactly one INDENT/DEDENT pair."""
    var lx = Lexer("parent:\n child: value")
    var toks = lx.tokenize()

    # Expected stream, in order:
    #   parent  :  \n  INDENT  child  :  value  DEDENT  EOF
    assert_equal(len(toks), 9)

    # First line: the parent key and its colon, terminated by a newline.
    assert_true(toks[0].kind == TokenKind.STRING())
    assert_equal(toks[0].value, "parent")
    assert_true(toks[1].kind == TokenKind.COLON())
    assert_true(toks[2].kind == TokenKind.NEWLINE())

    # Second line is deeper than the first, so it opens with INDENT.
    assert_true(toks[3].kind == TokenKind.INDENT())
    assert_true(toks[4].kind == TokenKind.STRING())
    assert_equal(toks[4].value, "child")
    assert_true(toks[5].kind == TokenKind.COLON())
    assert_true(toks[6].kind == TokenKind.STRING())
    assert_equal(toks[6].value, "value")

    # The open level is closed before the end of the stream.
    assert_true(toks[7].kind == TokenKind.DEDENT())


def test_simple_dedent():
    """Check that returning to column 0 emits a DEDENT before the next key."""
    var lx = Lexer("outer:\n inner: 1\nback: 2")
    var toks = lx.tokenize()

    # Expected stream, in order:
    #   outer  :  \n  INDENT  inner  :  1  \n  DEDENT  back  :  2  EOF
    assert_equal(len(toks), 13)

    # Line 1: "outer:"
    assert_true(toks[0].kind == TokenKind.STRING())
    assert_true(toks[1].kind == TokenKind.COLON())
    assert_true(toks[2].kind == TokenKind.NEWLINE())

    # Line 2: deeper, so INDENT precedes "inner: 1"
    assert_true(toks[3].kind == TokenKind.INDENT())
    assert_true(toks[4].kind == TokenKind.STRING())
    assert_true(toks[5].kind == TokenKind.COLON())
    assert_true(toks[6].kind == TokenKind.INTEGER())
    assert_true(toks[7].kind == TokenKind.NEWLINE())

    # Line 3: shallower, so DEDENT precedes "back: 2"
    assert_true(toks[8].kind == TokenKind.DEDENT())
    assert_true(toks[9].kind == TokenKind.STRING())
    assert_true(toks[10].kind == TokenKind.COLON())
    assert_true(toks[11].kind == TokenKind.INTEGER())


def test_multiple_indent_levels():
    """Test nested indentation (multiple levels).

    The input nests three keys, each line strictly deeper than the one
    before it (0, 2 and 4 leading spaces). An INDENT is only emitted when
    a line is deeper than the current level, so the widths must increase
    for the second INDENT to appear.
    """
    var lexer = Lexer("a:\n  b:\n    c: value")
    var tokens = lexer.tokenize()

    # a : \n INDENT b : \n INDENT c : value DEDENT DEDENT EOF
    assert_equal(len(tokens), 14)
    assert_true(tokens[0].kind == TokenKind.STRING())  # a
    assert_true(tokens[2].kind == TokenKind.NEWLINE())
    assert_true(tokens[3].kind == TokenKind.INDENT())
    assert_true(tokens[4].kind == TokenKind.STRING())  # b
    assert_equal(tokens[4].value, "b")
    assert_true(tokens[6].kind == TokenKind.NEWLINE())
    assert_true(tokens[7].kind == TokenKind.INDENT())
    assert_true(tokens[8].kind == TokenKind.STRING())  # c
    assert_equal(tokens[8].value, "c")
    # Both open levels are closed before EOF.
    assert_true(tokens[11].kind == TokenKind.DEDENT())
    assert_true(tokens[12].kind == TokenKind.DEDENT())
    assert_true(tokens[13].kind == TokenKind.EOF())


def test_multiple_dedents():
    """Test dedenting multiple levels at once.

    The input opens two nesting levels (2 and 4 leading spaces) and then
    returns straight to column 0, which must close both levels.
    """
    var lexer = Lexer("a:\n  b:\n    c: 1\nback: 2")
    var tokens = lexer.tokenize()

    # a : \n INDENT b : \n INDENT c : 1 \n DEDENT DEDENT back : 2 EOF
    # Should have 2 DEDENT tokens when going from level 2 back to 0
    var dedent_count = 0
    for i in range(len(tokens)):
        if tokens[i].kind == TokenKind.DEDENT():
            dedent_count += 1

    assert_equal(dedent_count, 2)


def test_blank_line_ignored():
    """Check that an empty line between parent and child adds no INDENT/DEDENT."""
    var lx = Lexer("parent:\n\n child: value")
    var toks = lx.tokenize()

    # Tally the indentation tokens across the whole stream; the blank line
    # must not contribute any of its own.
    var n_indent = 0
    var n_dedent = 0
    var idx = 0
    while idx < len(toks):
        if toks[idx].kind == TokenKind.INDENT():
            n_indent += 1
        if toks[idx].kind == TokenKind.DEDENT():
            n_dedent += 1
        idx += 1

    assert_equal(n_indent, 1)
    assert_equal(n_dedent, 1)


def test_comment_line_ignored():
    """Check that a comment-only line adds no INDENT/DEDENT of its own."""
    var lx = Lexer("parent:\n # comment\n child: value")
    var toks = lx.tokenize()

    # Tally the indentation tokens across the whole stream; only the
    # "child" line should open (and later close) a level.
    var n_indent = 0
    var n_dedent = 0
    var idx = 0
    while idx < len(toks):
        if toks[idx].kind == TokenKind.INDENT():
            n_indent += 1
        if toks[idx].kind == TokenKind.DEDENT():
            n_dedent += 1
        idx += 1

    assert_equal(n_indent, 1)
    assert_equal(n_dedent, 1)


def test_list_with_indented_items():
    """Test list items with nested content.

    The dash line is indented 2 spaces and the continuation key `age` is
    indented 4 spaces, so the lexer opens two levels: one for the list
    item and one for the mapping content under it. Both are closed at EOF.
    """
    var lexer = Lexer("items:\n  - name: Alice\n    age: 30")
    var tokens = lexer.tokenize()

    # items : \n INDENT - name : Alice \n INDENT age : 30 DEDENT DEDENT EOF
    # Note: 4-space indent for 'age' is deeper than 2-space indent for dash
    assert_equal(len(tokens), 16)
    assert_true(tokens[0].kind == TokenKind.STRING())  # items
    assert_true(tokens[2].kind == TokenKind.NEWLINE())
    assert_true(tokens[3].kind == TokenKind.INDENT())
    assert_true(tokens[4].kind == TokenKind.DASH())
    assert_true(tokens[5].kind == TokenKind.STRING())  # name

    # Second level: 'age' sits deeper than the dash line.
    assert_true(tokens[8].kind == TokenKind.NEWLINE())
    assert_true(tokens[9].kind == TokenKind.INDENT())
    assert_true(tokens[10].kind == TokenKind.STRING())  # age
    assert_equal(tokens[10].value, "age")

    # Both levels are closed before EOF.
    assert_true(tokens[13].kind == TokenKind.DEDENT())
    assert_true(tokens[14].kind == TokenKind.DEDENT())
    assert_true(tokens[15].kind == TokenKind.EOF())


def test_same_indent_no_tokens():
    """Check that sibling keys at one level produce no INDENT or DEDENT."""
    var lx = Lexer("key1: val1\nkey2: val2")
    var toks = lx.tokenize()

    # Walk the whole stream; neither indentation token may appear anywhere.
    var idx = 0
    while idx < len(toks):
        assert_true(toks[idx].kind != TokenKind.INDENT())
        assert_true(toks[idx].kind != TokenKind.DEDENT())
        idx += 1


def test_dedent_at_eof():
    """Test that DEDENT tokens are emitted at EOF.

    The input ends while two nesting levels (2 and 4 leading spaces) are
    still open and has no trailing newline, so every DEDENT in the stream
    must come from the end-of-input cleanup.
    """
    var lexer = Lexer("a:\n  b:\n    c: value")
    var tokens = lexer.tokenize()

    # Should have 2 INDENT and 2 DEDENT (at EOF)
    var indent_count = 0
    var dedent_count = 0
    for i in range(len(tokens)):
        if tokens[i].kind == TokenKind.INDENT():
            indent_count += 1
        if tokens[i].kind == TokenKind.DEDENT():
            dedent_count += 1

    assert_equal(indent_count, 2)
    assert_equal(dedent_count, 2)


def test_mixed_indentation_with_lists():
    """Test complex structure with mappings and sequences.

    Leading spaces per line are 0 (`config`), 2 (`servers`), 4 (the dash
    line) and 6 (`port`). Each line is strictly deeper than the previous
    one, so three levels are opened and all three are closed at EOF.
    """
    var lexer = Lexer("config:\n  servers:\n    - host: localhost\n      port: 8080")
    var tokens = lexer.tokenize()

    # config : \n INDENT servers : \n INDENT - host : localhost \n INDENT port : 8080 DEDENT DEDENT DEDENT EOF
    # Note: 6-space indent for 'port' is deeper than 4-space dash line
    var indent_count = 0
    var dedent_count = 0
    for i in range(len(tokens)):
        if tokens[i].kind == TokenKind.INDENT():
            indent_count += 1
        if tokens[i].kind == TokenKind.DEDENT():
            dedent_count += 1

    assert_equal(indent_count, 3)
    assert_equal(dedent_count, 3)


def main():
    """Run all indentation tests.

    Passes the result of `__functions_in_module()` to
    `TestSuite.discover_tests` and calls `run()` on the suite it builds.
    """
    TestSuite.discover_tests[__functions_in_module()]().run()
30 changes: 16 additions & 14 deletions tests/test_lexer_structural.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -60,38 +60,40 @@ def test_sequence_with_mapping():
var lexer = Lexer("- name: Alice\n age: 30")
var tokens = lexer.tokenize()

# - name : Alice \n age : 30 EOF
assert_equal(len(tokens), 9)
# - name : Alice \n INDENT age : 30 DEDENT EOF
assert_equal(len(tokens), 11)
assert_true(tokens[0].kind == TokenKind.DASH())
assert_true(tokens[1].kind == TokenKind.STRING())
assert_equal(tokens[1].value, "name")
assert_true(tokens[2].kind == TokenKind.COLON())
assert_true(tokens[3].kind == TokenKind.STRING())
assert_equal(tokens[3].value, "Alice")
assert_true(tokens[4].kind == TokenKind.NEWLINE())
assert_true(tokens[5].kind == TokenKind.STRING())
assert_equal(tokens[5].value, "age")
assert_true(tokens[6].kind == TokenKind.COLON())
assert_true(tokens[7].kind == TokenKind.INTEGER())
assert_equal(tokens[7].value, "30")
assert_true(tokens[5].kind == TokenKind.INDENT())
assert_true(tokens[6].kind == TokenKind.STRING())
assert_equal(tokens[6].value, "age")
assert_true(tokens[7].kind == TokenKind.COLON())
assert_true(tokens[8].kind == TokenKind.INTEGER())
assert_equal(tokens[8].value, "30")


def test_nested_mapping():
    """Test mapping containing nested mapping.

    The child line is indented deeper than the parent line, so the lexer
    emits an INDENT before `child` and a closing DEDENT before EOF.
    """
    var lexer = Lexer("parent:\n child: value")
    var tokens = lexer.tokenize()

    # parent : \n INDENT child : value DEDENT EOF
    assert_equal(len(tokens), 9)
    assert_true(tokens[0].kind == TokenKind.STRING())
    assert_equal(tokens[0].value, "parent")
    assert_true(tokens[1].kind == TokenKind.COLON())
    assert_true(tokens[2].kind == TokenKind.NEWLINE())
    # The nested line opens with INDENT, shifting the child tokens by one.
    assert_true(tokens[3].kind == TokenKind.INDENT())
    assert_true(tokens[4].kind == TokenKind.STRING())
    assert_equal(tokens[4].value, "child")
    assert_true(tokens[5].kind == TokenKind.COLON())
    assert_true(tokens[6].kind == TokenKind.STRING())
    assert_equal(tokens[6].value, "value")


def test_sequence_spacing_variations():
Expand Down
Loading