diff --git a/src/yaml/lexer.mojo b/src/yaml/lexer.mojo index d62fbcb..77e02e9 100644 --- a/src/yaml/lexer.mojo +++ b/src/yaml/lexer.mojo @@ -418,6 +418,35 @@ struct Lexer: var tokens = List[Token]() while self.pos < len(self.input): + # Handle indentation at line start + if self.at_line_start: + var indent_level = self.count_leading_spaces() + + # Skip blank lines and comment-only lines for indentation tracking + var temp_pos = self.pos + indent_level + if temp_pos >= len(self.input) or String(self.input[temp_pos]) == "\n" or String(self.input[temp_pos]) == "#": + # Blank or comment line - don't change indentation + self.at_line_start = False + else: + # Real content - process indentation change + var current_indent = self.indent_stack[len(self.indent_stack) - 1] + + if indent_level > current_indent: + # Increased indentation - emit INDENT + tokens.append(Token(TokenKind.INDENT(), "", Position(self.line, self.column))) + self.indent_stack.append(indent_level) + elif indent_level < current_indent: + # Decreased indentation - emit DEDENT(s) + while len(self.indent_stack) > 1 and self.indent_stack[len(self.indent_stack) - 1] > indent_level: + tokens.append(Token(TokenKind.DEDENT(), "", Position(self.line, self.column))) + _ = self.indent_stack.pop() + + # Check for indentation mismatch + if len(self.indent_stack) > 0 and self.indent_stack[len(self.indent_stack) - 1] != indent_level: + raise Error("Indentation mismatch at line " + String(self.line)) + + self.at_line_start = False + var c = self.current() # Skip whitespace (but track for indentation later) @@ -429,6 +458,7 @@ struct Lexer: if c == "\n": tokens.append(Token(TokenKind.NEWLINE(), "\n", Position(self.line, self.column))) _ = self.advance() + self.at_line_start = True continue # Comment @@ -477,6 +507,11 @@ struct Lexer: # Unknown character - skip for now (in real impl, should error) _ = self.advance() + # Emit remaining DEDENT tokens at EOF + while len(self.indent_stack) > 1: + tokens.append(Token(TokenKind.DEDENT(), "", Position(self.line, self.column))) + _ = self.indent_stack.pop() + # Add EOF token tokens.append(Token(TokenKind.EOF(), "", Position(self.line, self.column))) diff --git a/tests/test_lexer_indentation.mojo b/tests/test_lexer_indentation.mojo new file mode 100644 index 0000000..cace3cf --- /dev/null +++ b/tests/test_lexer_indentation.mojo @@ -0,0 +1,184 @@ +"""Tests for YAML lexer indentation tracking. + +Validates INDENT/DEDENT token emission based on indentation changes. +""" + +from testing import assert_equal, assert_true, TestSuite +from yaml.lexer import Lexer, TokenKind + + +def test_simple_indent(): + """Test single level of indentation.""" + var lexer = Lexer("parent:\n child: value") + var tokens = lexer.tokenize() + + # parent : \n INDENT child : value DEDENT EOF + assert_equal(len(tokens), 9) + assert_true(tokens[0].kind == TokenKind.STRING()) + assert_equal(tokens[0].value, "parent") + assert_true(tokens[1].kind == TokenKind.COLON()) + assert_true(tokens[2].kind == TokenKind.NEWLINE()) + assert_true(tokens[3].kind == TokenKind.INDENT()) + assert_true(tokens[4].kind == TokenKind.STRING()) + assert_equal(tokens[4].value, "child") + assert_true(tokens[5].kind == TokenKind.COLON()) + assert_true(tokens[6].kind == TokenKind.STRING()) + assert_equal(tokens[6].value, "value") + assert_true(tokens[7].kind == TokenKind.DEDENT()) + + +def test_simple_dedent(): + """Test dedent back to base level.""" + var lexer = Lexer("outer:\n inner: 1\nback: 2") + var tokens = lexer.tokenize() + + # outer : \n INDENT inner : 1 \n DEDENT back : 2 EOF + assert_equal(len(tokens), 13) + assert_true(tokens[0].kind == TokenKind.STRING()) # outer + assert_true(tokens[1].kind == TokenKind.COLON()) + assert_true(tokens[2].kind == TokenKind.NEWLINE()) + assert_true(tokens[3].kind == TokenKind.INDENT()) + assert_true(tokens[4].kind == TokenKind.STRING()) # inner + assert_true(tokens[5].kind == TokenKind.COLON()) + assert_true(tokens[6].kind == TokenKind.INTEGER()) # 1 + assert_true(tokens[7].kind == TokenKind.NEWLINE()) + assert_true(tokens[8].kind == TokenKind.DEDENT()) + assert_true(tokens[9].kind == TokenKind.STRING()) # back + assert_true(tokens[10].kind == TokenKind.COLON()) + assert_true(tokens[11].kind == TokenKind.INTEGER()) # 2 + + +def test_multiple_indent_levels(): + """Test nested indentation (multiple levels).""" + var lexer = Lexer("a:\n b:\n c: value") + var tokens = lexer.tokenize() + + # a : \n INDENT b : \n INDENT c : value DEDENT DEDENT EOF + assert_equal(len(tokens), 14) + assert_true(tokens[0].kind == TokenKind.STRING()) # a + assert_true(tokens[2].kind == TokenKind.NEWLINE()) + assert_true(tokens[3].kind == TokenKind.INDENT()) + assert_true(tokens[4].kind == TokenKind.STRING()) # b + assert_true(tokens[6].kind == TokenKind.NEWLINE()) + assert_true(tokens[7].kind == TokenKind.INDENT()) + assert_true(tokens[8].kind == TokenKind.STRING()) # c + assert_true(tokens[11].kind == TokenKind.DEDENT()) + assert_true(tokens[12].kind == TokenKind.DEDENT()) + + +def test_multiple_dedents(): + """Test dedenting multiple levels at once.""" + var lexer = Lexer("a:\n b:\n c: 1\nback: 2") + var tokens = lexer.tokenize() + + # Should have 2 DEDENT tokens when going from level 2 back to 0 + var dedent_count = 0 + for i in range(len(tokens)): + if tokens[i].kind == TokenKind.DEDENT(): + dedent_count += 1 + + assert_equal(dedent_count, 2) + + +def test_blank_line_ignored(): + """Test that blank lines don't affect indentation.""" + var lexer = Lexer("parent:\n\n child: value") + var tokens = lexer.tokenize() + + # Blank line should not create extra INDENT/DEDENT + var indent_count = 0 + var dedent_count = 0 + for i in range(len(tokens)): + if tokens[i].kind == TokenKind.INDENT(): + indent_count += 1 + if tokens[i].kind == TokenKind.DEDENT(): + dedent_count += 1 + + assert_equal(indent_count, 1) + assert_equal(dedent_count, 1) + + +def test_comment_line_ignored(): + """Test that comment-only lines don't affect indentation.""" + var lexer = Lexer("parent:\n # comment\n child: value") + var tokens = lexer.tokenize() + + # Comment line should not create extra INDENT/DEDENT + var indent_count = 0 + var dedent_count = 0 + for i in range(len(tokens)): + if tokens[i].kind == TokenKind.INDENT(): + indent_count += 1 + if tokens[i].kind == TokenKind.DEDENT(): + dedent_count += 1 + + assert_equal(indent_count, 1) + assert_equal(dedent_count, 1) + + +def test_list_with_indented_items(): + """Test list items with nested content.""" + var lexer = Lexer("items:\n - name: Alice\n age: 30") + var tokens = lexer.tokenize() + + # items : \n INDENT - name : Alice \n INDENT age : 30 DEDENT DEDENT EOF + # Note: 4-space indent for 'age' is deeper than 2-space indent for dash + assert_equal(len(tokens), 16) + assert_true(tokens[0].kind == TokenKind.STRING()) # items + assert_true(tokens[2].kind == TokenKind.NEWLINE()) + assert_true(tokens[3].kind == TokenKind.INDENT()) + assert_true(tokens[4].kind == TokenKind.DASH()) + assert_true(tokens[5].kind == TokenKind.STRING()) # name + + +def test_same_indent_no_tokens(): + """Test that same indentation level doesn't emit tokens.""" + var lexer = Lexer("key1: val1\nkey2: val2") + var tokens = lexer.tokenize() + + # No INDENT/DEDENT tokens should be present + for i in range(len(tokens)): + assert_true(tokens[i].kind != TokenKind.INDENT()) + assert_true(tokens[i].kind != TokenKind.DEDENT()) + + +def test_dedent_at_eof(): + """Test that DEDENT tokens are emitted at EOF.""" + var lexer = Lexer("a:\n b:\n c: value") + var tokens = lexer.tokenize() + + # Should have 2 INDENT and 2 DEDENT (at EOF) + var indent_count = 0 + var dedent_count = 0 + for i in range(len(tokens)): + if tokens[i].kind == TokenKind.INDENT(): + indent_count += 1 + if tokens[i].kind == TokenKind.DEDENT(): + dedent_count += 1 + + assert_equal(indent_count, 2) + assert_equal(dedent_count, 2) + + +def test_mixed_indentation_with_lists(): + """Test complex structure with mappings and sequences.""" + var lexer = Lexer("config:\n servers:\n - host: localhost\n port: 8080") + var tokens = lexer.tokenize() + + # config : \n INDENT servers : \n INDENT - host : localhost \n INDENT port : 8080 DEDENT DEDENT DEDENT EOF + # Note: 6-space indent for 'port' is deeper than 4-space dash line + var indent_count = 0 + var dedent_count = 0 + for i in range(len(tokens)): + if tokens[i].kind == TokenKind.INDENT(): + indent_count += 1 + if tokens[i].kind == TokenKind.DEDENT(): + dedent_count += 1 + + assert_equal(indent_count, 3) + assert_equal(dedent_count, 3) + + +def main(): + """Run all indentation tests.""" + TestSuite.discover_tests[__functions_in_module()]().run() diff --git a/tests/test_lexer_structural.mojo b/tests/test_lexer_structural.mojo index 77241ae..d2601c6 100644 --- a/tests/test_lexer_structural.mojo +++ b/tests/test_lexer_structural.mojo @@ -60,8 +60,8 @@ def test_sequence_with_mapping(): var lexer = Lexer("- name: Alice\n age: 30") var tokens = lexer.tokenize() - # - name : Alice \n age : 30 EOF - assert_equal(len(tokens), 9) + # - name : Alice \n INDENT age : 30 DEDENT EOF + assert_equal(len(tokens), 11) assert_true(tokens[0].kind == TokenKind.DASH()) assert_true(tokens[1].kind == TokenKind.STRING()) assert_equal(tokens[1].value, "name") @@ -69,11 +69,12 @@ def test_sequence_with_mapping(): assert_true(tokens[3].kind == TokenKind.STRING()) assert_equal(tokens[3].value, "Alice") assert_true(tokens[4].kind == TokenKind.NEWLINE()) - assert_true(tokens[5].kind == TokenKind.STRING()) - assert_equal(tokens[5].value, "age") - assert_true(tokens[6].kind == TokenKind.COLON()) - assert_true(tokens[7].kind == TokenKind.INTEGER()) - assert_equal(tokens[7].value, "30") + assert_true(tokens[5].kind == TokenKind.INDENT()) + assert_true(tokens[6].kind == TokenKind.STRING()) + assert_equal(tokens[6].value, "age") + assert_true(tokens[7].kind == TokenKind.COLON()) + assert_true(tokens[8].kind == TokenKind.INTEGER()) + assert_equal(tokens[8].value, "30") def test_nested_mapping(): @@ -81,17 +82,18 @@ def test_nested_mapping(): var lexer = Lexer("parent:\n child: value") var tokens = lexer.tokenize() - # parent : \n child : value EOF - assert_equal(len(tokens), 7) + # parent : \n INDENT child : value DEDENT EOF + assert_equal(len(tokens), 9) assert_true(tokens[0].kind == TokenKind.STRING()) assert_equal(tokens[0].value, "parent") assert_true(tokens[1].kind == TokenKind.COLON()) assert_true(tokens[2].kind == TokenKind.NEWLINE()) - assert_true(tokens[3].kind == TokenKind.STRING()) - assert_equal(tokens[3].value, "child") - assert_true(tokens[4].kind == TokenKind.COLON()) - assert_true(tokens[5].kind == TokenKind.STRING()) - assert_equal(tokens[5].value, "value") + assert_true(tokens[3].kind == TokenKind.INDENT()) + assert_true(tokens[4].kind == TokenKind.STRING()) + assert_equal(tokens[4].value, "child") + assert_true(tokens[5].kind == TokenKind.COLON()) + assert_true(tokens[6].kind == TokenKind.STRING()) + assert_equal(tokens[6].value, "value") def test_sequence_spacing_variations():