From eebffebc9658167ed9a8400ea3c189c9a935580a Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Sun, 6 Jul 2025 21:53:05 +0100 Subject: [PATCH 01/19] lexer: Make most errors recoverable --- src/b.rs | 92 ++++++++++++++++++++++++++++------------------------ src/lexer.rs | 76 +++++++++++++++++++++---------------------- 2 files changed, 87 insertions(+), 81 deletions(-) diff --git a/src/b.rs b/src/b.rs index 17c0b45e..e85993ed 100644 --- a/src/b.rs +++ b/src/b.rs @@ -27,7 +27,15 @@ use crust::libc::*; use crust::assoc_lookup_cstr; use arena::Arena; use targets::*; -use lexer::{Lexer, Loc, Token}; +use lexer::{Lexer, Loc, Token, ErrorKind}; + +pub unsafe fn get_token(l: *mut Lexer, c: *mut Compiler) -> Option<()> { + match lexer::get_token(l) { + Ok(()) => Some(()), + Err(ErrorKind::Error) => bump_error_count(c), + Err(ErrorKind::Fatal) => bump_error_count(c).and(None), + } +} pub unsafe fn expect_tokens(l: *mut Lexer, tokens: *const [Token]) -> Option<()> { for i in 0..tokens.len() { @@ -59,14 +67,14 @@ pub unsafe fn expect_token(l: *mut Lexer, token: Token) -> Option<()> { expect_tokens(l, &[token]) } -pub unsafe fn get_and_expect_token(l: *mut Lexer, token: Token) -> Option<()> { - lexer::get_token(l)?; +pub unsafe fn get_and_expect_token(l: *mut Lexer, c: *mut Compiler, token: Token) -> Option<()> { + get_token(l, c)?; expect_token(l, token) } pub unsafe fn get_and_expect_token_but_continue(l: *mut Lexer, c: *mut Compiler, token: Token) -> Option<()> { let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if let None = expect_token(l, token) { (*l).parse_point = saved_point; bump_error_count(c) @@ -75,8 +83,8 @@ pub unsafe fn get_and_expect_token_but_continue(l: *mut Lexer, c: *mut Compiler, } } -pub unsafe fn get_and_expect_tokens(l: *mut Lexer, clexes: *const [Token]) -> Option<()> { - lexer::get_token(l)?; +pub unsafe fn get_and_expect_tokens(l: *mut Lexer, c: *mut Compiler, clexes: *const [Token]) -> Option<()> { + get_token(l, c)?; expect_tokens(l, clexes) } @@ -89,8 +97,8 @@ pub unsafe fn expect_token_id(l: *mut Lexer, id: *const c_char) -> Option<()> { Some(()) } -pub unsafe fn get_and_expect_token_id(l: *mut Lexer, id: *const c_char) -> Option<()> { - lexer::get_token(l)?; +pub unsafe fn get_and_expect_token_id(l: *mut Lexer, c: *mut Compiler, id: *const c_char) -> Option<()> { + get_token(l, c)?; expect_token_id(l, id) } @@ -383,7 +391,7 @@ pub unsafe fn compile_string(string: *const c_char, c: *mut Compiler) -> usize { } pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Option<(Arg, bool)> { - lexer::get_token(l)?; + get_token(l, c)?; let arg = match (*l).token { Token::OParen => { let result = compile_expression(l, c)?; @@ -482,7 +490,7 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt loop { let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; (arg, is_lvalue) = match (*l).token { Token::OParen => Some((compile_function_call(l, c, arg)?, false)), @@ -563,7 +571,7 @@ pub unsafe fn compile_binop_expression(l: *mut Lexer, c: *mut Compiler, preceden let (mut lhs, mut lvalue) = compile_binop_expression(l, c, precedence + 1)?; let mut saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if let Some(binop) = Binop::from_token((*l).token) { if binop.precedence() == precedence { @@ -579,7 +587,7 @@ pub unsafe fn compile_binop_expression(l: *mut Lexer, c: *mut Compiler, preceden lvalue = false; saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; } } } @@ -592,7 +600,7 @@ pub unsafe fn compile_assign_expression(l: *mut Lexer, c: *mut Compiler) -> Opti let (lhs, mut lvalue) = compile_binop_expression(l, c, 0)?; let mut saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; while let Some(binop) = Binop::from_assign_token((*l).token) { let binop_loc = (*l).loc; @@ -626,7 +634,7 @@ pub unsafe fn compile_assign_expression(l: *mut Lexer, c: *mut Compiler) -> Opti lvalue = false; saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; } if (*l).token == Token::Question { @@ -661,7 +669,7 @@ pub unsafe fn compile_expression(l: *mut Lexer, c: *mut Compiler) -> Option<(Arg pub unsafe fn compile_block(l: *mut Lexer, c: *mut Compiler) -> Option<()> { loop { let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token == Token::CCurly { return Some(()); } (*l).parse_point = saved_point; @@ -671,13 +679,13 @@ pub unsafe fn compile_block(l: *mut Lexer, c: *mut Compiler) -> Option<()> { unsafe fn compile_function_call(l: *mut Lexer, c: *mut Compiler, fun: Arg) -> Option { let mut args: Array = zeroed(); let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token != Token::CParen { (*l).parse_point = saved_point; loop { let (expr, _) = compile_expression(l, c)?; da_append(&mut args, expr); - get_and_expect_tokens(l, &[Token::CParen, Token::Comma])?; + get_and_expect_tokens(l, c, &[Token::CParen, Token::Comma])?; match (*l).token { Token::CParen => break, Token::Comma => continue, @@ -703,11 +711,11 @@ pub unsafe fn name_declare_if_not_exists(names: *mut Array<*const c_char>, name: pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Array) -> Option<()> { get_and_expect_token_but_continue(l, c, Token::OParen)?; let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token != Token::CParen { (*l).parse_point = saved_point; loop { - get_and_expect_token(l, Token::String)?; + get_and_expect_token(l, c, Token::String)?; match (*l).token { Token::String => { let line = arena::strdup(&mut (*c).arena_names, (*l).string); @@ -717,7 +725,7 @@ pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Arr _ => unreachable!(), } - get_and_expect_tokens(l, &[Token::Comma, Token::CParen])?; + get_and_expect_tokens(l, c, &[Token::Comma, Token::CParen])?; match (*l).token { Token::Comma => {} Token::CParen => break, @@ -731,7 +739,7 @@ pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Arr pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; match (*l).token { Token::OCurly => { @@ -744,24 +752,24 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } Token::Extrn => { while (*l).token != Token::SemiColon { - get_and_expect_token(l, Token::ID)?; + get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena_names, (*l).string); name_declare_if_not_exists(&mut (*c).extrns, name); declare_var(c, name, (*l).loc, Storage::External {name})?; - get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma])?; + get_and_expect_tokens(l, c, &[Token::SemiColon, Token::Comma])?; } Some(()) } Token::Auto => { while (*l).token != Token::SemiColon { - get_and_expect_token(l, Token::ID)?; + get_and_expect_token(l, c, Token::ID)?; // TODO: Automatic variable names should only need function lifetime. // Could use .arena_labels here but naming would be confusing. // Rename .arena_labels to indicate function lifetime first? let name = arena::strdup(&mut (*c).arena_names, (*l).string); let index = allocate_auto_var(&mut (*c).auto_vars_ator); declare_var(c, name, (*l).loc, Storage::Auto {index})?; - get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma, Token::IntLit, Token::CharLit])?; + get_and_expect_tokens(l, c, &[Token::SemiColon, Token::Comma, Token::IntLit, Token::CharLit])?; if (*l).token == Token::IntLit || (*l).token == Token::CharLit { let size = (*l).int_number as usize; if size == 0 { @@ -775,7 +783,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { // See TODO(2025-06-05 17:45:36) let arg = Arg::RefAutoVar(index + size); push_opcode(Op::AutoAssign {index, arg}, (*l).loc, c); - get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma])?; + get_and_expect_tokens(l, c, &[Token::SemiColon, Token::Comma])?; } } Some(()) @@ -792,7 +800,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { compile_statement(l, c)?; let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token == Token::Else { let out_label = allocate_label_index(c); push_opcode(Op::JmpLabel{label: out_label}, (*l).loc, c); @@ -826,7 +834,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Return => { - get_and_expect_tokens(l, &[Token::SemiColon, Token::OParen])?; + get_and_expect_tokens(l, c, &[Token::SemiColon, Token::OParen])?; if (*l).token == Token::SemiColon { push_opcode(Op::Return {arg: None}, (*l).loc, c); } else if (*l).token == Token::OParen { @@ -840,7 +848,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Goto => { - get_and_expect_token(l, Token::ID)?; + get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena_labels, (*l).string); let loc = (*l).loc; let addr = (*c).func_body.count; @@ -858,7 +866,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } Token::Case => { let case_loc = (*l).loc; - lexer::get_token(l); + get_token(l, c)?; expect_tokens(l, &[Token::IntLit, Token::CharLit])?; // TODO: String ??! let case_value = (*l).int_number; get_and_expect_token_but_continue(l, c, Token::Colon)?; @@ -917,7 +925,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { if (*l).token == Token::ID { let name = arena::strdup(&mut (*c).arena_labels, (*l).string); let name_loc = (*l).loc; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token == Token::Colon { let label = allocate_label_index(c); push_opcode(Op::Label{label}, name_loc, c); @@ -1019,7 +1027,7 @@ pub unsafe fn bump_error_count(c: *mut Compiler) -> Option<()> { pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { 'def: loop { - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token == Token::EOF { break 'def } if (*l).token == Token::Variadic { @@ -1054,23 +1062,23 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { declare_var(c, name, name_loc, Storage::External{name})?; let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token == Token::OParen { // Function definition scope_push(&mut (*c).vars); // begin function scope let mut params_count = 0; let saved_point = (*l).parse_point; - lexer::get_token(l)?; + get_token(l, c)?; if (*l).token != Token::CParen { (*l).parse_point = saved_point; 'params: loop { - get_and_expect_token(l, Token::ID)?; + get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena_names, (*l).string); let name_loc = (*l).loc; let index = allocate_auto_var(&mut (*c).auto_vars_ator); declare_var(c, name, name_loc, Storage::Auto{index})?; params_count += 1; - get_and_expect_tokens(l, &[Token::CParen, Token::Comma])?; + get_and_expect_tokens(l, c, &[Token::CParen, Token::Comma])?; match (*l).token { Token::CParen => break 'params, Token::Comma => continue 'params, @@ -1121,22 +1129,22 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { // TODO: This code is ugly // couldn't find a better way to write it while keeping accurate error messages - get_and_expect_tokens(l, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID, Token::SemiColon, Token::OBracket])?; + get_and_expect_tokens(l, c, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID, Token::SemiColon, Token::OBracket])?; if (*l).token == Token::OBracket { global.is_vec = true; - get_and_expect_tokens(l, &[Token::IntLit, Token::CBracket])?; + get_and_expect_tokens(l, c, &[Token::IntLit, Token::CBracket])?; if (*l).token == Token::IntLit { global.minimum_size = (*l).int_number as usize; get_and_expect_token_but_continue(l, c, Token::CBracket)?; } - get_and_expect_tokens(l, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID, Token::SemiColon])?; + get_and_expect_tokens(l, c, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID, Token::SemiColon])?; } while (*l).token != Token::SemiColon { let value = match (*l).token { Token::Minus => { - get_and_expect_token(l, Token::IntLit)?; + get_and_expect_token(l, c, Token::IntLit)?; ImmediateValue::Literal(!(*l).int_number + 1) } Token::IntLit | Token::CharLit => ImmediateValue::Literal((*l).int_number), @@ -1155,9 +1163,9 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { }; da_append(&mut global.values, value); - get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma])?; + get_and_expect_tokens(l, c, &[Token::SemiColon, Token::Comma])?; if (*l).token == Token::Comma { - get_and_expect_tokens(l, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID])?; + get_and_expect_tokens(l, c, &[Token::Minus, Token::IntLit, Token::CharLit, Token::String, Token::ID])?; } else { break; } diff --git a/src/lexer.rs b/src/lexer.rs index fb6349bb..c10af597 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -29,6 +29,13 @@ macro_rules! missingf { }} } +pub type Result = core::result::Result<(), ErrorKind>; + +pub enum ErrorKind { + Error, + Fatal, +} + #[derive(Clone, Copy, PartialEq)] pub enum Token { // Terminal @@ -372,17 +379,19 @@ pub unsafe fn loc(l: *mut Lexer) -> Loc { } } -pub unsafe fn parse_string_into_storage(l: *mut Lexer, delim: c_char) -> Option<()> { +pub unsafe fn parse_string_into_storage(l: *mut Lexer, delim: c_char) -> Result { let escape_char = if !(*l).historical { '\\' } else {'*'} as c_char; + skip_char(l); + let mut result = Ok(()); while let Some(x) = peek_char(l) { match x { x if x == escape_char => { skip_char(l); let Some(x) = peek_char(l) else { - (*l).token = Token::ParseError; diagf!(loc(l), c!("LEXER ERROR: Unfinished escape sequence\n")); - return None; + result = Err(ErrorKind::Fatal); + break; }; let x = match x { x if x == '0' as c_char => '\0' as c_char, @@ -392,9 +401,9 @@ pub unsafe fn parse_string_into_storage(l: *mut Lexer, delim: c_char) -> Option< x if x == delim => delim, x if x == escape_char => escape_char, x => { - (*l).token = Token::ParseError; diagf!(loc(l), c!("LEXER ERROR: Unknown escape sequence starting with `%c`\n"), x as c_int); - return None; + result = Err(ErrorKind::Error); + continue; } }; da_append(&mut (*l).string_storage, x); @@ -407,7 +416,8 @@ pub unsafe fn parse_string_into_storage(l: *mut Lexer, delim: c_char) -> Option< }, } } - Some(()) + if !is_eof(l) { skip_char(l); } + result } #[repr(u8)] @@ -439,7 +449,7 @@ unsafe fn parse_digit(c: c_char, radix: Radix) -> Option { return None; } -unsafe fn parse_number(l: *mut Lexer, radix: Radix, report_point: Parse_Point) -> Option<()> { +unsafe fn parse_number(l: *mut Lexer, radix: Radix, report_point: Parse_Point) -> Result { while let Some(x) = peek_char(l) { let Some(d) = parse_digit(x, radix) else { break; @@ -448,23 +458,23 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix, report_point: Parse_Point) - let Some(r) = i64::checked_mul((*l).int_number as i64, radix as i64) else { (*l).parse_point = report_point; diagf!(loc(l), c!("LEXER ERROR: Constant integer overflow\n")); - return None; + return Err(ErrorKind::Error); }; (*l).int_number = r as u64; let Some(r) = i64::checked_add((*l).int_number as i64, d as i64) else { (*l).parse_point = report_point; diagf!(loc(l), c!("LEXER ERROR: Constant integer overflow.\n")); - return None; + return Err(ErrorKind::Error); }; (*l).int_number = r as u64; skip_char(l); }; - return Some(()); + return Ok(()); } -pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { +pub unsafe fn get_token(l: *mut Lexer) -> Result { 'comments: loop { skip_whitespaces(l); @@ -473,8 +483,8 @@ pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { if (*l).historical { (*l).parse_point = saved_point; diagf!(loc(l), c!("LEXER ERROR: C++ style comments are not available in the historical mode.\n")); - (*l).token = Token::ParseError; - return None; + // TODO: Convert to recoverable error. Need to advance lexer to not get stuck. + return Err(ErrorKind::Fatal); } skip_until(l, c!("\n")); continue 'comments; @@ -492,7 +502,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { let Some(x) = peek_char(l) else { (*l).token = Token::EOF; - return Some(()) + return Ok(()) }; let puncs = if !(*l).historical { PUNCTS } else { HISTORICAL_PUNCTS }; @@ -500,7 +510,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { let (prefix, token) = (*puncs)[i]; if skip_prefix(l, prefix) { (*l).token = token; - return Some(()) + return Ok(()) } } @@ -522,24 +532,22 @@ pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { let (id, token) = (*KEYWORDS)[i]; if strcmp((*l).string, id) == 0 { (*l).token = token; - return Some(()); + return Ok(()); } } - return Some(()) + return Ok(()) } let start_of_number = (*l).parse_point; if skip_prefix(l, c!("0x")) { + (*l).token = Token::IntLit; + (*l).int_number = 0; if (*l).historical { (*l).parse_point = start_of_number; diagf!(loc(l), c!("LEXER ERROR: hex literals are not available in the historical mode.\n")); - (*l).token = Token::ParseError; - return None; + return Err(ErrorKind::Error); } - - (*l).token = Token::IntLit; - (*l).int_number = 0; return parse_number(l, Radix::Hex, start_of_number); } @@ -556,55 +564,45 @@ pub unsafe fn get_token(l: *mut Lexer) -> Option<()> { } if x == '"' as c_char { - skip_char(l); (*l).token = Token::String; (*l).string_storage.count = 0; parse_string_into_storage(l, '"' as c_char)?; if is_eof(l) { diagf!(loc(l), c!("LEXER ERROR: Unfinished string literal\n")); diagf!((*l).loc, c!("LEXER INFO: Literal starts here\n")); - (*l).token = Token::ParseError; - return None; + return Err(ErrorKind::Fatal); } - skip_char(l); da_append(&mut (*l).string_storage, 0); (*l).string = (*l).string_storage.items; - return Some(()); + return Ok(()); } if x == '\'' as c_char { - skip_char(l); (*l).token = Token::CharLit; (*l).string_storage.count = 0; parse_string_into_storage(l, '\'' as c_char)?; if is_eof(l) { diagf!(loc(l), c!("LEXER ERROR: Unfinished character literal\n")); diagf!((*l).loc, c!("LEXER INFO: Literal starts here\n")); - (*l).token = Token::ParseError; - return None; + return Err(ErrorKind::Fatal); } - skip_char(l); if (*l).string_storage.count == 0 { diagf!((*l).loc, c!("LEXER ERROR: Empty character literal\n")); - (*l).token = Token::ParseError; - return None; + return Err(ErrorKind::Error); } if (*l).string_storage.count > 2 { - // TODO: maybe we should allow more on targets with 64 bits? - // TODO: such error should not terminate the compilation diagf!((*l).loc, c!("LEXER ERROR: Character literal contains more than two characters\n")); - (*l).token = Token::ParseError; - return None; + return Err(ErrorKind::Error); } (*l).int_number = 0; for i in 0..(*l).string_storage.count { (*l).int_number *= 0x100; (*l).int_number += *(*l).string_storage.items.add(i) as u64; } - return Some(()); + return Ok(()); } diagf!((*l).loc, c!("LEXER ERROR: Unknown token %c\n"), *(*l).parse_point.current as c_int); (*l).token = Token::ParseError; - None + Err(ErrorKind::Fatal) } From 130a3bb60789ae79a28cfd63adc477e7a2c950b3 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Sun, 6 Jul 2025 22:17:23 +0100 Subject: [PATCH 02/19] lexer: Remove unused ParseError token --- src/lexer.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index c10af597..f1d6f390 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -40,7 +40,6 @@ pub enum ErrorKind { pub enum Token { // Terminal EOF, - ParseError, // Values ID, @@ -106,7 +105,6 @@ pub unsafe fn display_token(token: Token) -> *const c_char { match token { // Terminal Token::EOF => c!("end of file"), - Token::ParseError => c!("parse error"), // Values Token::ID => c!("identifier"), @@ -603,6 +601,5 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { } diagf!((*l).loc, c!("LEXER ERROR: Unknown token %c\n"), *(*l).parse_point.current as c_int); - (*l).token = Token::ParseError; Err(ErrorKind::Fatal) } From 1fd0c11046d8ef26b12b6a0b91f71b09cd4b3576 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Sun, 6 Jul 2025 23:08:25 +0100 Subject: [PATCH 03/19] lexer: Fix integer overflow error recovery --- src/lexer.rs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index f1d6f390..2a422d77 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -447,29 +447,30 @@ unsafe fn parse_digit(c: c_char, radix: Radix) -> Option { return None; } -unsafe fn parse_number(l: *mut Lexer, radix: Radix, report_point: Parse_Point) -> Result { +unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { + let mut result = Ok(()); while let Some(x) = peek_char(l) { let Some(d) = parse_digit(x, radix) else { break; }; + skip_char(l); let Some(r) = i64::checked_mul((*l).int_number as i64, radix as i64) else { - (*l).parse_point = report_point; - diagf!(loc(l), c!("LEXER ERROR: Constant integer overflow\n")); - return Err(ErrorKind::Error); + result = Err(ErrorKind::Error); + continue; }; (*l).int_number = r as u64; let Some(r) = i64::checked_add((*l).int_number as i64, d as i64) else { - (*l).parse_point = report_point; - diagf!(loc(l), c!("LEXER ERROR: Constant integer overflow.\n")); - return Err(ErrorKind::Error); + result = Err(ErrorKind::Error); + continue; }; (*l).int_number = r as u64; - skip_char(l); - }; - - return Ok(()); + } + if !result.is_ok() { + diagf!((*l).loc, c!("LEXER ERROR: Constant integer overflow\n")); + } + result } pub unsafe fn get_token(l: *mut Lexer) -> Result { @@ -537,28 +538,26 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { return Ok(()) } - let start_of_number = (*l).parse_point; if skip_prefix(l, c!("0x")) { (*l).token = Token::IntLit; (*l).int_number = 0; if (*l).historical { - (*l).parse_point = start_of_number; - diagf!(loc(l), c!("LEXER ERROR: hex literals are not available in the historical mode.\n")); + diagf!((*l).loc, c!("LEXER ERROR: hex literals are not available in the historical mode.\n")); return Err(ErrorKind::Error); } - return parse_number(l, Radix::Hex, start_of_number); + return parse_number(l, Radix::Hex); } if skip_prefix(l, c!("0")) { (*l).token = Token::IntLit; (*l).int_number = 0; - return parse_number(l, Radix::Oct, start_of_number); + return parse_number(l, Radix::Oct); } if isdigit(x as c_int) != 0 { (*l).token = Token::IntLit; (*l).int_number = 0; - return parse_number(l, Radix::Dec, start_of_number); + return parse_number(l, Radix::Dec); } if x == '"' as c_char { From c434821d09294498fea409776fee4885a5a1f2fe Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Sun, 6 Jul 2025 23:30:10 +0100 Subject: [PATCH 04/19] lexer: Progress past hex literal in historical mode --- src/lexer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lexer.rs b/src/lexer.rs index 2a422d77..e2b1eaa3 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -541,11 +541,12 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { if skip_prefix(l, c!("0x")) { (*l).token = Token::IntLit; (*l).int_number = 0; + parse_number(l, Radix::Hex)?; if (*l).historical { diagf!((*l).loc, c!("LEXER ERROR: hex literals are not available in the historical mode.\n")); return Err(ErrorKind::Error); } - return parse_number(l, Radix::Hex); + return Ok(()); } if skip_prefix(l, c!("0")) { From b293e9f91f149637f43379da9d8a7949242c5755 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Mon, 7 Jul 2025 10:17:42 +0100 Subject: [PATCH 05/19] lexer: Clean up parse_number error handling --- src/lexer.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index e2b1eaa3..2d265576 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -448,7 +448,7 @@ unsafe fn parse_digit(c: c_char, radix: Radix) -> Option { } unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { - let mut result = Ok(()); + let mut overflow = false; while let Some(x) = peek_char(l) { let Some(d) = parse_digit(x, radix) else { break; @@ -456,21 +456,26 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { skip_char(l); let Some(r) = i64::checked_mul((*l).int_number as i64, radix as i64) else { - result = Err(ErrorKind::Error); + overflow = true; continue; }; (*l).int_number = r as u64; let Some(r) = i64::checked_add((*l).int_number as i64, d as i64) else { - result = Err(ErrorKind::Error); + overflow = true; continue; }; (*l).int_number = r as u64; } - if !result.is_ok() { - diagf!((*l).loc, c!("LEXER ERROR: Constant integer overflow\n")); + if (*l).historical && matches!(radix, Radix::Hex) { + diagf!((*l).loc, c!("LEXER ERROR: hex literals are not available in historical mode\n")); + return Err(ErrorKind::Error); } - result + if overflow { + diagf!((*l).loc, c!("LEXER ERROR: integer literal overflow\n")); + return Err(ErrorKind::Error); + } + Ok(()) } pub unsafe fn get_token(l: *mut Lexer) -> Result { @@ -541,12 +546,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { if skip_prefix(l, c!("0x")) { (*l).token = Token::IntLit; (*l).int_number = 0; - parse_number(l, Radix::Hex)?; - if (*l).historical { - diagf!((*l).loc, c!("LEXER ERROR: hex literals are not available in the historical mode.\n")); - return Err(ErrorKind::Error); - } - return Ok(()); + return parse_number(l, Radix::Hex); } if skip_prefix(l, c!("0")) { From c2c80f1eac64676cb0882ce5ce4949b1374f7d24 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Mon, 7 Jul 2025 18:14:05 +0100 Subject: [PATCH 06/19] lexer: Implement peek_token() --- src/b.rs | 13 +++++++++---- src/lexer.rs | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/b.rs b/src/b.rs index e85993ed..78fcf65f 100644 --- a/src/b.rs +++ b/src/b.rs @@ -37,6 +37,13 @@ pub unsafe fn get_token(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } } +pub unsafe fn peek_token(l: *mut Lexer, c: *mut Compiler) -> Option<()> { + match lexer::peek_token(l) { + Some(_) => Some(()), + None => bump_error_count(c).and(None), + } +} + pub unsafe fn expect_tokens(l: *mut Lexer, tokens: *const [Token]) -> Option<()> { for i in 0..tokens.len() { if (*tokens)[i] == (*l).token { @@ -73,13 +80,11 @@ pub unsafe fn get_and_expect_token(l: *mut Lexer, c: *mut Compiler, token: Token } pub unsafe fn get_and_expect_token_but_continue(l: *mut Lexer, c: *mut Compiler, token: Token) -> Option<()> { - let saved_point = (*l).parse_point; - get_token(l, c)?; + peek_token(l, c)?; if let None = expect_token(l, token) { - (*l).parse_point = saved_point; bump_error_count(c) } else { - Some(()) + get_token(l, c) } } diff --git a/src/lexer.rs b/src/lexer.rs index 2d265576..b37f7cfd 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -31,6 +31,7 @@ macro_rules! missingf { pub type Result = core::result::Result<(), ErrorKind>; +#[derive(Clone, Copy)] pub enum ErrorKind { Error, Fatal, @@ -291,6 +292,7 @@ pub struct Lexer { pub string: *const c_char, pub int_number: u64, pub loc: Loc, + pub next_result: Option, } pub unsafe fn new(input_path: *const c_char, input_stream: *const c_char, eof: *const c_char, historical: bool) -> Lexer { @@ -302,6 +304,7 @@ pub unsafe fn new(input_path: *const c_char, input_stream: *const c_char, eof: * l.parse_point.line_start = input_stream; l.parse_point.line_number = 1; l.historical = historical; + l.next_result = None; l } @@ -478,7 +481,19 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { Ok(()) } +pub unsafe fn peek_token(l: *mut Lexer) -> Option { + match (*l).next_result.get_or_insert_with(|| get_token(l)) { + Ok(()) => Some((*l).token), + Err(ErrorKind::Error) => Some((*l).token), + Err(ErrorKind::Fatal) => None, + } +} + pub unsafe fn get_token(l: *mut Lexer) -> Result { + if let Some(result) = (*l).next_result.take() { + return result; + } + 'comments: loop { skip_whitespaces(l); From 5f212932fe2f9eda8ef76438e2f507d97df42155 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 18:29:54 +0100 Subject: [PATCH 07/19] lexer: Preserve parse_point on peek --- src/b.rs | 18 ++++++++---------- src/lexer.rs | 11 +++++++++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/b.rs b/src/b.rs index 78fcf65f..052290cc 100644 --- a/src/b.rs +++ b/src/b.rs @@ -804,16 +804,15 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { compile_statement(l, c)?; - let saved_point = (*l).parse_point; - get_token(l, c)?; + peek_token(l, c)?; if (*l).token == Token::Else { + get_token(l, c)?; let out_label = allocate_label_index(c); push_opcode(Op::JmpLabel{label: out_label}, (*l).loc, c); push_opcode(Op::Label{label: else_label}, (*l).loc, c); compile_statement(l, c)?; push_opcode(Op::Label{label: out_label}, (*l).loc, c); } else { - (*l).parse_point = saved_point; push_opcode(Op::Label{label: else_label}, (*l).loc, c); } @@ -1066,16 +1065,14 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { let name_loc = (*l).loc; declare_var(c, name, name_loc, Storage::External{name})?; - let saved_point = (*l).parse_point; - get_token(l, c)?; + peek_token(l, c)?; if (*l).token == Token::OParen { // Function definition + get_token(l, c)?; scope_push(&mut (*c).vars); // begin function scope let mut params_count = 0; - let saved_point = (*l).parse_point; - get_token(l, c)?; + peek_token(l, c)?; if (*l).token != Token::CParen { - (*l).parse_point = saved_point; 'params: loop { get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena_names, (*l).string); @@ -1090,6 +1087,8 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { _ => unreachable!(), } } + } else { + get_token(l, c)?; } compile_statement(l, c)?; scope_pop(&mut (*c).vars); // end function scope @@ -1119,12 +1118,11 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { (*c).auto_vars_ator = zeroed(); (*c).op_label_count = 0; } else if (*l).token == Token::Asm { // Assembly function definition + get_token(l, c)?; let mut body: Array = zeroed(); compile_asm_stmts(l, c, &mut body)?; da_append(&mut (*c).asm_funcs, AsmFunc {name, name_loc, body}); } else { // Variable definition - (*l).parse_point = saved_point; - let mut global = Global { name, values: zeroed(), diff --git a/src/lexer.rs b/src/lexer.rs index b37f7cfd..40bdf6b8 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -292,7 +292,9 @@ pub struct Lexer { pub string: *const c_char, pub int_number: u64, pub loc: Loc, + pub next_result: Option, + pub next_point: Parse_Point, } pub unsafe fn new(input_path: *const c_char, input_stream: *const c_char, eof: *const c_char, historical: bool) -> Lexer { @@ -482,15 +484,20 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { } pub unsafe fn peek_token(l: *mut Lexer) -> Option { - match (*l).next_result.get_or_insert_with(|| get_token(l)) { + let saved_point = (*l).parse_point; + let token = match (*l).next_result.get_or_insert_with(|| get_token(l)) { Ok(()) => Some((*l).token), Err(ErrorKind::Error) => Some((*l).token), Err(ErrorKind::Fatal) => None, - } + }; + (*l).next_point = (*l).parse_point; + (*l).parse_point = saved_point; + token } pub unsafe fn get_token(l: *mut Lexer) -> Result { if let Some(result) = (*l).next_result.take() { + (*l).parse_point = (*l).next_point; return result; } From 8ec77555744cbfca382a09a51586c6f0d87c6de5 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 20:54:15 +0100 Subject: [PATCH 08/19] lexer: Save next_point only if advanced --- src/b.rs | 6 +++--- src/lexer.rs | 15 +++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/b.rs b/src/b.rs index 052290cc..670086ae 100644 --- a/src/b.rs +++ b/src/b.rs @@ -37,10 +37,10 @@ pub unsafe fn get_token(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } } -pub unsafe fn peek_token(l: *mut Lexer, c: *mut Compiler) -> Option<()> { +pub unsafe fn peek_token(l: *mut Lexer, c: *mut Compiler) -> Option { match lexer::peek_token(l) { - Some(_) => Some(()), - None => bump_error_count(c).and(None), + Some(token) => Some(token), + None => bump_error_count(c).and(None), } } diff --git a/src/lexer.rs b/src/lexer.rs index 40bdf6b8..d190f5ab 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -484,15 +484,18 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { } pub unsafe fn peek_token(l: *mut Lexer) -> Option { - let saved_point = (*l).parse_point; - let token = match (*l).next_result.get_or_insert_with(|| get_token(l)) { + let Some(result) = (*l).next_result else { + let saved_point = (*l).parse_point; + (*l).next_result = Some(get_token(l)); + (*l).next_point = (*l).parse_point; + (*l).parse_point = saved_point; + return peek_token(l); + }; + match result { Ok(()) => Some((*l).token), Err(ErrorKind::Error) => Some((*l).token), Err(ErrorKind::Fatal) => None, - }; - (*l).next_point = (*l).parse_point; - (*l).parse_point = saved_point; - token + } } pub unsafe fn get_token(l: *mut Lexer) -> Result { From 17524a5205f1cfdaf7a04798746de5fa959b7e0e Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:21:17 +0100 Subject: [PATCH 09/19] clexes -> tokens --- src/b.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/b.rs b/src/b.rs index c892f79e..1400065c 100644 --- a/src/b.rs +++ b/src/b.rs @@ -84,9 +84,9 @@ pub unsafe fn get_and_expect_token_but_continue(l: *mut Lexer, c: *mut Compiler, } } -pub unsafe fn get_and_expect_tokens(l: *mut Lexer, c: *mut Compiler, clexes: *const [Token]) -> Option<()> { +pub unsafe fn get_and_expect_tokens(l: *mut Lexer, c: *mut Compiler, tokens: *const [Token]) -> Option<()> { get_token(l, c)?; - expect_tokens(l, clexes) + expect_tokens(l, tokens) } pub unsafe fn expect_token_id(l: *mut Lexer, id: *const c_char) -> Option<()> { From 3d3cacf1d82f0d0d6a0591459caf3290969814de Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:24:56 +0100 Subject: [PATCH 10/19] lexer: Restore accidentally deleted TODO --- src/lexer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lexer.rs b/src/lexer.rs index db6b23f7..4f86168a 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -589,6 +589,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { return Err(ErrorKind::Error); } if (*l).string_storage.count > 2 { + // TODO: maybe we should allow more on targets with 64 bits? diagf!((*l).loc, c!("LEXER ERROR: Character literal contains more than two characters\n")); return Err(ErrorKind::Error); } From 85a48732f91b4196560a80c61a1445a335d4faa9 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:26:25 +0100 Subject: [PATCH 11/19] lexer: puncs -> puncts --- src/lexer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 4f86168a..fcae1ef5 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -508,10 +508,10 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { (*l).token = Token::EOF; return Ok(()) }; - let puncs = if !(*l).historical { PUNCTS } else { HISTORICAL_PUNCTS }; - for i in 0..puncs.len() { - let (prefix, token) = (*puncs)[i]; + let puncts = if !(*l).historical { PUNCTS } else { HISTORICAL_PUNCTS }; + for i in 0..puncts.len() { + let (prefix, token) = (*puncts)[i]; if skip_prefix(l, prefix) { (*l).token = token; return Ok(()) From 35b9e211a80b0ddd4375abe3f1716544dec5a965 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:27:08 +0100 Subject: [PATCH 12/19] lexer: Consistent semicolons for returns --- src/lexer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index fcae1ef5..a681ce33 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -506,7 +506,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { let Some(x) = peek_char(l) else { (*l).token = Token::EOF; - return Ok(()) + return Ok(()); }; let puncts = if !(*l).historical { PUNCTS } else { HISTORICAL_PUNCTS }; @@ -514,7 +514,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { let (prefix, token) = (*puncts)[i]; if skip_prefix(l, prefix) { (*l).token = token; - return Ok(()) + return Ok(()); } } @@ -540,7 +540,7 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { } } - return Ok(()) + return Ok(()); } if skip_prefix(l, c!("0x")) { From 36e305a317ec6697683b6094be464c057aa3e5cd Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:27:59 +0100 Subject: [PATCH 13/19] lexer: Consistent error message capitalization --- src/lexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index a681ce33..ad7c938e 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -468,11 +468,11 @@ unsafe fn parse_number(l: *mut Lexer, radix: Radix) -> Result { (*l).int_number = r; } if (*l).historical && matches!(radix, Radix::Hex) { - diagf!((*l).loc, c!("LEXER ERROR: hex literals are not available in historical mode\n")); + diagf!((*l).loc, c!("LEXER ERROR: Hex literals are not available in historical mode\n")); return Err(ErrorKind::Error); } if overflow { - diagf!((*l).loc, c!("LEXER ERROR: integer literal overflow\n")); + diagf!((*l).loc, c!("LEXER ERROR: Integer literal overflow\n")); return Err(ErrorKind::Error); } Ok(()) From b6645fd101047649101fce06335716a8a6730689 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 21:41:19 +0100 Subject: [PATCH 14/19] lexer: Document problem with historical comments --- src/lexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index ad7c938e..e12143af 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -484,13 +484,13 @@ pub unsafe fn get_token(l: *mut Lexer) -> Result { let saved_point = (*l).parse_point; if skip_prefix(l, c!("//")) { + skip_until(l, c!("\n")); if (*l).historical { (*l).parse_point = saved_point; diagf!(loc(l), c!("LEXER ERROR: C++ style comments are not available in the historical mode.\n")); - // TODO: Convert to recoverable error. Need to advance lexer to not get stuck. + // TODO: Convert to recoverable error. The problem is we don't yet have a token at this point, so can't return non-fatal here. return Err(ErrorKind::Fatal); } - skip_until(l, c!("\n")); continue 'comments; } From 1fbc7f238de487f8d0b7a7d4bbadb875c94b2d95 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 22:10:02 +0100 Subject: [PATCH 15/19] Use peek_token rather than saved_point --- src/b.rs | 109 +++++++++++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/src/b.rs b/src/b.rs index 43565b30..8b444a58 100644 --- a/src/b.rs +++ b/src/b.rs @@ -496,12 +496,13 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt let (mut arg, mut is_lvalue) = arg?; loop { - let saved_point = (*l).parse_point; - get_token(l, c)?; - - (arg, is_lvalue) = match (*l).token { - Token::OParen => Some((compile_function_call(l, c, arg)?, false)), + (arg, is_lvalue) = match peek_token(l, c)? { + Token::OParen => { + get_token(l, c)?; + Some((compile_function_call(l, c, arg)?, false)) + } Token::OBracket => { + get_token(l, c)?; let (offset, _) = compile_expression(l, c)?; get_and_expect_token_but_continue(l, c, Token::CBracket)?; @@ -511,6 +512,7 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt Some((Arg::Deref(result), true)) } Token::PlusPlus => { + get_token(l, c)?; let loc = (*l).loc; if !is_lvalue { diagf!(loc, c!("ERROR: cannot increment an rvalue\n")); @@ -524,6 +526,7 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt Some((Arg::AutoVar(pre), false)) } Token::MinusMinus => { + get_token(l, c)?; let loc = (*l).loc; if !is_lvalue { diagf!(loc, c!("ERROR: cannot decrement an rvalue\n")); @@ -537,7 +540,6 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt Some((Arg::AutoVar(pre), false)) } _ => { - (*l).parse_point = saved_point; return Some((arg, is_lvalue)); } }?; @@ -574,39 +576,31 @@ pub unsafe fn compile_binop_expression(l: *mut Lexer, c: *mut Compiler, preceden let (mut lhs, mut lvalue) = compile_binop_expression(l, c, precedence + 1)?; - let mut saved_point = (*l).parse_point; - get_token(l, c)?; - - if let Some(binop) = Binop::from_token((*l).token) { + if let Some(binop) = Binop::from_token(peek_token(l, c)?) { if binop.precedence() == precedence { - while let Some(binop) = Binop::from_token((*l).token) { + while let Some(binop) = Binop::from_token(peek_token(l, c)?) { if binop.precedence() != precedence { break; } + get_token(l, c)?; let (rhs, _) = compile_binop_expression(l, c, precedence + 1)?; let index = allocate_auto_var(&mut (*c).auto_vars_ator); push_opcode(Op::Binop {binop, index, lhs, rhs}, (*l).loc, c); - lhs = Arg::AutoVar(index); + lhs = Arg::AutoVar(index); lvalue = false; - - saved_point = (*l).parse_point; - get_token(l, c)?; } } } - (*l).parse_point = saved_point; Some((lhs, lvalue)) } pub unsafe fn compile_assign_expression(l: *mut Lexer, c: *mut Compiler) -> Option<(Arg, bool)> { let (lhs, mut lvalue) = compile_binop_expression(l, c, 0)?; - let mut saved_point = (*l).parse_point; - get_token(l, c)?; - - while let Some(binop) = Binop::from_assign_token((*l).token) { + while let Some(binop) = Binop::from_assign_token(peek_token(l, c)?) { + get_token(l, c)?; let binop_loc = (*l).loc; let (rhs, _) = compile_assign_expression(l, c)?; @@ -636,12 +630,10 @@ pub unsafe fn compile_assign_expression(l: *mut Lexer, c: *mut Compiler) -> Opti } lvalue = false; - - saved_point = (*l).parse_point; - get_token(l, c)?; } - if (*l).token == Token::Question { + if peek_token(l, c)? == Token::Question { + get_token(l, c)?; let result = allocate_auto_var(&mut (*c).auto_vars_ator); let else_label = allocate_label_index(c); @@ -661,7 +653,6 @@ pub unsafe fn compile_assign_expression(l: *mut Lexer, c: *mut Compiler) -> Opti Some((Arg::AutoVar(result), false)) } else { - (*l).parse_point = saved_point; Some((lhs, lvalue)) } } @@ -671,21 +662,17 @@ pub unsafe fn compile_expression(l: *mut Lexer, c: *mut Compiler) -> Option<(Arg } pub unsafe fn compile_block(l: *mut Lexer, c: *mut Compiler) -> Option<()> { - loop { - let saved_point = (*l).parse_point; - get_token(l, c)?; - if (*l).token == Token::CCurly { return Some(()); } - (*l).parse_point = saved_point; - - compile_statement(l, c)? + while peek_token(l, c)? != Token::CCurly { + compile_statement(l, c)?; } + get_and_expect_token(l, c, Token::CCurly) } - unsafe fn compile_function_call(l: *mut Lexer, c: *mut Compiler, fun: Arg) -> Option { + +unsafe fn compile_function_call(l: *mut Lexer, c: *mut Compiler, fun: Arg) -> Option { let mut args: Array = zeroed(); - let saved_point = (*l).parse_point; - get_token(l, c)?; - if (*l).token != Token::CParen { - (*l).parse_point = saved_point; + if peek_token(l, c)? == Token::CParen { + get_token(l, c)?; + } else { loop { let (expr, _) = compile_expression(l, c)?; da_append(&mut args, expr); @@ -714,10 +701,9 @@ pub unsafe fn name_declare_if_not_exists(names: *mut Array<*const c_char>, name: pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Array) -> Option<()> { get_and_expect_token_but_continue(l, c, Token::OParen)?; - let saved_point = (*l).parse_point; - get_token(l, c)?; - if (*l).token != Token::CParen { - (*l).parse_point = saved_point; + if peek_token(l, c)? == Token::CParen { + get_token(l, c)?; + } else { loop { get_and_expect_token(l, c, Token::String)?; match (*l).token { @@ -742,14 +728,13 @@ pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Arr } pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { - let saved_point = (*l).parse_point; - get_token(l, c)?; - - match (*l).token { + match peek_token(l, c)? { Token::SemiColon => { + get_token(l, c)?; Some(()) }, Token::OCurly => { + get_token(l, c)?; scope_push(&mut (*c).vars); let saved_auto_vars_count = (*c).auto_vars_ator.count; compile_block(l, c)?; @@ -758,6 +743,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Extrn => { + get_token(l, c)?; while (*l).token != Token::SemiColon { get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena, (*l).string); @@ -768,6 +754,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { compile_statement(l, c) } Token::Auto => { + get_token(l, c)?; while (*l).token != Token::SemiColon { get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena, (*l).string); @@ -793,6 +780,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { compile_statement(l, c) } Token::If => { + get_token(l, c)?; get_and_expect_token_but_continue(l, c, Token::OParen)?; let saved_auto_vars_count = (*c).auto_vars_ator.count; let (cond, _) = compile_expression(l, c)?; @@ -803,8 +791,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { compile_statement(l, c)?; - peek_token(l, c)?; - if (*l).token == Token::Else { + if peek_token(l, c)? == Token::Else { get_token(l, c)?; let out_label = allocate_label_index(c); push_opcode(Op::JmpLabel{label: out_label}, (*l).loc, c); @@ -818,6 +805,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::While => { + get_token(l, c)?; let cond_label = allocate_label_index(c); push_opcode(Op::Label {label: cond_label}, (*l).loc, c); @@ -837,6 +825,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Return => { + get_token(l, c)?; get_and_expect_tokens(l, c, &[Token::SemiColon, Token::OParen])?; if (*l).token == Token::SemiColon { push_opcode(Op::Return {arg: None}, (*l).loc, c); @@ -851,6 +840,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Goto => { + get_token(l, c)?; get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena, (*l).string); let loc = (*l).loc; @@ -861,6 +851,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Asm => { + get_token(l, c)?; let loc = (*l).loc; let mut stmts: Array = zeroed(); compile_asm_stmts(l, c, &mut stmts)?; @@ -868,6 +859,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } Token::Case => { + get_token(l, c)?; let case_loc = (*l).loc; get_token(l, c)?; expect_tokens(l, &[Token::IntLit, Token::CharLit])?; // TODO: String ??! @@ -905,6 +897,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } } Token::Switch => { + get_token(l, c)?; let saved_auto_vars_count = (*c).auto_vars_ator.count; let switch_loc = (*l).loc; @@ -925,7 +918,9 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Some(()) } _ => { - if (*l).token == Token::ID { + if peek_token(l, c)? == Token::ID { + let saved_point = (*l).parse_point; + get_token(l, c)?; let name = arena::strdup(&mut (*c).arena, (*l).string); let name_loc = (*l).loc; get_token(l, c)?; @@ -935,8 +930,8 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { define_goto_label(c, name, name_loc, label)?; return Some(()); } + (*l).parse_point = saved_point; } - (*l).parse_point = saved_point; let saved_auto_vars_count = (*c).auto_vars_ator.count; compile_expression(l, c)?; (*c).auto_vars_ator.count = saved_auto_vars_count; @@ -1091,17 +1086,14 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { let name_loc = (*l).loc; declare_var(c, name, name_loc, Storage::External{name})?; - let saved_point = (*l).parse_point; - get_token(l, c)?; - - match (*l).token { + match peek_token(l, c)? { Token::OParen => { // Function definition + get_token(l, c)?; scope_push(&mut (*c).vars); // begin function scope let mut params_count = 0; - let saved_point = (*l).parse_point; - get_token(l, c)?; - if (*l).token != Token::CParen { - (*l).parse_point = saved_point; + if peek_token(l, c)? == Token::CParen { + get_token(l, c)?; + } else { 'params: loop { get_and_expect_token(l, c, Token::ID)?; let name = arena::strdup(&mut (*c).arena, (*l).string); @@ -1145,13 +1137,12 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { (*c).op_label_count = 0; } Token::Asm => { // Assembly function definition + get_token(l, c)?; let mut body: Array = zeroed(); compile_asm_stmts(l, c, &mut body)?; da_append(&mut (*c).asm_funcs, AsmFunc {name, name_loc, body}); } _ => { // Variable definition - (*l).parse_point = saved_point; - let mut global = Global { name, values: zeroed(), From 6047287c199ec764196f14922b6056934299523c Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 22:22:32 +0100 Subject: [PATCH 16/19] Add test for lexer errors --- tests.json | 49 +++++++++++++++++++++++++++++++++++++++++++- tests/lexer_errors.b | 14 +++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 tests/lexer_errors.b diff --git a/tests.json b/tests.json index d854ee2f..21028848 100644 --- a/tests.json +++ b/tests.json @@ -1878,5 +1878,52 @@ "state": "Enabled", "comment": "" } + }, + "lexer_errors": { + "gas-x86_64-windows": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "gas-x86_64-linux": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "gas-x86_64-darwin": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "gas-aarch64-linux": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "gas-aarch64-darwin": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "uxn": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "6502": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "fasm-x86_64-windows": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + }, + "fasm-x86_64-linux": { + "expected_stdout": "", + "state": "Disabled", + "comment": "TODO: Expected to not compile." + } } -} +} \ No newline at end of file diff --git a/tests/lexer_errors.b b/tests/lexer_errors.b new file mode 100644 index 00000000..29873693 --- /dev/null +++ b/tests/lexer_errors.b @@ -0,0 +1,14 @@ +// TODO: Enable in test suite once negative testing is supported. + +main() { + // Invalid character literals + ''; + 'EEE'; + + // Literal overflow + 0xfffffffffffffffffffff; + 07777777777777777777777; + + // Unknown escape sequences + "\foo\bar\baz"; +} From 0f8f761b47efb80dabb2cde1f4a8429dc463ac6e Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 23:45:20 +0100 Subject: [PATCH 17/19] lexer: Don't be fancy --- src/lexer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lexer.rs b/src/lexer.rs index 72114fbf..6278b2c4 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -499,7 +499,8 @@ pub unsafe fn peek_token(l: *mut Lexer) -> Option { } pub unsafe fn get_token(l: *mut Lexer) -> Result { - if let Some(result) = (*l).next_result.take() { + if let Some(result) = (*l).next_result { + (*l).next_result = None; (*l).parse_point = (*l).next_point; return result; } From 95d1f97bc0573ea607470a865dfa5f98dadd4119 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 23:48:08 +0100 Subject: [PATCH 18/19] lexer: Document ErrorKind intent --- src/lexer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 6278b2c4..bc0e937f 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -33,8 +33,8 @@ pub type Result = core::result::Result<(), ErrorKind>; #[derive(Clone, Copy)] pub enum ErrorKind { - Error, - Fatal, + Error, // An (invalid) token is available; compilation can continue. + Fatal, // No token is available; compilation must be terminated. } #[derive(Clone, Copy, PartialEq)] From 6f0cdf814809ce436980e0e6058685a740ce9e01 Mon Sep 17 00:00:00 2001 From: Mikko Marttila <13412395+mikmart@users.noreply.github.com> Date: Tue, 8 Jul 2025 23:54:46 +0100 Subject: [PATCH 19/19] Only allocate label when needed --- src/b.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/b.rs b/src/b.rs index 8b444a58..03a8fe45 100644 --- a/src/b.rs +++ b/src/b.rs @@ -921,10 +921,10 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { if peek_token(l, c)? == Token::ID { let saved_point = (*l).parse_point; get_token(l, c)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); - let name_loc = (*l).loc; get_token(l, c)?; if (*l).token == Token::Colon { + let name = arena::strdup(&mut (*c).arena, (*l).string); + let name_loc = (*l).loc; let label = allocate_label_index(c); push_opcode(Op::Label{label}, name_loc, c); define_goto_label(c, name, name_loc, label)?;