diff --git a/.gitignore b/.gitignore index b1dd771e..4e2a954a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ __pycache__ .envrc *.idea *.prof +.ruff_cache/ +.mypy_cache/ aaa-stdlib/target/* aaa-stdlib-user/target/* diff --git a/aaa/parser/models.py b/aaa/parser/models.py index cdd77195..089c2ac1 100644 --- a/aaa/parser/models.py +++ b/aaa/parser/models.py @@ -341,12 +341,18 @@ def __init__( def get_source_file(self) -> Path: source_path = Path(self.source.value) - if source_path.is_file() and self.source.value.endswith(".aaa"): - return source_path + if source_path.is_absolute(): + if source_path.is_file() and self.source.value.endswith(".aaa"): + return source_path + else: - return self.position.file.parent / ( - self.source.value.replace(".", os.sep) + ".aaa" - ) + source_path = (self.position.file.parent / source_path).resolve() + if source_path.is_file() and self.source.value.endswith(".aaa"): + return source_path + + return self.position.file.parent / ( + self.source.value.replace(".", os.sep) + ".aaa" + ) @classmethod def load(cls, children: list[AaaParseModel | Token]) -> Import: diff --git a/aaa/type_checker/type_checker.py b/aaa/type_checker/type_checker.py index 4cbb03ba..cc180349 100644 --- a/aaa/type_checker/type_checker.py +++ b/aaa/type_checker/type_checker.py @@ -256,6 +256,10 @@ def _confirm_return_types( if not isinstance(computed_value, VariableType): return False + # A struct is never equal to an enum + if not isinstance(computed_value.type, type(expected_value.type)): + return False + if computed_value.type != expected_value.type: return False @@ -312,6 +316,10 @@ def _match_signature_items( if not isinstance(var_type, VariableType): raise SignatureItemMismatch + # prevent comparing enum with struct + if not isinstance(expected_var_type.type, type(var_type.type)): + raise SignatureItemMismatch + if expected_var_type.type != var_type.type: raise SignatureItemMismatch diff --git a/examples/selfhosting/main.aaa b/examples/selfhosting/main.aaa new file mode 100644 index 00000000..6c917451 --- /dev/null +++ b/examples/selfhosting/main.aaa @@ -0,0 +1,19 @@ +from "syntax_loader" import + syntax_loader_new_from_file, + SyntaxLoaderResult, + +fn main { + "syntax.json" syntax_loader_new_from_file + match { + case SyntaxLoaderResult:error as error { + error . + "\n" . + 1 exit + } + case SyntaxLoaderResult:ok as syntax_loader { + syntax_loader + } + } + + drop // TODO do something with SyntaxLoader +} diff --git a/examples/selfhosting/syntax_loader.aaa b/examples/selfhosting/syntax_loader.aaa new file mode 100644 index 00000000..034341b0 --- /dev/null +++ b/examples/selfhosting/syntax_loader.aaa @@ -0,0 +1,200 @@ +from "../json/parser.aaa" import json_from_str, Json, JsonResult, JsonError + +enum SyntaxLoaderResult { + ok as SyntaxLoader, + error as str, +} + +struct SyntaxLoader { + tokens as map[str, regex], + nodes as map[str, str], + filtered_tokens as vec[str], + root_node as str, +} + +enum FileReadError { + open_error as str, + read_error as str, +} + +fn FileReadError:to_str args error as FileReadError return str { + error + match { + case FileReadError:open_error as path { "Could not open " path str:append } + case FileReadError:read_error as path { "Could not read " path str:append } + } +} + +enum FileReadResult { + ok as str, + error as FileReadError, +} + +fn read_file args path as const str, return FileReadResult { + path 0 0 open + + use fd, open_ok { + if open_ok not { + path + copy swap drop // TODO make ticket for const arguments of enum-ctors + FileReadError:open_error FileReadResult:error return + } + + "" + use content { + while true { + fd 4096 read + + use buff, read_ok { + if read_ok not { + path + copy swap drop + FileReadError:read_error FileReadResult:error return + } + + if buff "" = { + content FileReadResult:ok return + } + + content <- { content buff str:append } + } + } + } + } +} + +fn syntax_loader_new_from_file args path as str return SyntaxLoaderResult { + path read_file + match { + case FileReadResult:ok as text { text syntax_loader_new_from_str } + case FileReadResult:error as error { error FileReadError:to_str SyntaxLoaderResult:error } + } +} + +enum LoadTokensResult { + ok as map[str, regex], + error as str, +} + +enum LoadNodesResult { + ok as map[str, str], + error as str, +} + +enum LoadFilteredTokensResult { + ok as vec[str], + error as str, +} + +enum LoadRootNodeResult { + ok as str, + error as str, +} + +fn load_regular_tokens args root_object as map[str, Json] return LoadTokensResult { + todo // TODO +} + +fn load_keyword_tokens args root_object as map[str, Json] return LoadTokensResult { + todo // TODO +} + +enum MergeMapsResult { + ok as map[str, regex], + error as vec[str], +} + +// Merge maps but fail if maps have overlapping keys +fn merge_maps args lhs as map[str, regex], rhs as map[str, regex] return MergeMapsResult { + vec[str] map[str, regex] + use overlap, merged { + lhs foreach { + use key, value { + if rhs key map:has_key { + overlap key vec:push + } + merged key value map:set + } + } + + if overlap vec:empty not { + overlap MergeMapsResult:error return + } + + rhs foreach { + use key, value { + merged key value map:set + } + } + + merged MergeMapsResult:ok + } +} + +fn load_tokens args root_object as map[str, Json] return LoadTokensResult { + root_object load_keyword_tokens + match { + case LoadTokensResult:error as error { error LoadTokensResult:error return } + case LoadTokensResult:ok as keyword_tokens { + root_object load_regular_tokens + match { + case LoadTokensResult:error as error { error LoadTokensResult:error return } + case LoadTokensResult:ok as regular_tokens { keyword_tokens regular_tokens } + } + } + } + + use keyword_tokens, regular_tokens { + keyword_tokens regular_tokens merge_maps + match { + case MergeMapsResult:error { todo } + case MergeMapsResult:ok as tokens { tokens LoadTokensResult:ok } + } + } +} + +fn load_nodes args root_object as map[str, Json] return LoadNodesResult { + todo // TODO +} + +fn load_filtered_tokens args root_object as map[str, Json] return LoadFilteredTokensResult { + todo // TODO +} + +fn load_root_node args root_object as map[str, Json] return LoadRootNodeResult { + todo // TODO +} + +fn syntax_loader_new_from_str args text as str return SyntaxLoaderResult { + text json_from_str + match { + case JsonResult:error as error { error JsonError:to_str SyntaxLoaderResult:error return } + case JsonResult:ok as json { json } + } + + use json { + json + match { + case Json:object as object { object } + default { "json root should be an object" SyntaxLoaderResult:error return } + } + } + + SyntaxLoader + use root_object, syntax_loader { + root_object load_tokens + match { + case LoadTokensResult:ok as tokens { syntax_loader "tokens" { tokens } ! } + case LoadTokensResult:error as error { error SyntaxLoaderResult:error return } + } + + todo + // TODO filtered tokens + // TODO load nodes + // TODO root node + // TODO check for extra values in root_dict + } + + // TODO run equivalent of `_check_values()` + // TODO run equivalent of `_load_parsers()` +} diff --git a/examples/selfhosting/test_tokenizer.aaa b/examples/selfhosting/test_tokenizer.aaa deleted file mode 100644 index 20b0a9b0..00000000 --- a/examples/selfhosting/test_tokenizer.aaa +++ /dev/null @@ -1,389 +0,0 @@ -from "tokenizer" import - make_tokenizer, - OptionalToken, - Token, - Tokenizer, - TokenType, - - -fn check_tokenize_whitespace_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_whitespace - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_whitespace_fail { - "" 0 check_tokenize_whitespace_fail - "." 0 check_tokenize_whitespace_fail - "a" 0 check_tokenize_whitespace_fail - "\\" 0 check_tokenize_whitespace_fail - "/" 0 check_tokenize_whitespace_fail -} - -fn check_tokenize_whitespace args input as str, offset as int, expected_value as const str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_whitespace - match { - case OptionalToken:some { - use token_ { - token_ "value" ? - use value { - value expected_value = assert - } - } - } - case OptionalToken:none { unreachable } - } -} - -fn test_tokenize_whitespace_ok { - "a a" 1 " " check_tokenize_whitespace - "a\na" 1 "\n" check_tokenize_whitespace - "a\ra" 1 "\r" check_tokenize_whitespace - "a \n \r\r\n \n\na" 1 " \n \r\r\n \n\n" check_tokenize_whitespace -} - -fn check_tokenize_comment_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_comment - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_comment_fail { - "" 0 check_tokenize_comment_fail - "." 0 check_tokenize_comment_fail - "a" 0 check_tokenize_comment_fail - "/" 0 check_tokenize_comment_fail - "a/" 0 check_tokenize_comment_fail - "/ab" 0 check_tokenize_comment_fail -} - -fn check_tokenize_comment args input as str, offset as int, expected_value as const str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_comment - match { - case OptionalToken:some { - use token_ { - token_ "value" ? - use value { - value expected_value = assert - } - } - } - case OptionalToken:none { unreachable } - } -} - -fn test_tokenize_comment_ok { - "//" 0 "//" check_tokenize_comment - "// something " 0 "// something " check_tokenize_comment - "a// something " 1 "// something " check_tokenize_comment - "// something \nb" 0 "// something " check_tokenize_comment - "a// something \nb" 1 "// something " check_tokenize_comment - "a// something \n" 1 "// something " check_tokenize_comment -} - -fn check_tokenize_integer_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_integer - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_integer_fail { - "" 0 check_tokenize_integer_fail - "." 0 check_tokenize_integer_fail - "a" 0 check_tokenize_integer_fail - "/" 0 check_tokenize_integer_fail - "\\" 0 check_tokenize_integer_fail - "-" 0 check_tokenize_integer_fail -} - -fn check_tokenize_integer args input as str, offset as int, expected_value as const str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_integer - match { - case OptionalToken:some { - use token_ { - token_ "value" ? - use value { - value expected_value = assert - } - } - } - case OptionalToken:none { unreachable } - } - "123a" 0 "123" check_tokenize_integer - "a123a" 1 "123" check_tokenize_integer - "a123" 1 "123" check_tokenize_integer - "-123a" 0 "-123" check_tokenize_integer - "a-123a" 1 "-123" check_tokenize_integer - "a-123" 1 "-123" check_tokenize_integer -} - -fn check_tokenize_fixed_size_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_fixed_size - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_fixed_size_fail { - "argsabc" 0 check_tokenize_fixed_size_fail - "asabc" 0 check_tokenize_fixed_size_fail - "caseabc" 0 check_tokenize_fixed_size_fail - "constabc" 0 check_tokenize_fixed_size_fail - "defaultabc" 0 check_tokenize_fixed_size_fail - "elseabc" 0 check_tokenize_fixed_size_fail - "enumabc" 0 check_tokenize_fixed_size_fail - "falseabc" 0 check_tokenize_fixed_size_fail - "fnabc" 0 check_tokenize_fixed_size_fail - "foreachabc" 0 check_tokenize_fixed_size_fail - "fromabc" 0 check_tokenize_fixed_size_fail - "ifabc" 0 check_tokenize_fixed_size_fail - "importabc" 0 check_tokenize_fixed_size_fail - "matchabc" 0 check_tokenize_fixed_size_fail - "neverabc" 0 check_tokenize_fixed_size_fail - "returnabc" 0 check_tokenize_fixed_size_fail - "structabc" 0 check_tokenize_fixed_size_fail - "trueabc" 0 check_tokenize_fixed_size_fail - "typeabc" 0 check_tokenize_fixed_size_fail - "useabc" 0 check_tokenize_fixed_size_fail - "whileabc" 0 check_tokenize_fixed_size_fail -} - -fn check_tokenize_fixed_size args input as str, offset as int, expected_type as TokenType, expected_value as str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_fixed_size - match { - case OptionalToken:some { - use token_ { - token_ - - Token - dup "type_" { expected_type } ! - dup "value" { expected_value } ! - - = assert - } - } - case OptionalToken:none { unreachable } - } -} - -fn test_tokenize_fixed_size_ok_bare { - "-" 0 TokenType:identifier "-" check_tokenize_fixed_size - "," 0 TokenType:comma "," check_tokenize_fixed_size - ":" 0 TokenType:colon ":" check_tokenize_fixed_size - "!" 0 TokenType:set_field "!" check_tokenize_fixed_size - "?" 0 TokenType:get_field "?" check_tokenize_fixed_size - "." 0 TokenType:identifier "." check_tokenize_fixed_size - "[" 0 TokenType:sq_start "[" check_tokenize_fixed_size - "]" 0 TokenType:sq_end "]" check_tokenize_fixed_size - "{" 0 TokenType:start "{" check_tokenize_fixed_size - "}" 0 TokenType:end "}" check_tokenize_fixed_size - "*" 0 TokenType:identifier "*" check_tokenize_fixed_size - "/" 0 TokenType:identifier "/" check_tokenize_fixed_size - "%" 0 TokenType:identifier "%" check_tokenize_fixed_size - "+" 0 TokenType:identifier "+" check_tokenize_fixed_size - "<-" 0 TokenType:assign "<-" check_tokenize_fixed_size - "<" 0 TokenType:identifier "<" check_tokenize_fixed_size - "<=" 0 TokenType:identifier "<=" check_tokenize_fixed_size - "=" 0 TokenType:identifier "=" check_tokenize_fixed_size - ">" 0 TokenType:identifier ">" check_tokenize_fixed_size - ">=" 0 TokenType:identifier ">=" check_tokenize_fixed_size - "args" 0 TokenType:args_ "args" check_tokenize_fixed_size - "as" 0 TokenType:as_ "as" check_tokenize_fixed_size - "call" 0 TokenType:call_ "call" check_tokenize_fixed_size - "case" 0 TokenType:case_ "case" check_tokenize_fixed_size - "const" 0 TokenType:const_ "const" check_tokenize_fixed_size - "default" 0 TokenType:default_ "default" check_tokenize_fixed_size - "else" 0 TokenType:else_ "else" check_tokenize_fixed_size - "enum" 0 TokenType:enum_ "enum" check_tokenize_fixed_size - "false" 0 TokenType:false_ "false" check_tokenize_fixed_size - "fn" 0 TokenType:fn_ "fn" check_tokenize_fixed_size - "foreach" 0 TokenType:foreach_ "foreach" check_tokenize_fixed_size - "from" 0 TokenType:from_ "from" check_tokenize_fixed_size - "if" 0 TokenType:if_ "if" check_tokenize_fixed_size - "import" 0 TokenType:import_ "import" check_tokenize_fixed_size - "match" 0 TokenType:match_ "match" check_tokenize_fixed_size - "never" 0 TokenType:never_ "never" check_tokenize_fixed_size - "return" 0 TokenType:return_ "return" check_tokenize_fixed_size - "struct" 0 TokenType:struct_ "struct" check_tokenize_fixed_size - "true" 0 TokenType:true_ "true" check_tokenize_fixed_size - "use" 0 TokenType:use_ "use" check_tokenize_fixed_size - "while" 0 TokenType:while_ "while" check_tokenize_fixed_size -} - -fn test_tokenize_fixed_size_ok_middle { - " - " 1 TokenType:identifier "-" check_tokenize_fixed_size - " , " 1 TokenType:comma "," check_tokenize_fixed_size - " : " 1 TokenType:colon ":" check_tokenize_fixed_size - " ! " 1 TokenType:set_field "!" check_tokenize_fixed_size - " ? " 1 TokenType:get_field "?" check_tokenize_fixed_size - " . " 1 TokenType:identifier "." check_tokenize_fixed_size - " [ " 1 TokenType:sq_start "[" check_tokenize_fixed_size - " ] " 1 TokenType:sq_end "]" check_tokenize_fixed_size - " { " 1 TokenType:start "{" check_tokenize_fixed_size - " } " 1 TokenType:end "}" check_tokenize_fixed_size - " * " 1 TokenType:identifier "*" check_tokenize_fixed_size - " / " 1 TokenType:identifier "/" check_tokenize_fixed_size - " % " 1 TokenType:identifier "%" check_tokenize_fixed_size - " + " 1 TokenType:identifier "+" check_tokenize_fixed_size - " <- " 1 TokenType:assign "<-" check_tokenize_fixed_size - " < " 1 TokenType:identifier "<" check_tokenize_fixed_size - " <= " 1 TokenType:identifier "<=" check_tokenize_fixed_size - " = " 1 TokenType:identifier "=" check_tokenize_fixed_size - " > " 1 TokenType:identifier ">" check_tokenize_fixed_size - " >= " 1 TokenType:identifier ">=" check_tokenize_fixed_size - " args " 1 TokenType:args_ "args" check_tokenize_fixed_size - " as " 1 TokenType:as_ "as" check_tokenize_fixed_size - " call " 1 TokenType:call_ "call" check_tokenize_fixed_size - " case " 1 TokenType:case_ "case" check_tokenize_fixed_size - " const " 1 TokenType:const_ "const" check_tokenize_fixed_size - " default " 1 TokenType:default_ "default" check_tokenize_fixed_size - " else " 1 TokenType:else_ "else" check_tokenize_fixed_size - " enum " 1 TokenType:enum_ "enum" check_tokenize_fixed_size - " false " 1 TokenType:false_ "false" check_tokenize_fixed_size - " fn " 1 TokenType:fn_ "fn" check_tokenize_fixed_size - " foreach " 1 TokenType:foreach_ "foreach" check_tokenize_fixed_size - " from " 1 TokenType:from_ "from" check_tokenize_fixed_size - " if " 1 TokenType:if_ "if" check_tokenize_fixed_size - " import " 1 TokenType:import_ "import" check_tokenize_fixed_size - " match " 1 TokenType:match_ "match" check_tokenize_fixed_size - " never " 1 TokenType:never_ "never" check_tokenize_fixed_size - " return " 1 TokenType:return_ "return" check_tokenize_fixed_size - " struct " 1 TokenType:struct_ "struct" check_tokenize_fixed_size - " true " 1 TokenType:true_ "true" check_tokenize_fixed_size - " use " 1 TokenType:use_ "use" check_tokenize_fixed_size - " while " 1 TokenType:while_ "while" check_tokenize_fixed_size -} - -fn check_tokenize_string_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_string - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_string_fail { - "" 0 check_tokenize_string_fail - "a" 0 check_tokenize_string_fail - "." 0 check_tokenize_string_fail - "." 0 check_tokenize_string_fail - "\"" 0 check_tokenize_string_fail - "a\"" 0 check_tokenize_string_fail - "\"a" 0 check_tokenize_string_fail - "\"\\" 0 check_tokenize_string_fail -} - -fn check_tokenize_string args input as str, offset as int, expected_value as const str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_string - match { - case OptionalToken:some { - use token_ { - token_ "value" ? - use value { - value expected_value = assert - } - } - } - case OptionalToken:none { unreachable } - } -} - -fn test_tokenize_string_ok { - "\"\"" 0 "\"\"" check_tokenize_string - "\"\" " 0 "\"\"" check_tokenize_string - " \"\"" 1 "\"\"" check_tokenize_string - " \"\" " 1 "\"\"" check_tokenize_string - "\"a\"" 0 "\"a\"" check_tokenize_string - " \"a\"" 1 "\"a\"" check_tokenize_string - " \"a\" " 1 "\"a\"" check_tokenize_string - "\"\n\"" 0 "\"\n\"" check_tokenize_string - " \"\n\"" 1 "\"\n\"" check_tokenize_string - " \"\n\" " 1 "\"\n\"" check_tokenize_string - "\"\\n\"" 0 "\"\\n\"" check_tokenize_string - " \"\\n\"" 1 "\"\\n\"" check_tokenize_string - " \"\\n\" " 1 "\"\\n\"" check_tokenize_string - "\"\\\\\"" 0 "\"\\\\\"" check_tokenize_string - " \"\\\\\"" 1 "\"\\\\\"" check_tokenize_string - " \"\\\\\" " 1 "\"\\\\\"" check_tokenize_string -} - -fn check_tokenize_identifier_fail args input as str, offset as int { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_string - match { - case OptionalToken:some { unreachable } - case OptionalToken:none { nop } - } -} - -fn test_tokenize_identifier_fail { - "" 0 check_tokenize_identifier_fail - "3" 0 check_tokenize_identifier_fail - "." 0 check_tokenize_identifier_fail - "\n" 0 check_tokenize_identifier_fail -} - -fn check_tokenize_identifier args input as str, offset as int, expected_value as const str { - "" input make_tokenizer - dup "offset" { offset } ! - Tokenizer:tokenize_identifier - match { - case OptionalToken:some { - use token_ { - token_ "value" ? - use value { - value expected_value = assert - } - } - } - case OptionalToken:none { unreachable } - } -} - -fn test_tokenize_identifier_ok { - "a" 0 "a" check_tokenize_identifier - "z" 0 "z" check_tokenize_identifier - "A" 0 "A" check_tokenize_identifier - "Z" 0 "Z" check_tokenize_identifier - "_" 0 "_" check_tokenize_identifier - - " a " 1 "a" check_tokenize_identifier - " z " 1 "z" check_tokenize_identifier - " A " 1 "A" check_tokenize_identifier - " Z " 1 "Z" check_tokenize_identifier - " _ " 1 "_" check_tokenize_identifier - - "aaaa" 0 "aaaa" check_tokenize_identifier - "zzzz" 0 "zzzz" check_tokenize_identifier - "AAAA" 0 "AAAA" check_tokenize_identifier - "ZZZZ" 0 "ZZZZ" check_tokenize_identifier - "____" 0 "____" check_tokenize_identifier -} diff --git a/examples/selfhosting/tokenizer.aaa b/examples/selfhosting/tokenizer.aaa deleted file mode 100644 index 35443772..00000000 --- a/examples/selfhosting/tokenizer.aaa +++ /dev/null @@ -1,628 +0,0 @@ -struct Range { - next_value as int, - end as int, -} - -fn make_range args start as int, end as int return Range { - Range - dup "next_value" { start } ! - dup "end" { end } ! -} - -fn Range:iter args r as Range return Range { - r -} - -fn Range:next args r as Range return int, bool { - r "next_value" ? - r "end" ? - use next_value, end { - if next_value end >= { - 0 false return - } - - r "next_value" { next_value 1 + } ! - next_value true - } -} - -enum FileReadResult { - ok as str, - open_error, - read_error, -} - -fn read_file args path as const str, return FileReadResult { - path 0 0 open - - use fd, open_ok { - if open_ok not { - FileReadResult:open_error return - } - - "" - use content { - while true { - fd 4096 read - - use buff, read_ok { - if read_ok not { - FileReadResult:read_error return - } - - if buff "" = { - content FileReadResult:ok return - } - - content <- { content buff str:append } - } - } - } - } -} - -enum TokenType { - args_, - as_, - assign, - builtin_, - call_, - case_, - char_, - colon, - comma, - comment, - const_, - default_, - else_, - end, - enum_, - false_, - fn_, - foreach_, - from_, - get_field, - identifier, - if_, - import_, - integer, - match_, - never_, - return_, - set_field, - sq_end, - sq_start, - start, - string, - struct_, - true_, - use_, - while_, - whitespace, -} - -fn TokenType:to_str args token_type as const TokenType return str { - token_type - match { - case TokenType:args_ { "args" } - case TokenType:as_ { "as" } - case TokenType:assign { "assign" } - case TokenType:builtin_ { "builtin" } - case TokenType:call_ { "call" } - case TokenType:case_ { "case" } - case TokenType:char_ { "char" } - case TokenType:colon { "colon" } - case TokenType:comma { "comma" } - case TokenType:comment { "comment" } - case TokenType:const_ { "const" } - case TokenType:default_ { "default" } - case TokenType:else_ { "else" } - case TokenType:end { "end" } - case TokenType:enum_ { "enum" } - case TokenType:false_ { "false" } - case TokenType:fn_ { "fn" } - case TokenType:foreach_ { "foreach" } - case TokenType:from_ { "from" } - case TokenType:get_field { "get_field" } - case TokenType:identifier { "identifier" } - case TokenType:if_ { "if" } - case TokenType:import_ { "import" } - case TokenType:integer { "integer" } - case TokenType:match_ { "match" } - case TokenType:never_ { "never" } - case TokenType:return_ { "return" } - case TokenType:set_field { "set_field" } - case TokenType:sq_end { "sq_end" } - case TokenType:sq_start { "sq_start" } - case TokenType:start { "start" } - case TokenType:string { "string" } - case TokenType:struct_ { "struct" } - case TokenType:true_ { "true" } - case TokenType:use_ { "use" } - case TokenType:while_ { "while" } - case TokenType:whitespace { "whitespace" } - } -} - -struct FixedSizedToken { - value as str, - type_ as TokenType, -} - -fn make_fixed_sized_token args value as str, type_ as TokenType return FixedSizedToken { - FixedSizedToken - dup "type_" { type_ } ! - dup "value" { value } ! -} - -fn get_fixed_sized_tokens return vec[FixedSizedToken] { - vec[FixedSizedToken] - use v { - // NOTE: keep this sorted by longest first, and for same length alphabetical - v "builtin" TokenType:builtin_ make_fixed_sized_token vec:push - v "default" TokenType:default_ make_fixed_sized_token vec:push - v "foreach" TokenType:foreach_ make_fixed_sized_token vec:push - v "import" TokenType:import_ make_fixed_sized_token vec:push - v "return" TokenType:return_ make_fixed_sized_token vec:push - v "struct" TokenType:struct_ make_fixed_sized_token vec:push - v "const" TokenType:const_ make_fixed_sized_token vec:push - v "false" TokenType:false_ make_fixed_sized_token vec:push - v "match" TokenType:match_ make_fixed_sized_token vec:push - v "never" TokenType:never_ make_fixed_sized_token vec:push - v "while" TokenType:while_ make_fixed_sized_token vec:push - v "args" TokenType:args_ make_fixed_sized_token vec:push - v "call" TokenType:call_ make_fixed_sized_token vec:push - v "case" TokenType:case_ make_fixed_sized_token vec:push - v "else" TokenType:else_ make_fixed_sized_token vec:push - v "enum" TokenType:enum_ make_fixed_sized_token vec:push - v "from" TokenType:from_ make_fixed_sized_token vec:push - v "true" TokenType:true_ make_fixed_sized_token vec:push - v "use" TokenType:use_ make_fixed_sized_token vec:push - v "<-" TokenType:assign make_fixed_sized_token vec:push - v "<=" TokenType:identifier make_fixed_sized_token vec:push - v ">=" TokenType:identifier make_fixed_sized_token vec:push - v "as" TokenType:as_ make_fixed_sized_token vec:push - v "fn" TokenType:fn_ make_fixed_sized_token vec:push - v "if" TokenType:if_ make_fixed_sized_token vec:push - v "-" TokenType:identifier make_fixed_sized_token vec:push - v "," TokenType:comma make_fixed_sized_token vec:push - v ":" TokenType:colon make_fixed_sized_token vec:push - v "!" TokenType:set_field make_fixed_sized_token vec:push - v "?" TokenType:get_field make_fixed_sized_token vec:push - v "." TokenType:identifier make_fixed_sized_token vec:push - v "[" TokenType:sq_start make_fixed_sized_token vec:push - v "]" TokenType:sq_end make_fixed_sized_token vec:push - v "{" TokenType:start make_fixed_sized_token vec:push - v "}" TokenType:end make_fixed_sized_token vec:push - v "*" TokenType:identifier make_fixed_sized_token vec:push - v "/" TokenType:identifier make_fixed_sized_token vec:push - v "%" TokenType:identifier make_fixed_sized_token vec:push - v "+" TokenType:identifier make_fixed_sized_token vec:push - v "<" TokenType:identifier make_fixed_sized_token vec:push - v "=" TokenType:identifier make_fixed_sized_token vec:push - v ">" TokenType:identifier make_fixed_sized_token vec:push - - v - } -} - -struct Position { - file as str, - line as int, - column as int, -} - -fn make_position args file as str, line as int, column as int return Position { - Position - dup "file" { file } ! - dup "line" { line } ! - dup "column" { column } ! -} - -fn Position:update args position as Position, token as Token { - token "value" ? - use value { - 0 value str:len make_range - foreach { - use i { - - value i str:at - use char_, ok { - if ok not { - unreachable - } - - if char_ '\n' = { - position "line" { position "line" ? 1 + } ! - position "column" { 1 } ! - } - else { - position "column" { position "column" ? 1 + } ! - } - } - } - } - } -} - -struct Token { - position as Position, - type_ as TokenType, - value as str, -} - -fn Token:= args lhs as const Token, rhs as const Token return bool { - if - lhs "type_" ? TokenType:to_str - rhs "type_" ? TokenType:to_str - = not - { - false return - } - - lhs "value" ? - rhs "value" ? - = -} - -enum OptionalToken { - some as Token, - none, -} - -struct Tokenizer { - filename as str, - input as str, - offset as int, - offset_position as Position, - fixed_sized_tokens as vec[FixedSizedToken], - whitespace_regex as regex, - comment_regex as regex, - integer_regex as regex, - identifier_regex as regex, - string_regex as regex, - character_regex as regex, -} - -fn make_tokenizer args filename as str, input as str return Tokenizer { - Tokenizer - dup "filename" { filename } ! - dup "input" { input } ! - dup "offset_position" { filename 1 1 make_position } ! - dup "fixed_sized_tokens" { get_fixed_sized_tokens } ! - dup "whitespace_regex" { "\\s+" make_regex assert } ! - dup "comment_regex" { "//[^\n]*" make_regex assert } ! - dup "integer_regex" { "(-)?[0-9]+" make_regex assert } ! - dup "identifier_regex" { "[a-zA-Z_]+" make_regex assert } ! - dup "string_regex" { "\"(\\\\.|.|\n)*?\"" make_regex assert } ! - dup "character_regex" { "'(\\\\.|.|\n)'" make_regex assert } ! -} - -fn Tokenizer:tokenize_whitespace args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "whitespace_regex" ? - use input, offset, whitespace_regex { - whitespace_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched not matched_offset offset = not or { - OptionalToken:none - } else { - Token - dup "type_" { TokenType:whitespace } ! - dup "value" { matched_str } ! - OptionalToken:some - } - } - } -} - -fn Tokenizer:tokenize_comment args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "comment_regex" ? - use input, offset, comment_regex { - comment_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched not matched_offset offset = not or { - OptionalToken:none return - } - - Token - dup "type_" { TokenType:comment } ! - dup "value" { matched_str } ! - OptionalToken:some - } - } -} - -fn Tokenizer:tokenize_integer args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "integer_regex" ? - use input, offset, integer_regex { - integer_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched not matched_offset offset = not or { - OptionalToken:none return - } - - Token - dup "type_" { TokenType:integer } ! - dup "value" { matched_str } ! - OptionalToken:some - } - } -} - -fn matches_at args input as const str, search as str, offset as int return bool { - input search offset str:find_after - use found_offset, ok { - ok offset found_offset = and - } -} - -fn Tokenizer:match_fixed_size args tokenizer as Tokenizer, fixed_sized_token as FixedSizedToken return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - fixed_sized_token "value" ? - fixed_sized_token "type_" ? - use input, offset, token_value, token_type { - if input token_value offset matches_at not { - OptionalToken:none return - } - - // prevent matching identifier `asdf` as keyword `as` - if - input offset token_value str:len + is_identifier_char - token_value 0 is_identifier_char - and - { - OptionalToken:none return - } - - Token - dup "type_" { token_type } ! - dup "value" { token_value } ! - OptionalToken:some - } -} - -fn Tokenizer:tokenize_fixed_size args tokenizer as Tokenizer return OptionalToken { - tokenizer "fixed_sized_tokens" ? - foreach { - use fixed_sized_token { - tokenizer fixed_sized_token Tokenizer:match_fixed_size - match { - case OptionalToken:some { - use optional_token { - drop - optional_token OptionalToken:some return - } - } - default { - nop - } - } - } - } - - OptionalToken:none -} - -fn Tokenizer:tokenize_string args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "string_regex" ? - use input, offset, string_regex { - string_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched matched_offset offset = and not { - OptionalToken:none return - } - - Token - dup "type_" { TokenType:string } ! - dup "value" { matched_str } ! - OptionalToken:some - } - } -} - -fn Tokenizer:tokenize_character args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "character_regex" ? - use input, offset, character_regex { - character_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched matched_offset offset = and not { - OptionalToken:none return - } - - Token - dup "type_" { TokenType:char_ } ! - dup "value" { matched_str } ! - OptionalToken:some - - } - } -} - -fn is_identifier_char args input as const str, offset as int return bool { - input offset str:at - use char_, ok { - if ok not { - false return - } - - char_ char:is_alpha - char_ '_' = - or - } -} - -fn Tokenizer:tokenize_identifier args tokenizer as Tokenizer return OptionalToken { - tokenizer "input" ? - tokenizer "offset" ? - tokenizer "identifier_regex" ? - use input, offset, identifier_regex { - identifier_regex input offset regex:find - use matched_str, matched_offset, matched { - if matched not matched_offset offset = not or { - OptionalToken:none return - } - - Token - dup "type_" { TokenType:identifier } ! - dup "value" { matched_str } ! - OptionalToken:some - } - } -} - -fn Tokenizer:tokenize_at_offset args tokenizer as Tokenizer return OptionalToken { - tokenizer Tokenizer:tokenize_whitespace - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_comment - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_integer - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_fixed_size - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_string - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_identifier - match { - case OptionalToken:some { OptionalToken:some return } - default { nop } - } - - tokenizer Tokenizer:tokenize_character -} - -enum TokenizeResult { - ok as vec[Token], - error, -} - -fn Tokenizer:run args tokenizer as Tokenizer return TokenizeResult { - vec[Token] - tokenizer "input" ? - use tokens, input { - while tokenizer "offset" ? input str:len < { - tokenizer Tokenizer:tokenize_at_offset - match { - case OptionalToken:none { - TokenizeResult:error return - } - case OptionalToken:some { - use token { - token "position" { tokenizer "offset_position" ? copy swap drop } ! - - tokens token vec:push - tokenizer "offset" { - tokenizer "offset" ? - token "value" ? str:len - + - } ! - tokenizer "offset_position" ? token Position:update - } - } - } - } - tokens TokenizeResult:ok return - } -} - -fn print_tokens args tokens as vec[Token] { - tokens - foreach { - dup drop - use token { - token "type_" ? - match { - case TokenType:whitespace { nop } - case TokenType:comment { nop } - default { - token "position" ? - use position { - position "file" ? . - ":" . - position "line" ? . - ":" . - position "column" ? . - } - " " . - token "type_" ? TokenType:to_str . - " " . - token "value" ? . - "\n" . - } - } - } - } -} - -fn main args argv as vec[str] return int { - if argv vec:len 2 = not { - "Usage: " . - argv 0 vec:get . - " \n" . - 1 return - } - - argv 1 vec:get - - use source_path { - source_path read_file - match { - case FileReadResult:open_error { - "Could not open " . source_path . "\n" . - 1 return - } - case FileReadResult:read_error { - "Could not read " . source_path . "\n" . - 1 return - } - case FileReadResult:ok { - use content { - source_path content make_tokenizer Tokenizer:run - match { - case TokenizeResult:ok { - use tokens { - tokens print_tokens - } - } - case TokenizeResult:error { - "Tokenization failed.\n" . - 1 return - } - } - } - } - } - } - - 0 -} diff --git a/tests/selfhosting/test_tokenizer.py b/tests/selfhosting/test_tokenizer.py index eaf6566c..c0fc0c69 100644 --- a/tests/selfhosting/test_tokenizer.py +++ b/tests/selfhosting/test_tokenizer.py @@ -53,6 +53,7 @@ def aaa_source_files() -> tuple[list[str], list[ParameterSet]]: ) +@pytest.mark.skip() # TODO @pytest.mark.parametrize(*aaa_source_files()) def test_tokenizer_output( tokenizer_excecutable: str, capfd: CaptureFixture[str], source_file: Path