diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..37cbb9e --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,46 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y build-essential cmake gcc g++ + + - name: Setup build directory + run: mkdir -p build + + - name: Configure CMake + run: | + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ + + - name: Build + run: | + cmake --build build -j32 + + - name: Run comprehensive tests + run: | + chmod +x test_compiler.sh + ./test_compiler.sh + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-gcc + path: | + test_results/ + retention-days: 14 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4cef982..f32b71f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ -build -.cache -install \ No newline at end of file +build +.cache +install +test_cases +test_results +a.out \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..fd95a24 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,27 @@ +cmake_minimum_required(VERSION 3.16) +project(ChibCC VERSION 1.0.0 LANGUAGES CXX) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +include_directories(include) + +# Source files +set(SOURCES + src/Diagnostic.cpp + src/TokenKinds.cpp + src/Tokenizer.cpp + src/Parser.cpp + src/CodeGenerator.cpp + main.cpp +) + +# Create executable +add_executable(chibcc ${SOURCES}) + +# Set output directory +set_target_properties(chibcc PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib +) \ 
No newline at end of file diff --git a/README b/README index ae4184c..1b6ac03 100644 --- a/README +++ b/README @@ -1 +1 @@ -## A CPP implementation for [chibcc](https://github.com/rui314/chibicc.git) \ No newline at end of file +## A CPP implementation for [chibicc](https://github.com/rui314/chibicc.git) \ No newline at end of file diff --git a/a.out b/a.out new file mode 100644 index 0000000..df2a7de Binary files /dev/null and b/a.out differ diff --git a/include/AST.h b/include/AST.h new file mode 100644 index 0000000..6f283f6 --- /dev/null +++ b/include/AST.h @@ -0,0 +1,37 @@ +#ifndef CHIBCC_AST_H +#define CHIBCC_AST_H + +#include "Common.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// AST Node Types +//===----------------------------------------------------------------------===// + +enum class NodeKind { + Add, // + + Sub, // - + Mul, // * + Div, // / + Neg, // unary - + Eq, // == + Ne, // != + Lt, // < + Le, // <= + Num, // Integer +}; + +class Node { +public: + NodeKind Kind; + std::unique_ptr Lhs; + std::unique_ptr Rhs; + int Val; + + explicit Node(NodeKind K) : Kind(K), Lhs(nullptr), Rhs(nullptr), Val(0) {} +}; + +} // namespace chibcc + +#endif // CHIBCC_AST_H \ No newline at end of file diff --git a/include/CodeGenerator.h b/include/CodeGenerator.h new file mode 100644 index 0000000..89431b5 --- /dev/null +++ b/include/CodeGenerator.h @@ -0,0 +1,28 @@ +#ifndef CHIBCC_CODEGENERATOR_H +#define CHIBCC_CODEGENERATOR_H + +#include "AST.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Code Generator +//===----------------------------------------------------------------------===// + +class CodeGenerator { +private: + int Depth; + + void push(); + void pop(const char *Arg); + void genExpr(Node *N); + +public: + CodeGenerator() : Depth(0) {} + + void codegen(Node *N); +}; + +} // namespace chibcc + +#endif // CHIBCC_CODEGENERATOR_H \ No 
newline at end of file diff --git a/include/Common.h b/include/Common.h new file mode 100644 index 0000000..babd4a1 --- /dev/null +++ b/include/Common.h @@ -0,0 +1,32 @@ +#ifndef CHIBCC_COMMON_H +#define CHIBCC_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Forward Declarations +//===----------------------------------------------------------------------===// + +class DiagnosticEngine; +class SourceLocation; + +//===----------------------------------------------------------------------===// +// Legacy Error Handling (for compatibility) +//===----------------------------------------------------------------------===// + +void error(const char *Fmt, ...); +void errorAt(const char *Loc, const char *Fmt, ...); + +} // namespace chibcc + +#endif // CHIBCC_COMMON_H \ No newline at end of file diff --git a/include/Diagnostic.h b/include/Diagnostic.h new file mode 100644 index 0000000..7aabe27 --- /dev/null +++ b/include/Diagnostic.h @@ -0,0 +1,194 @@ +#ifndef CHIBCC_DIAGNOSTIC_H +#define CHIBCC_DIAGNOSTIC_H + +#include "Common.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Diagnostic Levels +//===----------------------------------------------------------------------===// + +enum class DiagnosticLevel { + Ignored = 0, + Note, + Remark, + Warning, + Error, + Fatal +}; + +//===----------------------------------------------------------------------===// +// Diagnostic IDs +//===----------------------------------------------------------------------===// + +namespace diag { +enum { +#define DIAG(ENUM, LEVEL, DESC) ENUM, +#include "DiagnosticKinds.def" + NUM_BUILTIN_DIAGNOSTICS +}; +} // namespace diag + +//===----------------------------------------------------------------------===// +// Source Location 
+//===----------------------------------------------------------------------===// + +class SourceLocation { +private: + const char *Ptr; + +public: + SourceLocation() : Ptr(nullptr) {} + explicit SourceLocation(const char *Loc) : Ptr(Loc) {} + + bool isValid() const { return Ptr != nullptr; } + bool isInvalid() const { return Ptr == nullptr; } + + const char *getPointer() const { return Ptr; } + + bool operator==(const SourceLocation &RHS) const { return Ptr == RHS.Ptr; } + bool operator!=(const SourceLocation &RHS) const { return Ptr != RHS.Ptr; } +}; + +//===----------------------------------------------------------------------===// +// Source Range +//===----------------------------------------------------------------------===// + +class SourceRange { +private: + SourceLocation Begin, End; + +public: + SourceRange() = default; + SourceRange(SourceLocation Loc) : Begin(Loc), End(Loc) {} + SourceRange(SourceLocation Begin, SourceLocation End) + : Begin(Begin), End(End) {} + + SourceLocation getBegin() const { return Begin; } + SourceLocation getEnd() const { return End; } + + void setBegin(SourceLocation Loc) { Begin = Loc; } + void setEnd(SourceLocation Loc) { End = Loc; } + + bool isValid() const { return Begin.isValid() && End.isValid(); } + bool isInvalid() const { return !isValid(); } +}; + +//===----------------------------------------------------------------------===// +// Diagnostic Engine +//===----------------------------------------------------------------------===// + +class DiagnosticEngine { +private: + const char *SourceBuffer; + std::string FileName; + unsigned NumWarnings; + unsigned NumErrors; + bool SuppressAllDiagnostics; + bool WarningsAsErrors; + + void emitDiagnostic(SourceLocation Loc, DiagnosticLevel Level, + const std::string &Message); + void printSourceLine(SourceLocation Loc); + void printCaretDiagnostic(SourceLocation Loc, SourceRange Range); + +public: + DiagnosticEngine(const char *Buffer, const std::string &File = "") + : 
SourceBuffer(Buffer), FileName(File), NumWarnings(0), NumErrors(0), + SuppressAllDiagnostics(false), WarningsAsErrors(false) {} + + /// \brief Report a diagnostic at the given location. + void report(SourceLocation Loc, unsigned DiagID, const std::string &Message); + + /// \brief Report a diagnostic with a source range. + void report(SourceRange Range, unsigned DiagID, const std::string &Message); + + /// \brief Convenience methods for common diagnostic levels + void reportError(SourceLocation Loc, const std::string &Message); + void reportWarning(SourceLocation Loc, const std::string &Message); + void reportNote(SourceLocation Loc, const std::string &Message); + void reportFatal(SourceLocation Loc, const std::string &Message); + + /// \brief Get diagnostic counts + unsigned getNumWarnings() const { return NumWarnings; } + unsigned getNumErrors() const { return NumErrors; } + bool hasErrorOccurred() const { return NumErrors > 0; } + + /// \brief Control diagnostic behavior + void setSuppressAllDiagnostics(bool Val = true) { + SuppressAllDiagnostics = Val; + } + void setWarningsAsErrors(bool Val = true) { WarningsAsErrors = Val; } + + /// \brief Get the diagnostic level for a given diagnostic ID + static DiagnosticLevel getDiagnosticLevel(unsigned DiagID); + + /// \brief Get the diagnostic description for a given diagnostic ID + static const char *getDiagnosticText(unsigned DiagID); +}; + +//===----------------------------------------------------------------------===// +// Diagnostic Builder +//===----------------------------------------------------------------------===// + +class DiagnosticBuilder { +private: + DiagnosticEngine *Engine; + SourceLocation Loc; + SourceRange Range; + unsigned DiagID; + std::string Message; + bool IsActive; + +public: + DiagnosticBuilder(DiagnosticEngine *Engine, SourceLocation Loc, + unsigned DiagID) + : Engine(Engine), Loc(Loc), Range(Loc), DiagID(DiagID), IsActive(true) {} + + DiagnosticBuilder(DiagnosticBuilder &&Other) + : 
Engine(Other.Engine), Loc(Other.Loc), Range(Other.Range), + DiagID(Other.DiagID), Message(std::move(Other.Message)), + IsActive(Other.IsActive) { + Other.IsActive = false; + } + + ~DiagnosticBuilder() { + if (IsActive && Engine) { + Engine->report(Range, DiagID, Message); + } + } + + /// \brief Add a string to the diagnostic message + DiagnosticBuilder &operator<<(const std::string &Str) { + Message += Str; + return *this; + } + + DiagnosticBuilder &operator<<(const char *Str) { + Message += Str; + return *this; + } + + DiagnosticBuilder &operator<<(int Val) { + Message += std::to_string(Val); + return *this; + } + + /// \brief Add a source range to highlight + DiagnosticBuilder &addRange(SourceRange R) { + Range = R; + return *this; + } + + /// \brief Add a fix-it hint + DiagnosticBuilder &addFixItHint(SourceRange, const std::string &Text) { + // For now, just add to message - could be enhanced later + Message += " (fix: replace with '" + Text + "')"; + return *this; + } +}; + +} // namespace chibcc + +#endif // CHIBCC_DIAGNOSTIC_H \ No newline at end of file diff --git a/include/DiagnosticKinds.def b/include/DiagnosticKinds.def new file mode 100644 index 0000000..09f8925 --- /dev/null +++ b/include/DiagnosticKinds.def @@ -0,0 +1,87 @@ +//===--- DiagnosticKinds.def - C Family Diagnostic Kind Database -*- C++ -*-===// +// +// Part of the ChibCC Project +// +//===----------------------------------------------------------------------===// +// +// This file defines the diagnostic kind database. 
+// +//===----------------------------------------------------------------------===// + +#ifndef DIAG +#define DIAG(ENUM, LEVEL, DESC) +#endif + +//===----------------------------------------------------------------------===// +// Lexical Analysis (Tokenizer) Diagnostics +//===----------------------------------------------------------------------===// + +DIAG(err_invalid_character, Error, "invalid character '%0' in source file") +DIAG(err_unterminated_string, Error, "unterminated string literal") +DIAG(err_unterminated_char, Error, "unterminated character constant") +DIAG(err_empty_character, Error, "empty character constant") +DIAG(err_multichar_character, Error, "multi-character character constant") +DIAG(err_invalid_escape_sequence, Error, "invalid escape sequence '\\%0'") +DIAG(err_invalid_numeric_literal, Error, "invalid numeric literal") +DIAG(err_numeric_literal_too_large, Error, "numeric literal is too large") + +DIAG(warn_trigraph, Warning, "trigraph converted to '%0' character") +DIAG(warn_multichar_character_literal, Warning, "multi-character character constant") + +//===----------------------------------------------------------------------===// +// Parsing Diagnostics +//===----------------------------------------------------------------------===// + +DIAG(err_expected_token, Error, "expected '%0'") +DIAG(err_expected_expression, Error, "expected expression") +DIAG(err_expected_statement, Error, "expected statement") +DIAG(err_expected_declaration, Error, "expected declaration") +DIAG(err_expected_identifier, Error, "expected identifier") +DIAG(err_expected_type, Error, "expected type name") + +DIAG(err_unexpected_token, Error, "unexpected token '%0'") +DIAG(err_extra_tokens, Error, "extra tokens at end of directive") +DIAG(err_missing_semicolon, Error, "expected ';' after %0") +DIAG(err_missing_comma, Error, "expected ',' between %0") + +DIAG(err_unmatched_paren, Error, "expected ')' to match this '('") +DIAG(err_unmatched_brace, Error, "expected '}' 
to match this '{'") +DIAG(err_unmatched_bracket, Error, "expected ']' to match this '['") + +//===----------------------------------------------------------------------===// +// Semantic Analysis Diagnostics +//===----------------------------------------------------------------------===// + +DIAG(err_undeclared_identifier, Error, "use of undeclared identifier '%0'") +DIAG(err_redefinition, Error, "redefinition of '%0'") +DIAG(err_conflicting_types, Error, "conflicting types for '%0'") +DIAG(err_incompatible_types, Error, "incompatible types: '%0' and '%1'") +DIAG(err_invalid_operands, Error, "invalid operands to binary expression ('%0' and '%1')") +DIAG(err_invalid_unary_operand, Error, "invalid operand to unary expression ('%0')") + +DIAG(err_division_by_zero, Error, "division by zero") +DIAG(err_modulo_by_zero, Error, "modulo by zero") + +DIAG(warn_unused_variable, Warning, "unused variable '%0'") +DIAG(warn_uninitialized_variable, Warning, "variable '%0' is uninitialized when used here") +DIAG(warn_implicit_conversion, Warning, "implicit conversion from '%0' to '%1'") + +//===----------------------------------------------------------------------===// +// Code Generation Diagnostics +//===----------------------------------------------------------------------===// + +DIAG(err_unsupported_feature, Error, "unsupported feature: %0") +DIAG(err_internal_error, Error, "internal compiler error: %0") + +DIAG(note_previous_declaration, Note, "previous declaration is here") +DIAG(note_previous_definition, Note, "previous definition is here") +DIAG(note_to_match_this, Note, "to match this '%0'") + +//===----------------------------------------------------------------------===// +// General Diagnostics +//===----------------------------------------------------------------------===// + +DIAG(fatal_too_many_errors, Fatal, "too many errors emitted, stopping now") +DIAG(note_include_translation_unit, Note, "in file included from %0:%1:") + +#undef DIAG \ No newline at end of file 
diff --git a/include/Parser.h b/include/Parser.h new file mode 100644 index 0000000..98120b0 --- /dev/null +++ b/include/Parser.h @@ -0,0 +1,39 @@ +#ifndef CHIBCC_PARSER_H +#define CHIBCC_PARSER_H + +#include "AST.h" +#include "Tokenizer.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Parser +//===----------------------------------------------------------------------===// + +class Parser { +private: + std::unique_ptr newNode(NodeKind Kind); + std::unique_ptr newBinary(NodeKind Kind, std::unique_ptr Lhs, + std::unique_ptr Rhs); + std::unique_ptr newUnary(NodeKind Kind, std::unique_ptr Expr); + std::unique_ptr newNum(int Val); + + std::unique_ptr expr(Token **Rest, Token *Tok); + std::unique_ptr equality(Token **Rest, Token *Tok); + std::unique_ptr relational(Token **Rest, Token *Tok); + std::unique_ptr add(Token **Rest, Token *Tok); + std::unique_ptr mul(Token **Rest, Token *Tok); + std::unique_ptr unary(Token **Rest, Token *Tok); + std::unique_ptr primary(Token **Rest, Token *Tok); + + Lexer &Lex; + +public: + explicit Parser(Lexer &L) : Lex(L) {} + + std::unique_ptr parse(Token *Tok); +}; + +} // namespace chibcc + +#endif // CHIBCC_PARSER_H \ No newline at end of file diff --git a/include/Token.h b/include/Token.h new file mode 100644 index 0000000..24be49c --- /dev/null +++ b/include/Token.h @@ -0,0 +1,136 @@ +#ifndef CHIBCC_TOKEN_H +#define CHIBCC_TOKEN_H + +#include "Common.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Token Types +//===----------------------------------------------------------------------===// + +namespace tok { + enum TokenKind : unsigned short { +#define TOK(X) X, +#include "TokenKinds.def" + NUM_TOKENS + }; + + /// \brief Determines the name of a token as used within the front end. 
+ /// + /// The name of a token will be an internal name (such as "l_square") + /// and should not be used as part of diagnostic messages. + const char *getTokenName(TokenKind Kind); + + /// \brief Determines the spelling of simple punctuator tokens like + /// '!' or '%', and returns NULL for literal and annotation tokens. + /// + /// This routine only retrieves the "simple" spelling of the token, + /// and will not produce any alternative spellings (e.g., a + /// digraph spelling, an escaped newline, etc.). For the actual + /// spelling of a given Token, use Preprocessor::getSpelling(). + const char *getPunctuatorSpelling(TokenKind Kind); + + /// \brief Determines the spelling of simple keyword and contextual keyword + /// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds. + const char *getKeywordSpelling(TokenKind Kind); + + /// \brief Return true if this is a raw identifier or an identifier kind. + inline bool isAnyIdentifier(TokenKind K) { + return (K == tok::identifier); + } + + /// \brief Return true if this is a C or C++ string-literal (or + /// C++11 user-defined-string-literal) token. + inline bool isStringLiteral(TokenKind K) { + return K == tok::string_literal; + } + + /// \brief Return true if this is a "literal" kind, like a numeric + /// constant, string, etc. + inline bool isLiteral(TokenKind K) { + return K == tok::numeric_constant || K == tok::char_constant || + isStringLiteral(K); + } + + /// \brief Return true if this is any of tok::annot_* kinds. + inline bool isAnnotation(TokenKind) { + return false; // No annotations in this simple implementation + } +} + +class Token { +public: + tok::TokenKind Kind; + std::unique_ptr Next; + + /// The location of the token. This is actually a pointer into the original + /// source buffer. + const char *Loc; + + /// The length of the token. + unsigned Len; + + union { + /// The actual value of a numeric constant token. 
+ uint64_t IntegerValue; + + /// A pointer to the start of the literal data for string literals. + const char *LiteralData; + }; + + Token() : Kind(tok::unknown), Next(nullptr), Loc(nullptr), Len(0) { + IntegerValue = 0; + } + + Token(tok::TokenKind K, const char *Location, unsigned Length) + : Kind(K), Next(nullptr), Loc(Location), Len(Length) { + IntegerValue = 0; + } + + /// \brief Return true if this token is a literal value. + bool isLiteral() const { + return tok::isLiteral(Kind); + } + + /// \brief Return true if this token is an identifier. + bool isAnyIdentifier() const { + return tok::isAnyIdentifier(Kind); + } + + /// \brief Return the token's location: the pointer to its first character + /// in the source buffer. + const char *getLocation() const { return Loc; } + + /// \brief Return the length of the token. + unsigned getLength() const { return Len; } + + /// \brief Return the actual spelling of this token. + std::string getSpelling() const { + return std::string(Loc, Len); + } + + /// \brief Kind predicates: return true if this token's kind is (is/isOneOf) + /// or is not (isNot) the given kind(s). + bool is(tok::TokenKind K) const { return Kind == K; } + bool isNot(tok::TokenKind K) const { return Kind != K; } + bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { + return is(K1) || is(K2); + } + template + bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, Ts... 
Ks) const { + return is(K1) || isOneOf(K2, Ks...); + } +}; + +class DiagnosticEngine; + +void errorTok(Token *Tok, const char *Fmt, ...); + +/// \brief Create a diagnostic builder for token-based diagnostics +class DiagnosticBuilder; +DiagnosticBuilder diagnoseTok(DiagnosticEngine &Diags, Token *Tok, unsigned DiagID); + +} // namespace chibcc + +#endif // CHIBCC_TOKEN_H \ No newline at end of file diff --git a/include/TokenKinds.def b/include/TokenKinds.def new file mode 100644 index 0000000..b2677b3 --- /dev/null +++ b/include/TokenKinds.def @@ -0,0 +1,143 @@ +//===--- TokenKinds.def - C Family Token Kind Database ---------*- C++ -*-===// +// +// Part of the ChibCC Project +// +//===----------------------------------------------------------------------===// +// +// This file defines the TokenKind database. This includes normal tokens like +// identifiers, string literals, etc. as well as keywords for various languages. +// +//===----------------------------------------------------------------------===// + +#ifndef TOK +#define TOK(X) +#endif +#ifndef PUNCTUATOR +#define PUNCTUATOR(X,Y) TOK(X) +#endif +#ifndef KEYWORD +#define KEYWORD(X,Y) TOK(kw_ ## X) +#endif + +//===----------------------------------------------------------------------===// +// Preprocessor tokens. +//===----------------------------------------------------------------------===// + +// These define members of the tok::* namespace. + +TOK(unknown) // Not a token. +TOK(eof) // End of file. +TOK(eod) // End of preprocessing directive (end of line inside a + // directive). +TOK(code_completion) // Code completion marker + +//===----------------------------------------------------------------------===// +// Language keywords. +//===----------------------------------------------------------------------===// + +// These define members of the tok::kw_* namespace. Note that keywords are +// always first-class tokens. 
+KEYWORD(auto , KEYALL) +KEYWORD(break , KEYALL) +KEYWORD(case , KEYALL) +KEYWORD(char , KEYALL) +KEYWORD(const , KEYALL) +KEYWORD(continue , KEYALL) +KEYWORD(default , KEYALL) +KEYWORD(do , KEYALL) +KEYWORD(double , KEYALL) +KEYWORD(else , KEYALL) +KEYWORD(enum , KEYALL) +KEYWORD(extern , KEYALL) +KEYWORD(float , KEYALL) +KEYWORD(for , KEYALL) +KEYWORD(goto , KEYALL) +KEYWORD(if , KEYALL) +KEYWORD(int , KEYALL) +KEYWORD(long , KEYALL) +KEYWORD(register , KEYALL) +KEYWORD(return , KEYALL) +KEYWORD(short , KEYALL) +KEYWORD(signed , KEYALL) +KEYWORD(sizeof , KEYALL) +KEYWORD(static , KEYALL) +KEYWORD(struct , KEYALL) +KEYWORD(switch , KEYALL) +KEYWORD(typedef , KEYALL) +KEYWORD(union , KEYALL) +KEYWORD(unsigned , KEYALL) +KEYWORD(void , KEYALL) +KEYWORD(volatile , KEYALL) +KEYWORD(while , KEYALL) + +//===----------------------------------------------------------------------===// +// Literals +//===----------------------------------------------------------------------===// + +TOK(numeric_constant) // 0x123 +TOK(char_constant) // 'a' +TOK(string_literal) // "foo" + +//===----------------------------------------------------------------------===// +// Identifiers. +//===----------------------------------------------------------------------===// + +TOK(identifier) // abcde123 + +//===----------------------------------------------------------------------===// +// C/C++ Punctuators. 
+//===----------------------------------------------------------------------===// + +PUNCTUATOR(l_square, "[") +PUNCTUATOR(r_square, "]") +PUNCTUATOR(l_paren, "(") +PUNCTUATOR(r_paren, ")") +PUNCTUATOR(l_brace, "{") +PUNCTUATOR(r_brace, "}") +PUNCTUATOR(period, ".") +PUNCTUATOR(ellipsis, "...") +PUNCTUATOR(amp, "&") +PUNCTUATOR(ampamp, "&&") +PUNCTUATOR(ampequal, "&=") +PUNCTUATOR(star, "*") +PUNCTUATOR(starequal, "*=") +PUNCTUATOR(plus, "+") +PUNCTUATOR(plusplus, "++") +PUNCTUATOR(plusequal, "+=") +PUNCTUATOR(minus, "-") +PUNCTUATOR(arrow, "->") +PUNCTUATOR(minusminus, "--") +PUNCTUATOR(minusequal, "-=") +PUNCTUATOR(tilde, "~") +PUNCTUATOR(exclaim, "!") +PUNCTUATOR(exclaimequal, "!=") +PUNCTUATOR(slash, "/") +PUNCTUATOR(slashequal, "/=") +PUNCTUATOR(percent, "%") +PUNCTUATOR(percentequal, "%=") +PUNCTUATOR(less, "<") +PUNCTUATOR(lessless, "<<") +PUNCTUATOR(lessequal, "<=") +PUNCTUATOR(lesslessequal, "<<=") +PUNCTUATOR(greater, ">") +PUNCTUATOR(greatergreater, ">>") +PUNCTUATOR(greaterequal, ">=") +PUNCTUATOR(greatergreaterequal, ">>=") +PUNCTUATOR(caret, "^") +PUNCTUATOR(caretequal, "^=") +PUNCTUATOR(pipe, "|") +PUNCTUATOR(pipepipe, "||") +PUNCTUATOR(pipeequal, "|=") +PUNCTUATOR(question, "?") +PUNCTUATOR(colon, ":") +PUNCTUATOR(semi, ";") +PUNCTUATOR(equal, "=") +PUNCTUATOR(equalequal, "==") +PUNCTUATOR(comma, ",") +PUNCTUATOR(hash, "#") +PUNCTUATOR(hashhash, "##") +PUNCTUATOR(hashat, "#@") + +#undef KEYWORD +#undef PUNCTUATOR +#undef TOK \ No newline at end of file diff --git a/include/Tokenizer.h b/include/Tokenizer.h new file mode 100644 index 0000000..a8ff40d --- /dev/null +++ b/include/Tokenizer.h @@ -0,0 +1,81 @@ +#ifndef CHIBCC_TOKENIZER_H +#define CHIBCC_TOKENIZER_H + +#include "Token.h" +#include "Diagnostic.h" + +namespace chibcc { + +//===----------------------------------------------------------------------===// +// Lexer - This provides a simple interface that turns a text buffer into a +// stream of tokens. 
This provides no support for file reading or buffering, +// or buffering/seeking of tokens, only forward lexing is supported. It relies +// on the specified Preprocessor object to handle preprocessor directives, etc. +//===----------------------------------------------------------------------===// + +class Lexer { +private: + const char *BufferStart; // Start of the buffer. + const char *BufferPtr; // Current pointer into the buffer. + const char *BufferEnd; // End of the buffer. + DiagnosticEngine &Diags; // Diagnostic engine for error reporting. + + /// \brief Create a new token with the specified information. + std::unique_ptr formToken(tok::TokenKind Kind, const char *TokStart); + + /// \brief Skip whitespace and comments, return the first non-whitespace + /// character after skipping whitespace and comments. + bool skipWhitespace(); + + /// \brief We have just read the // characters, skip until we find the + /// newline character that terminates the comment. Then update BufferPtr. + bool skipLineComment(); + + /// \brief We have just read the /* characters, skip until we find the */ + /// characters that terminate the comment. Then update BufferPtr. + bool skipBlockComment(); + + /// \brief Lex a number: integer-constant, floating-constant. + void lexNumericConstant(Token &Result); + + /// \brief Lex a string literal or character constant. + void lexStringLiteral(Token &Result, const char *CurPtr); + + /// \brief Lex an identifier or keyword. + void lexIdentifier(Token &Result, const char *CurPtr); + + /// \brief Return true if the specified string is the body of an identifier. + static bool isIdentifierBody(unsigned char c) { + return isalnum(c) || c == '_'; + } + + /// \brief Return true if the specified string is the start of an identifier. + static bool isIdentifierHead(unsigned char c) { + return isalpha(c) || c == '_'; + } + + /// \brief Matches punctuation tokens. 
+ tok::TokenKind tryMatchPunctuator(const char *CurPtr, unsigned &Size); + +public: + /// \brief Construct a Lexer for the given buffer. + Lexer(const char *InputStart, const char *InputEnd, DiagnosticEngine &Diags); + + /// \brief Lex the next token and return it. + std::unique_ptr lex(); + + /// \brief Return true if the specified token kind is a literal (like a + /// numeric constant, string, etc). + static bool isLiteral(tok::TokenKind K) { + return tok::isLiteral(K); + } + + /// \brief Utility functions for token matching + static bool equal(Token *Tok, const char *Op); + static Token *skip(Token *Tok, const char *Op); + static bool equal(Token *Tok, tok::TokenKind Kind); +}; + +} // namespace chibcc + +#endif // CHIBCC_TOKENIZER_H \ No newline at end of file diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..aa9f6dc --- /dev/null +++ b/main.cpp @@ -0,0 +1,55 @@ +#include "CodeGenerator.h" +#include "Diagnostic.h" +#include "Parser.h" +#include "Tokenizer.h" +#include + +using namespace chibcc; + +int main(int Argc, char **Argv) { + if (Argc != 2) { + std::cerr << "Usage: " << Argv[0] << " " << std::endl; + return 1; + } + + const char *Input = Argv[1]; + + // Create diagnostic engine + DiagnosticEngine Diags(Input, ""); + + // Create lexer + Lexer Lex(Input, Input + strlen(Input), Diags); + + // Tokenize all input into a linked list + std::unique_ptr Head = std::make_unique(); + Token *Current = Head.get(); + + while (true) { + auto Tok = Lex.lex(); + Current->Next = std::move(Tok); + Current = Current->Next.get(); + if (Current->Kind == tok::eof) { + break; + } + } + + // Check for lexical errors + if (Diags.hasErrorOccurred()) { + return 1; + } + + // Parse tokens into AST + Parser P(Lex); + auto Ast = P.parse(Head->Next.get()); + + // Check for parse errors + if (Diags.hasErrorOccurred()) { + return 1; + } + + // Generate assembly code + CodeGenerator CG; + CG.codegen(Ast.get()); + + return 0; +} \ No newline at end of file diff --git 
a/main.s b/main.s
new file mode 100644
index 0000000..2eefdf2
--- /dev/null
+++ b/main.s
@@ -0,0 +1,16 @@
+ .globl main
+main:
+ mov $3, %rax
+ push %rax
+ mov $1, %rax
+ pop %rdi
+ add %rdi, %rax
+ push %rax
+ mov $2, %rax
+ pop %rdi
+ imul %rdi, %rax
+ push %rax
+ mov $1, %rax
+ pop %rdi
+ add %rdi, %rax
+ ret
\ No newline at end of file
diff --git a/src/CodeGenerator.cpp b/src/CodeGenerator.cpp
new file mode 100644
index 0000000..2592628
--- /dev/null
+++ b/src/CodeGenerator.cpp
@@ -0,0 +1,94 @@
+#include "CodeGenerator.h"
+
+namespace chibcc {
+
+//===----------------------------------------------------------------------===//
+// Code Generator Implementation
+//===----------------------------------------------------------------------===//
+
+// Emits a push of %rax and records the extra stack slot in Depth.
+void CodeGenerator::push() {
+  printf(" push %%rax\n");
+  Depth++;
+}
+
+// Emits a pop into the register named by Arg and releases one slot in Depth.
+void CodeGenerator::pop(const char *Arg) {
+  printf(" pop %s\n", Arg);
+  Depth--;
+}
+
+// Emits code that leaves the value of expression N in %rax. Leaf and unary
+// nodes are handled by the first switch; every other kind is treated as a
+// binary operator. Calls error() for a node kind neither switch handles.
+void CodeGenerator::genExpr(Node *N) {
+  switch (N->Kind) {
+  case NodeKind::Num:
+    printf(" mov $%d, %%rax\n", N->Val);
+    return;
+  case NodeKind::Neg:
+    genExpr(N->Lhs.get());
+    printf(" neg %%rax\n");
+    return;
+  default:
+    break;
+  }
+
+  // Evaluate the right operand first and park it on the stack, so that after
+  // the pop the left operand is in %rax and the right operand is in %rdi.
+  genExpr(N->Rhs.get());
+  push();
+  genExpr(N->Lhs.get());
+  pop("%rdi");
+
+  switch (N->Kind) {
+  case NodeKind::Add:
+    printf(" add %%rdi, %%rax\n");
+    return;
+  case NodeKind::Sub:
+    printf(" sub %%rdi, %%rax\n");
+    return;
+  case NodeKind::Mul:
+    printf(" imul %%rdi, %%rax\n");
+    return;
+  case NodeKind::Div:
+    printf(" cqo\n");
+    printf(" idiv %%rdi\n");
+    return;
+  case NodeKind::Eq:
+  case NodeKind::Ne:
+  case NodeKind::Lt:
+  case NodeKind::Le:
+    printf(" cmp %%rdi, %%rax\n");
+
+    if (N->Kind == NodeKind::Eq)
+      printf(" sete %%al\n");
+    else if (N->Kind == NodeKind::Ne)
+      printf(" setne %%al\n");
+    else if (N->Kind == NodeKind::Lt)
+      printf(" setl %%al\n");
+    else if (N->Kind == NodeKind::Le)
+      printf(" setle %%al\n");
+
+    printf(" movzb %%al, %%rax\n");
+    return;
+  default:
+    break;
+  }
+
+  error("invalid expression");
+}
+
+// Emits the assembly for a whole program: a global `main` whose body is the
+// expression N, followed by `ret`. Asserts that pushes and pops balanced.
+void CodeGenerator::codegen(Node *N) {
+  printf(" .globl main\n");
+  printf("main:\n");
+
+  genExpr(N);
+  printf(" ret\n");
+
+  assert(Depth == 0);
+}
+
+} // namespace chibcc
\ No newline at end of file
diff --git a/src/Diagnostic.cpp b/src/Diagnostic.cpp
new file mode 100644
index 0000000..ad9fed4
--- /dev/null
+++ b/src/Diagnostic.cpp
@@ -0,0 +1,211 @@
+#include "Diagnostic.h"
+#include <iostream>
+
+namespace chibcc {
+
+//===----------------------------------------------------------------------===//
+// Diagnostic Level and Text Tables
+//===----------------------------------------------------------------------===//
+
+// Both tables are generated from DiagnosticKinds.def and indexed by DiagID.
+static const DiagnosticLevel DiagnosticLevels[] = {
+#define DIAG(ENUM, LEVEL, DESC) DiagnosticLevel::LEVEL,
+#include "DiagnosticKinds.def"
+};
+
+static const char *DiagnosticTexts[] = {
+#define DIAG(ENUM, LEVEL, DESC) DESC,
+#include "DiagnosticKinds.def"
+};
+
+//===----------------------------------------------------------------------===//
+// DiagnosticEngine Implementation
+//===----------------------------------------------------------------------===//
+
+// Returns the level of a built-in diagnostic; out-of-range IDs are treated as
+// errors.
+DiagnosticLevel DiagnosticEngine::getDiagnosticLevel(unsigned DiagID) {
+  if (DiagID >= diag::NUM_BUILTIN_DIAGNOSTICS)
+    return DiagnosticLevel::Error;
+  return DiagnosticLevels[DiagID];
+}
+
+// Returns the text of a built-in diagnostic, or "unknown diagnostic" for an
+// out-of-range ID.
+const char *DiagnosticEngine::getDiagnosticText(unsigned DiagID) {
+  if (DiagID >= diag::NUM_BUILTIN_DIAGNOSTICS)
+    return "unknown diagnostic";
+  return DiagnosticTexts[DiagID];
+}
+
+// Counts and prints one diagnostic as "file:line:col: level: message", followed
+// by the source line and a caret when Loc is valid. Does nothing when all
+// diagnostics are suppressed; terminates the process for fatal diagnostics.
+void DiagnosticEngine::emitDiagnostic(SourceLocation Loc, DiagnosticLevel Level,
+                                      const std::string &Message) {
+  if (SuppressAllDiagnostics)
+    return;
+
+  // Treat warnings as errors if requested
+  if (Level == DiagnosticLevel::Warning && WarningsAsErrors)
+    Level = DiagnosticLevel::Error;
+
+  // Update counters
+  switch (Level) {
+  case DiagnosticLevel::Warning:
+    NumWarnings++;
+    break;
+  case DiagnosticLevel::Error:
+  case DiagnosticLevel::Fatal:
+    NumErrors++;
+    break;
+  default:
+    break;
+  }
+
+  // Print the diagnostic
+  const char *LevelStr = "";
+  switch (Level) {
+  case DiagnosticLevel::Note:
+    LevelStr = "note";
+    break;
+  case DiagnosticLevel::Remark:
+    LevelStr = "remark";
+    break;
+  case DiagnosticLevel::Warning:
+    LevelStr = "warning";
+    break;
+  case DiagnosticLevel::Error:
+    LevelStr = "error";
+    break;
+  case DiagnosticLevel::Fatal:
+    LevelStr = "fatal error";
+    break;
+  case DiagnosticLevel::Ignored:
+    return; // Don't print ignored diagnostics
+  }
+
+  // Calculate line and column
+  int Line = 1, Column = 1;
+  if (Loc.isValid() && SourceBuffer) {
+    const char *Ptr = SourceBuffer;
+    while (Ptr < Loc.getPointer()) {
+      if (*Ptr == '\n') {
+        Line++;
+        Column = 1;
+      } else {
+        Column++;
+      }
+      Ptr++;
+    }
+  }
+
+  // Print diagnostic header
+  std::cerr << FileName << ":" << Line << ":" << Column << ": " << LevelStr
+            << ": " << Message << std::endl;
+
+  // Print source line and caret if location is valid
+  if (Loc.isValid()) {
+    printSourceLine(Loc);
+    printCaretDiagnostic(Loc, SourceRange(Loc));
+  }
+
+  // Exit on fatal errors
+  if (Level == DiagnosticLevel::Fatal) {
+    std::exit(1);
+  }
+}
+
+// Prints the source line containing Loc to stderr.
+void DiagnosticEngine::printSourceLine(SourceLocation Loc) {
+  if (!SourceBuffer || Loc.isInvalid())
+    return;
+
+  const char *LineStart = Loc.getPointer();
+  const char *LineEnd = Loc.getPointer();
+
+  // Find the start of the line
+  while (LineStart > SourceBuffer && LineStart[-1] != '\n')
+    LineStart--;
+
+  // Find the end of the line
+  while (*LineEnd && *LineEnd != '\n' && *LineEnd != '\r')
+    LineEnd++;
+
+  // Print the source line
+  std::cerr << std::string(LineStart, LineEnd) << std::endl;
+}
+
+// Prints a '^' under the column of Loc, followed by '~' for the rest of Range
+// when the range extends past Loc. Tabs in the source line are echoed as tabs
+// so the caret stays aligned.
+void DiagnosticEngine::printCaretDiagnostic(SourceLocation Loc,
+                                            SourceRange Range) {
+  if (!SourceBuffer || Loc.isInvalid())
+    return;
+
+  const char *LineStart = Loc.getPointer();
+  while (LineStart > SourceBuffer && LineStart[-1] != '\n')
+    LineStart--;
+
+  // Calculate the column position
+  int Column = Loc.getPointer() - LineStart;
+
+  // Print spaces up to the caret position
+  for (int i = 0; i < Column; ++i) {
+    if (LineStart[i] == '\t')
+      std::cerr << '\t';
+    else
+      std::cerr << ' ';
+  }
+
+  // Print the caret
+  std::cerr << '^';
+
+  // If we have a range, print tildes for the rest
+  if (Range.isValid() && Range.getEnd().getPointer() > Loc.getPointer()) {
+    int RangeLen = Range.getEnd().getPointer() - Loc.getPointer();
+    for (int i = 1; i < RangeLen; ++i) {
+      std::cerr << '~';
+    }
+  }
+
+  std::cerr << std::endl;
+}
+
+// Reports Message at Loc with the level registered for DiagID.
+void DiagnosticEngine::report(SourceLocation Loc, unsigned DiagID,
+                              const std::string &Message) {
+  DiagnosticLevel Level = getDiagnosticLevel(DiagID);
+  emitDiagnostic(Loc, Level, Message);
+}
+
+// Reports Message at the beginning of Range with the level of DiagID.
+void DiagnosticEngine::report(SourceRange Range, unsigned DiagID,
+                              const std::string &Message) {
+  DiagnosticLevel Level = getDiagnosticLevel(DiagID);
+  emitDiagnostic(Range.getBegin(), Level, Message);
+}
+
+// Convenience wrappers that report Message at Loc with a fixed level.
+void DiagnosticEngine::reportError(SourceLocation Loc,
+                                   const std::string &Message) {
+  emitDiagnostic(Loc, DiagnosticLevel::Error, Message);
+}
+
+void DiagnosticEngine::reportWarning(SourceLocation Loc,
+                                     const std::string &Message) {
+  emitDiagnostic(Loc, DiagnosticLevel::Warning, Message);
+}
+
+void DiagnosticEngine::reportNote(SourceLocation Loc,
+                                  const std::string &Message) {
+  emitDiagnostic(Loc, DiagnosticLevel::Note, Message);
+}
+
+void DiagnosticEngine::reportFatal(SourceLocation Loc,
+                                   const std::string &Message) {
+  emitDiagnostic(Loc, DiagnosticLevel::Fatal, Message);
+}
+
+} // namespace chibcc
\ No newline at end of file
diff --git a/src/Parser.cpp b/src/Parser.cpp
new file mode 100644
index 0000000..1581d71
--- /dev/null
+++ b/src/Parser.cpp
@@ -0,0 +1,165 @@
+#include "Parser.h"
+
+namespace chibcc {
+
+//===----------------------------------------------------------------------===//
+// Parser Implementation
+//===----------------------------------------------------------------------===//
+
+std::unique_ptr<Node> Parser::newNode(NodeKind Kind) {
+  return std::make_unique<Node>(Kind);
+}
+
+std::unique_ptr<Node> Parser::newBinary(NodeKind Kind, std::unique_ptr<Node> Lhs,
+                                        std::unique_ptr<Node> Rhs) {
+  auto N = newNode(Kind);
+  N->Lhs = std::move(Lhs);
+  N->Rhs = std::move(Rhs);
+  return N;
+}
+
+std::unique_ptr<Node> Parser::newUnary(NodeKind Kind, std::unique_ptr<Node> Expr) {
+  auto N = newNode(Kind);
+  N->Lhs = std::move(Expr);
+  return N;
+}
+
+std::unique_ptr<Node> Parser::newNum(int Val) {
+  auto N = newNode(NodeKind::Num);
+  N->Val = Val;
+  return N;
+}
+
+// expr = equality
+std::unique_ptr<Node> Parser::expr(Token **Rest, Token *Tok) {
+  return equality(Rest, Tok);
+}
+
+// equality = relational ("==" relational | "!=" relational)*
+std::unique_ptr<Node> Parser::equality(Token **Rest, Token *Tok) {
+  auto N = relational(&Tok, Tok);
+
+  for (;;) {
+    if (Lexer::equal(Tok, "==")) {
+      N = newBinary(NodeKind::Eq, std::move(N), relational(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, "!=")) {
+      N = newBinary(NodeKind::Ne, std::move(N), relational(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    *Rest = Tok;
+    return N;
+  }
+}
+
+// relational = add ("<" add | "<=" add | ">" add | ">=" add)*
+std::unique_ptr<Node> Parser::relational(Token **Rest, Token *Tok) {
+  auto N = add(&Tok, Tok);
+
+  for (;;) {
+    if (Lexer::equal(Tok, "<")) {
+      N = newBinary(NodeKind::Lt, std::move(N), add(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, "<=")) {
+      N = newBinary(NodeKind::Le, std::move(N), add(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, ">")) { // a > b is built as b < a
+      N = newBinary(NodeKind::Lt, add(&Tok, Tok->Next.get()), std::move(N));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, ">=")) { // a >= b is built as b <= a
+      N = newBinary(NodeKind::Le, add(&Tok, Tok->Next.get()), std::move(N));
+      continue;
+    }
+
+    *Rest = Tok;
+    return N;
+  }
+}
+
+// add = mul ("+" mul | "-" mul)*
+std::unique_ptr<Node> Parser::add(Token **Rest, Token *Tok) {
+  auto N = mul(&Tok, Tok);
+
+  for (;;) {
+    if (Lexer::equal(Tok, "+")) {
+      N = newBinary(NodeKind::Add, std::move(N), mul(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, "-")) {
+      N = newBinary(NodeKind::Sub, std::move(N), mul(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    *Rest = Tok;
+    return N;
+  }
+}
+
+// mul = unary ("*" unary | "/" unary)*
+std::unique_ptr<Node> Parser::mul(Token **Rest, Token *Tok) {
+  auto N = unary(&Tok, Tok);
+
+  for (;;) {
+    if (Lexer::equal(Tok, "*")) {
+      N = newBinary(NodeKind::Mul, std::move(N), unary(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    if (Lexer::equal(Tok, "/")) {
+      N = newBinary(NodeKind::Div, std::move(N), unary(&Tok, Tok->Next.get()));
+      continue;
+    }
+
+    *Rest = Tok;
+    return N;
+  }
+}
+
+// unary = ("+" | "-") unary
+//       | primary
+std::unique_ptr<Node> Parser::unary(Token **Rest, Token *Tok) {
+  if (Lexer::equal(Tok, "+"))
+    return unary(Rest, Tok->Next.get());
+
+  if (Lexer::equal(Tok, "-"))
+    return newUnary(NodeKind::Neg, unary(Rest, Tok->Next.get()));
+
+  return primary(Rest, Tok);
+}
+
+// primary = "(" expr ")" | num
+std::unique_ptr<Node> Parser::primary(Token **Rest, Token *Tok) {
+  if (Lexer::equal(Tok, "(")) {
+    auto N = expr(&Tok, Tok->Next.get());
+    *Rest = Lexer::skip(Tok, ")");
+    return N;
+  }
+
+  if (Tok->Kind == tok::numeric_constant) {
+    auto N = newNum(Tok->IntegerValue);
+    *Rest = Tok->Next.get();
+    return N;
+  }
+
+  errorTok(Tok, "expected an expression");
+  return nullptr; // Never reached
+}
+
+std::unique_ptr<Node> Parser::parse(Token *Tok) {
+  auto N = expr(&Tok, Tok);
+  if (Tok->Kind != tok::eof)
+    errorTok(Tok, "extra token");
+  return N;
+}
+
+} // namespace chibcc
\ No newline at end of file
diff --git a/src/TokenKinds.cpp b/src/TokenKinds.cpp
new file mode 100644
index 0000000..826c1e9
--- /dev/null
+++ b/src/TokenKinds.cpp
@@ -0,0 +1,42 @@
+#include "Token.h"
+
+namespace chibcc {
+namespace tok {
+
+// Token kind names generated from TokenKinds.def, terminated by nullptr.
+static const char * const TokNames[] = {
+#define TOK(X) #X,
+#define KEYWORD(X,Y) #X,
+#include "TokenKinds.def"
+  nullptr
+};
+
+// Returns the name of Kind, or nullptr when Kind is not below NUM_TOKENS.
+const char *getTokenName(TokenKind Kind) {
+  if (Kind < NUM_TOKENS)
+    return TokNames[Kind];
+  return nullptr;
+}
+
+// Returns the spelling of a punctuator, or nullptr for any other kind.
+const char *getPunctuatorSpelling(TokenKind Kind) {
+  switch (Kind) {
+#define PUNCTUATOR(X,Y) case X: return Y;
+#include "TokenKinds.def"
+  default: break;
+  }
+  return nullptr;
+}
+
+// Returns the spelling of a keyword, or nullptr for any other kind.
+const char *getKeywordSpelling(TokenKind Kind) {
+  switch (Kind) {
+#define KEYWORD(X,Y) case kw_ ## X: return #X;
+#include "TokenKinds.def"
+  default: break;
+  }
+  return nullptr;
+}
+
+} // namespace tok
+} // namespace chibcc
\ No newline at end of file
diff --git a/src/Tokenizer.cpp b/src/Tokenizer.cpp
new file mode 100644
index 0000000..7556fe5
--- /dev/null
+++ b/src/Tokenizer.cpp
@@ -0,0 +1,343 @@
+#include "Tokenizer.h"
+#include <stdexcept>
+
+namespace chibcc {
+
+//===----------------------------------------------------------------------===//
+// Error Handling Implementation
+//===----------------------------------------------------------------------===//
+
+// Start of the buffer currently being lexed. Set by the Lexer constructor and
+// used by verrorAt to echo the input and position the caret.
+static const char *CurrentInput = nullptr;
+
+// Prints a printf-style message to stderr and exits with status 1.
+void error(const char *Fmt, ...) {
+  va_list Ap;
+  va_start(Ap, Fmt);
+  vfprintf(stderr, Fmt, Ap);
+  fprintf(stderr, "\n");
+  exit(1);
+}
+
+// Prints the whole input, a caret under Loc and a printf-style message, then
+// exits with status 1.
+static void verrorAt(const char *Loc, const char *Fmt, va_list Ap) {
+  int Pos = Loc - CurrentInput;
+  fprintf(stderr, "%s\n", CurrentInput);
+  fprintf(stderr, "%*s", Pos, ""); // print Pos spaces.
+  fprintf(stderr, "^ ");
+  vfprintf(stderr, Fmt, Ap);
+  fprintf(stderr, "\n");
+  exit(1);
+}
+
+// Reports an error at Loc (a pointer into the current input) and exits.
+void errorAt(const char *Loc, const char *Fmt, ...) {
+  va_list Ap;
+  va_start(Ap, Fmt);
+  verrorAt(Loc, Fmt, Ap);
+}
+
+// Reports an error at the location of Tok and exits.
+void errorTok(Token *Tok, const char *Fmt, ...) {
+  va_list Ap;
+  va_start(Ap, Fmt);
+  verrorAt(Tok->Loc, Fmt, Ap);
+}
+
+//===----------------------------------------------------------------------===//
+// Lexer Implementation
+//===----------------------------------------------------------------------===//
+
+// Creates a lexer over [InputStart, InputEnd) and records InputStart as the
+// buffer used by the error helpers in this file.
+Lexer::Lexer(const char *InputStart, const char *InputEnd, DiagnosticEngine &Diags)
+    : BufferStart(InputStart), BufferPtr(InputStart), BufferEnd(InputEnd), Diags(Diags) {
+  CurrentInput = InputStart;
+}
+
+// Builds a token of Kind spanning [TokStart, BufferPtr).
+std::unique_ptr<Token> Lexer::formToken(tok::TokenKind Kind, const char *TokStart) {
+  auto Tok = std::make_unique<Token>(Kind, TokStart, BufferPtr - TokStart);
+  return Tok;
+}
+
+// Skips whitespace; returns true when the end of the buffer was reached.
+bool Lexer::skipWhitespace() {
+  while (BufferPtr != BufferEnd) {
+    switch (*BufferPtr) {
+    case ' ':
+    case '\t':
+    case '\f':
+    case '\v':
+    case '\r':
+    case '\n':
+      ++BufferPtr;
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+// Lexes a decimal integer literal starting at BufferPtr into Result.
+void Lexer::lexNumericConstant(Token &Result) {
+  const char *CurPtr = BufferPtr;
+
+  // Lex the number (isdigit is only defined for unsigned char values and EOF)
+  while (BufferPtr != BufferEnd && isdigit(static_cast<unsigned char>(*BufferPtr)))
+    ++BufferPtr;
+
+  Result.Kind = tok::numeric_constant;
+  Result.Loc = CurPtr;
+  Result.Len = BufferPtr - CurPtr;
+
+  // Convert to integer value. std::stoull throws std::out_of_range when the
+  // literal does not fit in unsigned long long; turn that into a located
+  // error instead of an uncaught exception.
+  std::string NumStr(CurPtr, BufferPtr - CurPtr);
+  try {
+    Result.IntegerValue = std::stoull(NumStr);
+  } catch (const std::out_of_range &) {
+    errorAt(CurPtr, "integer literal is too large");
+  }
+}
+
+// Lexes an identifier starting at CurPtr into Result and reclassifies it as a
+// keyword when its spelling matches one.
+void Lexer::lexIdentifier(Token &Result, const char *CurPtr) {
+  // Match [a-zA-Z_][a-zA-Z0-9_]*
+  while (BufferPtr != BufferEnd && isIdentifierBody(*BufferPtr))
+    ++BufferPtr;
+
+  Result.Kind = tok::identifier;
+  Result.Loc = CurPtr;
+  Result.Len = BufferPtr - CurPtr;
+
+  // Check if this is a keyword
+  std::string Spelling(CurPtr, BufferPtr - CurPtr);
+
+  // Simple keyword lookup - in a real implementation this would use a hash table
+  if (Spelling == "auto") Result.Kind = tok::kw_auto;
+  else if (Spelling == "break") Result.Kind = tok::kw_break;
+  else if (Spelling == "case") Result.Kind = tok::kw_case;
+  else if (Spelling == "char") Result.Kind = tok::kw_char;
+  else if (Spelling == "const") Result.Kind = tok::kw_const;
+  else if (Spelling == "continue") Result.Kind = tok::kw_continue;
+  else if (Spelling == "default") Result.Kind = tok::kw_default;
+  else if (Spelling == "do") Result.Kind = tok::kw_do;
+  else if (Spelling == "double") Result.Kind = tok::kw_double;
+  else if (Spelling == "else") Result.Kind = tok::kw_else;
+  else if (Spelling == "enum") Result.Kind = tok::kw_enum;
+  else if (Spelling == "extern") Result.Kind = tok::kw_extern;
+  else if (Spelling == "float") Result.Kind = tok::kw_float;
+  else if (Spelling == "for") Result.Kind = tok::kw_for;
+  else if (Spelling == "goto") Result.Kind = tok::kw_goto;
+  else if (Spelling == "if") Result.Kind = tok::kw_if;
+  else if (Spelling == "int") Result.Kind = tok::kw_int;
+  else if (Spelling == "long") Result.Kind = tok::kw_long;
+  else if (Spelling == "register") Result.Kind = tok::kw_register;
+  else if (Spelling == "return") Result.Kind = tok::kw_return;
+  else if (Spelling == "short") Result.Kind = tok::kw_short;
+  else if (Spelling == "signed") Result.Kind = tok::kw_signed;
+  else if (Spelling == "sizeof") Result.Kind = tok::kw_sizeof;
+  else if (Spelling == "static") Result.Kind = tok::kw_static;
+  else if (Spelling == "struct") Result.Kind = tok::kw_struct;
+  else if (Spelling == "switch") Result.Kind = tok::kw_switch;
+  else if (Spelling == "typedef") Result.Kind = tok::kw_typedef;
+  else if (Spelling == "union") Result.Kind = tok::kw_union;
+  else if (Spelling == "unsigned") Result.Kind = tok::kw_unsigned;
+  else if (Spelling == "void") Result.Kind = tok::kw_void;
+  else if (Spelling == "volatile") Result.Kind = tok::kw_volatile;
+  else if (Spelling == "while") Result.Kind = tok::kw_while;
+}
+
+// Matches the longest punctuator starting at CurPtr. Returns its kind and
+// stores its length in Size, or returns tok::unknown with Size == 0.
+tok::TokenKind Lexer::tryMatchPunctuator(const char *CurPtr, unsigned &Size) {
+  switch (*CurPtr) {
+  case '[': Size = 1; return tok::l_square;
+  case ']': Size = 1; return tok::r_square;
+  case '(': Size = 1; return tok::l_paren;
+  case ')': Size = 1; return tok::r_paren;
+  case '{': Size = 1; return tok::l_brace;
+  case '}': Size = 1; return tok::r_brace;
+  case '.':
+    if (CurPtr + 2 < BufferEnd && CurPtr[1] == '.' && CurPtr[2] == '.') {
+      Size = 3; return tok::ellipsis;
+    }
+    Size = 1; return tok::period;
+  case '&':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '&') {
+      Size = 2; return tok::ampamp;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::ampequal;
+    }
+    Size = 1; return tok::amp;
+  case '*':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::starequal;
+    }
+    Size = 1; return tok::star;
+  case '+':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '+') {
+      Size = 2; return tok::plusplus;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::plusequal;
+    }
+    Size = 1; return tok::plus;
+  case '-':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '>') {
+      Size = 2; return tok::arrow;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '-') {
+      Size = 2; return tok::minusminus;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::minusequal;
+    }
+    Size = 1; return tok::minus;
+  case '~': Size = 1; return tok::tilde;
+  case '!':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::exclaimequal;
+    }
+    Size = 1; return tok::exclaim;
+  case '/':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::slashequal;
+    }
+    Size = 1; return tok::slash;
+  case '%':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::percentequal;
+    }
+    Size = 1; return tok::percent;
+  case '<':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '<') {
+      if (CurPtr + 2 < BufferEnd && CurPtr[2] == '=') {
+        Size = 3; return tok::lesslessequal;
+      }
+      Size = 2; return tok::lessless;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::lessequal;
+    }
+    Size = 1; return tok::less;
+  case '>':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '>') {
+      if (CurPtr + 2 < BufferEnd && CurPtr[2] == '=') {
+        Size = 3; return tok::greatergreaterequal;
+      }
+      Size = 2; return tok::greatergreater;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::greaterequal;
+    }
+    Size = 1; return tok::greater;
+  case '^':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::caretequal;
+    }
+    Size = 1; return tok::caret;
+  case '|':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '|') {
+      Size = 2; return tok::pipepipe;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::pipeequal;
+    }
+    Size = 1; return tok::pipe;
+  case '?': Size = 1; return tok::question;
+  case ':': Size = 1; return tok::colon;
+  case ';': Size = 1; return tok::semi;
+  case '=':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '=') {
+      Size = 2; return tok::equalequal;
+    }
+    Size = 1; return tok::equal;
+  case ',': Size = 1; return tok::comma;
+  case '#':
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '#') {
+      Size = 2; return tok::hashhash;
+    }
+    if (CurPtr + 1 < BufferEnd && CurPtr[1] == '@') {
+      Size = 2; return tok::hashat;
+    }
+    Size = 1; return tok::hash;
+  default:
+    Size = 0;
+    return tok::unknown;
+  }
+}
+
+// Returns the next token from the buffer, or tok::eof once the input is
+// exhausted. Invalid characters are reported and returned as tok::unknown.
+std::unique_ptr<Token> Lexer::lex() {
+  // Skip whitespace
+  if (skipWhitespace()) {
+    return formToken(tok::eof, BufferPtr);
+  }
+
+  const char *TokStart = BufferPtr;
+
+  // Handle end of file
+  if (BufferPtr >= BufferEnd) {
+    return formToken(tok::eof, BufferPtr);
+  }
+
+  unsigned char Char = *BufferPtr;
+
+  // Identifier: [a-zA-Z_]
+  if (isIdentifierHead(Char)) {
+    auto Result = std::make_unique<Token>();
+    lexIdentifier(*Result, TokStart);
+    return Result;
+  }
+
+  // Numeric constant: [0-9]
+  if (isdigit(Char)) {
+    auto Result = std::make_unique<Token>();
+    lexNumericConstant(*Result);
+    return Result;
+  }
+
+  // Punctuator
+  unsigned Size;
+  tok::TokenKind Kind = tryMatchPunctuator(TokStart, Size);
+  if (Kind != tok::unknown) {
+    BufferPtr += Size;
+    return formToken(Kind, TokStart);
+  }
+
+  // Unknown character - report diagnostic
+  SourceLocation Loc(TokStart);
+  Diags.report(Loc, diag::err_invalid_character,
+               std::string("invalid character '") + char(*TokStart) + "'");
+  ++BufferPtr;
+  return formToken(tok::unknown, TokStart);
+}
+
+// Returns true when the spelling of Tok is exactly Op.
+bool Lexer::equal(Token *Tok, const char *Op) {
+  return Tok->getSpelling() == Op;
+}
+
+// Returns true when Tok has kind Kind.
+bool Lexer::equal(Token *Tok, tok::TokenKind Kind) {
+  return Tok->Kind == Kind;
+}
+
+// Returns the token after Tok, which must spell Op; otherwise reports an
+// error and exits.
+Token *Lexer::skip(Token *Tok, const char *Op) {
+  if (!equal(Tok, Op))
+    errorTok(Tok, "expected '%s'", Op);
+  return Tok->Next.get();
+}
+
+} // namespace chibcc
\ No newline at end of file
diff --git a/test_compiler.sh b/test_compiler.sh
new file mode 100644
index 0000000..2cfca5f
--- /dev/null
+++ b/test_compiler.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# Test script for chibcc compiler
+# Usage: ./test_compiler.sh
+
+# Don't exit on error, we want to capture and report them
+
+COMPILER="./build/bin/chibcc"
+TEST_DIR="test_cases"
+RESULTS_DIR="test_results"
+FAILED=0 # Number of test cases that did not pass
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Create directories
+mkdir -p "$TEST_DIR" "$RESULTS_DIR"
+
+# Function to run a test case
+run_test() {
+  local test_name="$1"
+  local input="$2"
+  local expected_exit_code="${3:-0}"
+
+  echo -e "${YELLOW}Testing: $test_name${NC}"
+  echo "Input: $input"
+
+  # Generate assembly
+  if $COMPILER "$input" > "$RESULTS_DIR/${test_name}.s" 2> "$RESULTS_DIR/${test_name}.err"; then
+    echo -e "${GREEN}✓ Compilation successful${NC}"
+
+    # Add GNU stack note to fix linker warning
+    echo ".section .note.GNU-stack,\"\",@progbits" >> "$RESULTS_DIR/${test_name}.s"
+
+    # Show generated assembly
+    echo "Generated assembly:"
+    cat "$RESULTS_DIR/${test_name}.s"
+
+    # Try to assemble and link
+    if gcc -o "$RESULTS_DIR/${test_name}" "$RESULTS_DIR/${test_name}.s" 2>/dev/null; then
+      # Run the executable. The expected result is the process exit status,
+      # so a non-zero status is not a failure by itself: capture it
+      # unconditionally and compare it against the expectation.
+      ./"$RESULTS_DIR/${test_name}"
+      exit_code=$?
+      if [ "$exit_code" -eq "$expected_exit_code" ]; then
+        echo -e "${GREEN}✓ Expected exit code matched (exit code: $exit_code)${NC}"
+      else
+        echo -e "${RED}✗ Expected exit code $expected_exit_code, got $exit_code${NC}"
+        FAILED=$((FAILED + 1))
+      fi
+    else
+      echo -e "${RED}✗ Assembly/linking failed${NC}"
+      FAILED=$((FAILED + 1))
+    fi
+  else
+    echo -e "${RED}✗ Compilation failed${NC}"
+    FAILED=$((FAILED + 1))
+    if [ -s "$RESULTS_DIR/${test_name}.err" ]; then
+      echo "Error output:"
+      cat "$RESULTS_DIR/${test_name}.err"
+    fi
+  fi
+  echo "----------------------------------------"
+}
+
+# Test cases
+echo -e "${YELLOW}Starting compiler tests...${NC}"
+echo "========================================"
+
+# Basic arithmetic tests
+run_test "simple_addition" "1+1" 2
+run_test "simple_subtraction" "5-3" 2
+run_test "simple_multiplication" "3*4" 12
+run_test "simple_division" "8/2" 4
+
+# More complex expressions
+run_test "complex_expr1" "1+2*3" 7
+run_test "complex_expr2" "(1+2)*3" 9
+run_test "complex_expr3" "10-2*3" 4
+
+# Edge cases
+run_test "single_number" "42" 42
+run_test "zero" "0" 0
+run_test "negative" "-5" 251 # -5 as unsigned byte = 251
+
+# Parentheses tests
+run_test "nested_parens" "((1+2)*3)+4" 13
+run_test "multiple_parens" "(1+2)*(3+4)" 21
+
+echo -e "${GREEN}All tests completed!${NC}"
+echo "Check $RESULTS_DIR/ for detailed results."
+
+# Fail the script (and therefore CI) when any test case did not pass
+if [ "$FAILED" -ne 0 ]; then
+  echo -e "${RED}$FAILED test(s) failed${NC}"
+  exit 1
+fi
\ No newline at end of file