From 1fc842e23f9b35cc8eab590ba519cc61ff885a26 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 22:06:32 +0200 Subject: [PATCH] feat(parser): add basic lexer for type definitions --- .../00_syntax_prototype/02_custom_types.midas | 2 +- lexer/annotations.py | 8 +- lexer/keyword.py | 9 ++ lexer/midas.py | 126 ++++++++++++++++++ lexer/token.py | 17 +++ 5 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 lexer/keyword.py create mode 100644 lexer/midas.py diff --git a/examples/00_syntax_prototype/02_custom_types.midas b/examples/00_syntax_prototype/02_custom_types.midas index ba8b758..8248e16 100644 --- a/examples/00_syntax_prototype/02_custom_types.midas +++ b/examples/00_syntax_prototype/02_custom_types.midas @@ -21,4 +21,4 @@ type Age // Predefined custom constraints that can be referenced in other definitions constraint Positive = _ >= 0 constraint StrictlyPositive = _ > 0 -constraint Even = _ % 2 == 0 \ No newline at end of file +//constraint Even = _ % 2 == 0 \ No newline at end of file diff --git a/lexer/annotations.py b/lexer/annotations.py index b8c7cf7..3cc0431 100644 --- a/lexer/annotations.py +++ b/lexer/annotations.py @@ -46,7 +46,7 @@ class AnnotationLexer(Lexer): def scan_number(self): """Scan the rest of number and add it as a token - + This method handles both simple integers and floats. Scientific notation and base prefixes (0x, 0b, 0o) are not supported """ @@ -63,7 +63,7 @@ class AnnotationLexer(Lexer): def scan_identifier(self): """Scan the rest of an identifier and add it as a token - + An identifier starts with a letter, followed by any number of alphanumerical characters or underscores """ @@ -73,8 +73,8 @@ class AnnotationLexer(Lexer): def scan_comment(self): """Scan the rest of a comment and add it as a token - - A comment starts with a '#' character and ends at the EOL/EOF + + A comment starts with a `#` character and ends at the EOL/EOF """ while self.peek() != "\n" and not self.is_at_end(): self.advance() diff --git a/lexer/keyword.py b/lexer/keyword.py new file mode 100644 index 0000000..a4f03cf --- /dev/null +++ b/lexer/keyword.py @@ -0,0 +1,9 @@ +from lexer.token import TokenType + +KEYWORDS: dict[str, TokenType] = { + "type": TokenType.TYPE, + "op": TokenType.OP, + "constraint": TokenType.CONSTRAINT, + "true": TokenType.TRUE, + "false": TokenType.FALSE, +} diff --git a/lexer/midas.py b/lexer/midas.py new file mode 100644 index 0000000..16440da --- /dev/null +++ b/lexer/midas.py @@ -0,0 +1,126 @@ +from lexer.base import Lexer +from lexer.keyword import KEYWORDS +from lexer.token import TokenType + + +class MidasLexer(Lexer): + def scan_token(self) -> None: + char: str = self.advance() + match char: + case "(": + self.add_token(TokenType.LEFT_PAREN) + case ")": + self.add_token(TokenType.RIGHT_PAREN) + case "[": + self.add_token(TokenType.LEFT_BRACKET) + case "]": + self.add_token(TokenType.RIGHT_BRACKET) + case "{": + self.add_token(TokenType.LEFT_BRACE) + case "}": + self.add_token(TokenType.RIGHT_BRACE) + case "<": + self.add_token( + TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS + ) + case ">": + self.add_token( + TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER + ) + case "=": + self.add_token( + TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL + ) + case ":": + self.add_token(TokenType.COLON) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "+": + self.add_token(TokenType.PLUS) + case "-": + self.add_token(TokenType.MINUS) + case "*": + self.add_token(TokenType.STAR) + case "/": + if self.match("/"): + self.scan_comment() + elif self.match("*"): + self.scan_comment_multiline() + else: + self.add_token(TokenType.SLASH) + case "\n": + self.add_token(TokenType.NEWLINE) + case " " | "\r" | "\t": + # Consume all whitespace characters until EOL or EOF + while ( + self.peek().isspace() + and self.peek() != "\n" + and not self.is_at_end() + ): + self.advance() + self.add_token(TokenType.WHITESPACE) + case _: + if char.isdigit(): + self.scan_number() + elif char.isalpha(): + self.scan_identifier() + else: + self.error("Unexpected character") + return None + + def scan_number(self): + """Scan the rest of number and add it as a token + + This method handles both simple integers and floats. Scientific notation + and base prefixes (0x, 0b, 0o) are not supported + """ + while self.peek().isdigit(): + self.advance() + + if self.peek() == "." and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value: float = float(self.source[self.start : self.idx]) + self.add_token(TokenType.NUMBER, value) + + def scan_identifier(self): + """Scan the rest of an identifier and add it as a token + + An identifier starts with a letter, followed by any number of + alphanumerical characters or underscores + """ + while self.peek().isalnum() or self.peek() == "_": + self.advance() + + lexeme: str = self.source[self.start : self.idx] + token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + self.add_token(token_type) + + def scan_comment(self): + """Scan the rest of a comment and add it as a token + + A comment starts with `//` and ends at the EOL/EOF + """ + while self.peek() != "\n" and not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) + + def scan_comment_multiline(self): + """Scan the rest of a multiline comment and add it as a token + + A multiline comment starts with `/*` and ends with `*/` or at the EOF + """ + while ( + not (self.peek() == "*" and self.peek_next() == "/") + and not self.is_at_end() + ): + self.advance() + if not self.is_at_end(): + self.advance() + if not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) diff --git a/lexer/token.py b/lexer/token.py index e06194c..9b5bc13 100644 --- a/lexer/token.py +++ b/lexer/token.py @@ -11,12 +11,23 @@ class TokenType(Enum): RIGHT_PAREN = auto() LEFT_BRACKET = auto() RIGHT_BRACKET = auto() + LEFT_BRACE = auto() + RIGHT_BRACE = auto() COLON = auto() COMMA = auto() UNDERSCORE = auto() # Operators PLUS = auto() + MINUS = auto() + STAR = auto() + SLASH = auto() + GREATER = auto() + GREATER_EQUAL = auto() + LESS = auto() + LESS_EQUAL = auto() + EQUAL = auto() + EQUAL_EQUAL = auto() # Literals IDENTIFIER = auto() @@ -25,6 +36,11 @@ class TokenType(Enum): FALSE = auto() NONE = auto() + # Keywords + TYPE = auto() + OP = auto() + CONSTRAINT = auto() + # Misc COMMENT = auto() WHITESPACE = auto() @@ -35,6 +51,7 @@ class TokenType(Enum): @dataclass(frozen=True) class Token: """A scanned token""" + type: TokenType lexeme: str value: Any