feat(parser): add basic lexer for type definitions

2026-05-13 22:40:26 +02:00
parent fcbea218a4
commit 1fc842e23f
5 changed files with 157 additions and 5 deletions
@@ -21,4 +21,4 @@ type Age<int + (0 <= _ < 150)>
 // Predefined custom constraints that can be referenced in other definitions
 constraint Positive = _ >= 0
 constraint StrictlyPositive = _ > 0
-constraint Even = _ % 2 == 0
+//constraint Even = _ % 2 == 0
@@ -46,7 +46,7 @@ class AnnotationLexer(Lexer):
    def scan_number(self):
        """Scan the rest of number and add it as a token
-        
+
        This method handles both simple integers and floats. Scientific notation
        and base prefixes (0x, 0b, 0o) are not supported
        """
@@ -63,7 +63,7 @@ class AnnotationLexer(Lexer):
    def scan_identifier(self):
        """Scan the rest of an identifier and add it as a token
-        
+
        An identifier starts with a letter, followed by any number of
        alphanumerical characters or underscores
        """
@@ -73,8 +73,8 @@ class AnnotationLexer(Lexer):
    def scan_comment(self):
        """Scan the rest of a comment and add it as a token
-        
+
-        A comment starts with a '#' character and ends at the EOL/EOF
+        A comment starts with a `#` character and ends at the EOL/EOF
        """
        while self.peek() != "\n" and not self.is_at_end():
            self.advance()
@@ -0,0 +1,9 @@
 from lexer.token import TokenType
 # Reserved words, mapped to the token type emitted when an identifier-shaped
 # lexeme matches one of them exactly. Keys are matched case-sensitively
 # (plain dict lookup), so only the lowercase spellings below are keywords.
 KEYWORDS: dict[str, TokenType] = {
    # Definition keywords
    "type": TokenType.TYPE,
    "op": TokenType.OP,
    "constraint": TokenType.CONSTRAINT,
    # Boolean literals
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
 }
@@ -0,0 +1,126 @@
 from lexer.base import Lexer
 from lexer.keyword import KEYWORDS
 from lexer.token import TokenType
 class MidasLexer(Lexer):
    def scan_token(self) -> None:
        char: str = self.advance()
        match char:
            case "(":
                self.add_token(TokenType.LEFT_PAREN)
            case ")":
                self.add_token(TokenType.RIGHT_PAREN)
            case "[":
                self.add_token(TokenType.LEFT_BRACKET)
            case "]":
                self.add_token(TokenType.RIGHT_BRACKET)
            case "{":
                self.add_token(TokenType.LEFT_BRACE)
            case "}":
                self.add_token(TokenType.RIGHT_BRACE)
            case "<":
                self.add_token(
                    TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS
                )
            case ">":
                self.add_token(
                    TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER
                )
            case "=":
                self.add_token(
                    TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL
                )
            case ":":
                self.add_token(TokenType.COLON)
            case ",":
                self.add_token(TokenType.COMMA)
            case "_":
                self.add_token(TokenType.UNDERSCORE)
            case "+":
                self.add_token(TokenType.PLUS)
            case "-":
                self.add_token(TokenType.MINUS)
            case "*":
                self.add_token(TokenType.STAR)
            case "/":
                if self.match("/"):
                    self.scan_comment()
                elif self.match("*"):
                    self.scan_comment_multiline()
                else:
                    self.add_token(TokenType.SLASH)
            case "\n":
                self.add_token(TokenType.NEWLINE)
            case " " | "\r" | "\t":
                # Consume all whitespace characters until EOL or EOF
                while (
                    self.peek().isspace()
                    and self.peek() != "\n"
                    and not self.is_at_end()
                ):
                    self.advance()
                self.add_token(TokenType.WHITESPACE)
            case _:
                if char.isdigit():
                    self.scan_number()
                elif char.isalpha():
                    self.scan_identifier()
                else:
                    self.error("Unexpected character")
        return None
    def scan_number(self):
        """Scan the rest of number and add it as a token
        This method handles both simple integers and floats. Scientific notation
        and base prefixes (0x, 0b, 0o) are not supported
        """
        while self.peek().isdigit():
            self.advance()
        if self.peek() == "." and self.peek_next().isdigit():
            self.advance()
            while self.peek().isdigit():
                self.advance()
        value: float = float(self.source[self.start : self.idx])
        self.add_token(TokenType.NUMBER, value)
    def scan_identifier(self):
        """Scan the rest of an identifier and add it as a token
        An identifier starts with a letter, followed by any number of
        alphanumerical characters or underscores
        """
        while self.peek().isalnum() or self.peek() == "_":
            self.advance()
        lexeme: str = self.source[self.start : self.idx]
        token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
        self.add_token(token_type)
    def scan_comment(self):
        """Scan the rest of a comment and add it as a token
        A comment starts with `//` and ends at the EOL/EOF
        """
        while self.peek() != "\n" and not self.is_at_end():
            self.advance()
        self.add_token(TokenType.COMMENT)
    def scan_comment_multiline(self):
        """Scan the rest of a multiline comment and add it as a token
        A multiline comment starts with `/*` and ends with `*/` or at the EOF
        """
        while (
            not (self.peek() == "*" and self.peek_next() == "/")
            and not self.is_at_end()
        ):
            self.advance()
        if not self.is_at_end():
            self.advance()
        if not self.is_at_end():
            self.advance()
        self.add_token(TokenType.COMMENT)
@@ -11,12 +11,23 @@ class TokenType(Enum):
    RIGHT_PAREN = auto()
    LEFT_BRACKET = auto()
    RIGHT_BRACKET = auto()
    LEFT_BRACE = auto()
    RIGHT_BRACE = auto()
    COLON = auto()
    COMMA = auto()
    UNDERSCORE = auto()
    # Operators
    PLUS = auto()
    MINUS = auto()
    STAR = auto()
    SLASH = auto()
    GREATER = auto()
    GREATER_EQUAL = auto()
    LESS = auto()
    LESS_EQUAL = auto()
    EQUAL = auto()
    EQUAL_EQUAL = auto()
    # Literals
    IDENTIFIER = auto()
@@ -25,6 +36,11 @@ class TokenType(Enum):
    FALSE = auto()
    NONE = auto()
    # Keywords
    TYPE = auto()
    OP = auto()
    CONSTRAINT = auto()
    # Misc
    COMMENT = auto()
    WHITESPACE = auto()
@@ -35,6 +51,7 @@ class TokenType(Enum):
@dataclass(frozen=True)
 class Token:
    """A scanned token"""
    type: TokenType
    lexeme: str
    value: Any