feat(parser): add basic lexer for type definitions

This commit is contained in:
2026-05-13 22:06:32 +02:00
parent fcbea218a4
commit 1fc842e23f
5 changed files with 157 additions and 5 deletions

View File

@@ -21,4 +21,4 @@ type Age<int + (0 <= _ < 150)>
// Predefined custom constraints that can be referenced in other definitions
constraint Positive = _ >= 0
constraint StrictlyPositive = _ > 0
constraint Even = _ % 2 == 0
//constraint Even = _ % 2 == 0

View File

@@ -74,7 +74,7 @@ class AnnotationLexer(Lexer):
def scan_comment(self):
"""Scan the rest of a comment and add it as a token
A comment starts with a '#' character and ends at the EOL/EOF
A comment starts with a `#` character and ends at the EOL/EOF
"""
while self.peek() != "\n" and not self.is_at_end():
self.advance()

9
lexer/keyword.py Normal file
View File

@@ -0,0 +1,9 @@
from lexer.token import TokenType
# Reserved words, keyed by their exact (case-sensitive) source spelling.
# A lexeme found in this table maps to its dedicated token type.
KEYWORDS: dict[str, TokenType] = {
    # Definition keywords
    "type": TokenType.TYPE,
    "op": TokenType.OP,
    "constraint": TokenType.CONSTRAINT,
    # Boolean literals
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
}

126
lexer/midas.py Normal file
View File

@@ -0,0 +1,126 @@
from lexer.base import Lexer
from lexer.keyword import KEYWORDS
from lexer.token import TokenType
class MidasLexer(Lexer):
def scan_token(self) -> None:
char: str = self.advance()
match char:
case "(":
self.add_token(TokenType.LEFT_PAREN)
case ")":
self.add_token(TokenType.RIGHT_PAREN)
case "[":
self.add_token(TokenType.LEFT_BRACKET)
case "]":
self.add_token(TokenType.RIGHT_BRACKET)
case "{":
self.add_token(TokenType.LEFT_BRACE)
case "}":
self.add_token(TokenType.RIGHT_BRACE)
case "<":
self.add_token(
TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS
)
case ">":
self.add_token(
TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER
)
case "=":
self.add_token(
TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL
)
case ":":
self.add_token(TokenType.COLON)
case ",":
self.add_token(TokenType.COMMA)
case "_":
self.add_token(TokenType.UNDERSCORE)
case "+":
self.add_token(TokenType.PLUS)
case "-":
self.add_token(TokenType.MINUS)
case "*":
self.add_token(TokenType.STAR)
case "/":
if self.match("/"):
self.scan_comment()
elif self.match("*"):
self.scan_comment_multiline()
else:
self.add_token(TokenType.SLASH)
case "\n":
self.add_token(TokenType.NEWLINE)
case " " | "\r" | "\t":
# Consume all whitespace characters until EOL or EOF
while (
self.peek().isspace()
and self.peek() != "\n"
and not self.is_at_end()
):
self.advance()
self.add_token(TokenType.WHITESPACE)
case _:
if char.isdigit():
self.scan_number()
elif char.isalpha():
self.scan_identifier()
else:
self.error("Unexpected character")
return None
def scan_number(self):
"""Scan the rest of number and add it as a token
This method handles both simple integers and floats. Scientific notation
and base prefixes (0x, 0b, 0o) are not supported
"""
while self.peek().isdigit():
self.advance()
if self.peek() == "." and self.peek_next().isdigit():
self.advance()
while self.peek().isdigit():
self.advance()
value: float = float(self.source[self.start : self.idx])
self.add_token(TokenType.NUMBER, value)
def scan_identifier(self):
"""Scan the rest of an identifier and add it as a token
An identifier starts with a letter, followed by any number of
alphanumerical characters or underscores
"""
while self.peek().isalnum() or self.peek() == "_":
self.advance()
lexeme: str = self.source[self.start : self.idx]
token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
self.add_token(token_type)
def scan_comment(self):
"""Scan the rest of a comment and add it as a token
A comment starts with `//` and ends at the EOL/EOF
"""
while self.peek() != "\n" and not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)
def scan_comment_multiline(self):
"""Scan the rest of a multiline comment and add it as a token
A multiline comment starts with `/*` and ends with `*/` or at the EOF
"""
while (
not (self.peek() == "*" and self.peek_next() == "/")
and not self.is_at_end()
):
self.advance()
if not self.is_at_end():
self.advance()
if not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)

View File

@@ -11,12 +11,23 @@ class TokenType(Enum):
RIGHT_PAREN = auto()
LEFT_BRACKET = auto()
RIGHT_BRACKET = auto()
LEFT_BRACE = auto()
RIGHT_BRACE = auto()
COLON = auto()
COMMA = auto()
UNDERSCORE = auto()
# Operators
PLUS = auto()
MINUS = auto()
STAR = auto()
SLASH = auto()
GREATER = auto()
GREATER_EQUAL = auto()
LESS = auto()
LESS_EQUAL = auto()
EQUAL = auto()
EQUAL_EQUAL = auto()
# Literals
IDENTIFIER = auto()
@@ -25,6 +36,11 @@ class TokenType(Enum):
FALSE = auto()
NONE = auto()
# Keywords
TYPE = auto()
OP = auto()
CONSTRAINT = auto()
# Misc
COMMENT = auto()
WHITESPACE = auto()
@@ -35,6 +51,7 @@ class TokenType(Enum):
@dataclass(frozen=True)
class Token:
"""A scanned token"""
type: TokenType
lexeme: str
value: Any