feat(parser): add basic lexer for type definitions

This commit is contained in:
2026-05-13 22:06:32 +02:00
parent fcbea218a4
commit 1fc842e23f
5 changed files with 157 additions and 5 deletions

View File

@@ -21,4 +21,4 @@ type Age<int + (0 <= _ < 150)>
// Predefined custom constraints that can be referenced in other definitions // Predefined custom constraints that can be referenced in other definitions
constraint Positive = _ >= 0 constraint Positive = _ >= 0
constraint StrictlyPositive = _ > 0 constraint StrictlyPositive = _ > 0
constraint Even = _ % 2 == 0 //constraint Even = _ % 2 == 0

View File

@@ -46,7 +46,7 @@ class AnnotationLexer(Lexer):
def scan_number(self): def scan_number(self):
"""Scan the rest of number and add it as a token """Scan the rest of number and add it as a token
This method handles both simple integers and floats. Scientific notation This method handles both simple integers and floats. Scientific notation
and base prefixes (0x, 0b, 0o) are not supported and base prefixes (0x, 0b, 0o) are not supported
""" """
@@ -63,7 +63,7 @@ class AnnotationLexer(Lexer):
def scan_identifier(self): def scan_identifier(self):
"""Scan the rest of an identifier and add it as a token """Scan the rest of an identifier and add it as a token
An identifier starts with a letter, followed by any number of An identifier starts with a letter, followed by any number of
alphanumerical characters or underscores alphanumerical characters or underscores
""" """
@@ -73,8 +73,8 @@ class AnnotationLexer(Lexer):
def scan_comment(self): def scan_comment(self):
"""Scan the rest of a comment and add it as a token """Scan the rest of a comment and add it as a token
A comment starts with a '#' character and ends at the EOL/EOF A comment starts with a `#` character and ends at the EOL/EOF
""" """
while self.peek() != "\n" and not self.is_at_end(): while self.peek() != "\n" and not self.is_at_end():
self.advance() self.advance()

9
lexer/keyword.py Normal file
View File

@@ -0,0 +1,9 @@
from lexer.token import TokenType
# Reserved words, mapped to the token type emitted for each of them.
# Lookups are exact-match on the scanned lexeme (case-sensitive).
KEYWORDS: dict[str, TokenType] = {
    # Definition keywords
    "type": TokenType.TYPE,
    "op": TokenType.OP,
    "constraint": TokenType.CONSTRAINT,
    # Boolean literals
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
}

126
lexer/midas.py Normal file
View File

@@ -0,0 +1,126 @@
from lexer.base import Lexer
from lexer.keyword import KEYWORDS
from lexer.token import TokenType
class MidasLexer(Lexer):
def scan_token(self) -> None:
char: str = self.advance()
match char:
case "(":
self.add_token(TokenType.LEFT_PAREN)
case ")":
self.add_token(TokenType.RIGHT_PAREN)
case "[":
self.add_token(TokenType.LEFT_BRACKET)
case "]":
self.add_token(TokenType.RIGHT_BRACKET)
case "{":
self.add_token(TokenType.LEFT_BRACE)
case "}":
self.add_token(TokenType.RIGHT_BRACE)
case "<":
self.add_token(
TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS
)
case ">":
self.add_token(
TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER
)
case "=":
self.add_token(
TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL
)
case ":":
self.add_token(TokenType.COLON)
case ",":
self.add_token(TokenType.COMMA)
case "_":
self.add_token(TokenType.UNDERSCORE)
case "+":
self.add_token(TokenType.PLUS)
case "-":
self.add_token(TokenType.MINUS)
case "*":
self.add_token(TokenType.STAR)
case "/":
if self.match("/"):
self.scan_comment()
elif self.match("*"):
self.scan_comment_multiline()
else:
self.add_token(TokenType.SLASH)
case "\n":
self.add_token(TokenType.NEWLINE)
case " " | "\r" | "\t":
# Consume all whitespace characters until EOL or EOF
while (
self.peek().isspace()
and self.peek() != "\n"
and not self.is_at_end()
):
self.advance()
self.add_token(TokenType.WHITESPACE)
case _:
if char.isdigit():
self.scan_number()
elif char.isalpha():
self.scan_identifier()
else:
self.error("Unexpected character")
return None
def scan_number(self):
"""Scan the rest of number and add it as a token
This method handles both simple integers and floats. Scientific notation
and base prefixes (0x, 0b, 0o) are not supported
"""
while self.peek().isdigit():
self.advance()
if self.peek() == "." and self.peek_next().isdigit():
self.advance()
while self.peek().isdigit():
self.advance()
value: float = float(self.source[self.start : self.idx])
self.add_token(TokenType.NUMBER, value)
def scan_identifier(self):
"""Scan the rest of an identifier and add it as a token
An identifier starts with a letter, followed by any number of
alphanumerical characters or underscores
"""
while self.peek().isalnum() or self.peek() == "_":
self.advance()
lexeme: str = self.source[self.start : self.idx]
token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
self.add_token(token_type)
def scan_comment(self):
"""Scan the rest of a comment and add it as a token
A comment starts with `//` and ends at the EOL/EOF
"""
while self.peek() != "\n" and not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)
def scan_comment_multiline(self):
"""Scan the rest of a multiline comment and add it as a token
A multiline comment starts with `/*` and ends with `*/` or at the EOF
"""
while (
not (self.peek() == "*" and self.peek_next() == "/")
and not self.is_at_end()
):
self.advance()
if not self.is_at_end():
self.advance()
if not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)

View File

@@ -11,12 +11,23 @@ class TokenType(Enum):
RIGHT_PAREN = auto() RIGHT_PAREN = auto()
LEFT_BRACKET = auto() LEFT_BRACKET = auto()
RIGHT_BRACKET = auto() RIGHT_BRACKET = auto()
LEFT_BRACE = auto()
RIGHT_BRACE = auto()
COLON = auto() COLON = auto()
COMMA = auto() COMMA = auto()
UNDERSCORE = auto() UNDERSCORE = auto()
# Operators # Operators
PLUS = auto() PLUS = auto()
MINUS = auto()
STAR = auto()
SLASH = auto()
GREATER = auto()
GREATER_EQUAL = auto()
LESS = auto()
LESS_EQUAL = auto()
EQUAL = auto()
EQUAL_EQUAL = auto()
# Literals # Literals
IDENTIFIER = auto() IDENTIFIER = auto()
@@ -25,6 +36,11 @@ class TokenType(Enum):
FALSE = auto() FALSE = auto()
NONE = auto() NONE = auto()
# Keywords
TYPE = auto()
OP = auto()
CONSTRAINT = auto()
# Misc # Misc
COMMENT = auto() COMMENT = auto()
WHITESPACE = auto() WHITESPACE = auto()
@@ -35,6 +51,7 @@ class TokenType(Enum):
@dataclass(frozen=True) @dataclass(frozen=True)
class Token: class Token:
"""A scanned token""" """A scanned token"""
type: TokenType type: TokenType
lexeme: str lexeme: str
value: Any value: Any