feat(parser): add basic lexer for type definitions
This commit is contained in:
@@ -21,4 +21,4 @@ type Age<int + (0 <= _ < 150)>
|
||||
// Predefined custom constraints that can be referenced in other definitions
|
||||
constraint Positive = _ >= 0
|
||||
constraint StrictlyPositive = _ > 0
|
||||
constraint Even = _ % 2 == 0
|
||||
//constraint Even = _ % 2 == 0
|
||||
@@ -46,7 +46,7 @@ class AnnotationLexer(Lexer):
|
||||
|
||||
def scan_number(self):
|
||||
"""Scan the rest of number and add it as a token
|
||||
|
||||
|
||||
This method handles both simple integers and floats. Scientific notation
|
||||
and base prefixes (0x, 0b, 0o) are not supported
|
||||
"""
|
||||
@@ -63,7 +63,7 @@ class AnnotationLexer(Lexer):
|
||||
|
||||
def scan_identifier(self):
|
||||
"""Scan the rest of an identifier and add it as a token
|
||||
|
||||
|
||||
An identifier starts with a letter, followed by any number of
|
||||
alphanumerical characters or underscores
|
||||
"""
|
||||
@@ -73,8 +73,8 @@ class AnnotationLexer(Lexer):
|
||||
|
||||
def scan_comment(self):
|
||||
"""Scan the rest of a comment and add it as a token
|
||||
|
||||
A comment starts with a '#' character and ends at the EOL/EOF
|
||||
|
||||
A comment starts with a `#` character and ends at the EOL/EOF
|
||||
"""
|
||||
while self.peek() != "\n" and not self.is_at_end():
|
||||
self.advance()
|
||||
|
||||
9
lexer/keyword.py
Normal file
9
lexer/keyword.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from lexer.token import TokenType
|
||||
|
||||
# Reserved words of the language, keyed by their exact (case-sensitive)
# spelling. The identifier scanner looks a finished lexeme up here and falls
# back to a plain identifier token when the lexeme is not present.
KEYWORDS: dict[str, TokenType] = {
    # Definition keywords
    "type": TokenType.TYPE,
    "op": TokenType.OP,
    "constraint": TokenType.CONSTRAINT,
    # Boolean literals
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
}
|
||||
126
lexer/midas.py
Normal file
126
lexer/midas.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from lexer.base import Lexer
|
||||
from lexer.keyword import KEYWORDS
|
||||
from lexer.token import TokenType
|
||||
|
||||
|
||||
class MidasLexer(Lexer):
|
||||
def scan_token(self) -> None:
|
||||
char: str = self.advance()
|
||||
match char:
|
||||
case "(":
|
||||
self.add_token(TokenType.LEFT_PAREN)
|
||||
case ")":
|
||||
self.add_token(TokenType.RIGHT_PAREN)
|
||||
case "[":
|
||||
self.add_token(TokenType.LEFT_BRACKET)
|
||||
case "]":
|
||||
self.add_token(TokenType.RIGHT_BRACKET)
|
||||
case "{":
|
||||
self.add_token(TokenType.LEFT_BRACE)
|
||||
case "}":
|
||||
self.add_token(TokenType.RIGHT_BRACE)
|
||||
case "<":
|
||||
self.add_token(
|
||||
TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS
|
||||
)
|
||||
case ">":
|
||||
self.add_token(
|
||||
TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER
|
||||
)
|
||||
case "=":
|
||||
self.add_token(
|
||||
TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL
|
||||
)
|
||||
case ":":
|
||||
self.add_token(TokenType.COLON)
|
||||
case ",":
|
||||
self.add_token(TokenType.COMMA)
|
||||
case "_":
|
||||
self.add_token(TokenType.UNDERSCORE)
|
||||
case "+":
|
||||
self.add_token(TokenType.PLUS)
|
||||
case "-":
|
||||
self.add_token(TokenType.MINUS)
|
||||
case "*":
|
||||
self.add_token(TokenType.STAR)
|
||||
case "/":
|
||||
if self.match("/"):
|
||||
self.scan_comment()
|
||||
elif self.match("*"):
|
||||
self.scan_comment_multiline()
|
||||
else:
|
||||
self.add_token(TokenType.SLASH)
|
||||
case "\n":
|
||||
self.add_token(TokenType.NEWLINE)
|
||||
case " " | "\r" | "\t":
|
||||
# Consume all whitespace characters until EOL or EOF
|
||||
while (
|
||||
self.peek().isspace()
|
||||
and self.peek() != "\n"
|
||||
and not self.is_at_end()
|
||||
):
|
||||
self.advance()
|
||||
self.add_token(TokenType.WHITESPACE)
|
||||
case _:
|
||||
if char.isdigit():
|
||||
self.scan_number()
|
||||
elif char.isalpha():
|
||||
self.scan_identifier()
|
||||
else:
|
||||
self.error("Unexpected character")
|
||||
return None
|
||||
|
||||
def scan_number(self):
|
||||
"""Scan the rest of number and add it as a token
|
||||
|
||||
This method handles both simple integers and floats. Scientific notation
|
||||
and base prefixes (0x, 0b, 0o) are not supported
|
||||
"""
|
||||
while self.peek().isdigit():
|
||||
self.advance()
|
||||
|
||||
if self.peek() == "." and self.peek_next().isdigit():
|
||||
self.advance()
|
||||
while self.peek().isdigit():
|
||||
self.advance()
|
||||
|
||||
value: float = float(self.source[self.start : self.idx])
|
||||
self.add_token(TokenType.NUMBER, value)
|
||||
|
||||
def scan_identifier(self):
|
||||
"""Scan the rest of an identifier and add it as a token
|
||||
|
||||
An identifier starts with a letter, followed by any number of
|
||||
alphanumerical characters or underscores
|
||||
"""
|
||||
while self.peek().isalnum() or self.peek() == "_":
|
||||
self.advance()
|
||||
|
||||
lexeme: str = self.source[self.start : self.idx]
|
||||
token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
|
||||
self.add_token(token_type)
|
||||
|
||||
def scan_comment(self):
|
||||
"""Scan the rest of a comment and add it as a token
|
||||
|
||||
A comment starts with `//` and ends at the EOL/EOF
|
||||
"""
|
||||
while self.peek() != "\n" and not self.is_at_end():
|
||||
self.advance()
|
||||
self.add_token(TokenType.COMMENT)
|
||||
|
||||
def scan_comment_multiline(self):
|
||||
"""Scan the rest of a multiline comment and add it as a token
|
||||
|
||||
A multiline comment starts with `/*` and ends with `*/` or at the EOF
|
||||
"""
|
||||
while (
|
||||
not (self.peek() == "*" and self.peek_next() == "/")
|
||||
and not self.is_at_end()
|
||||
):
|
||||
self.advance()
|
||||
if not self.is_at_end():
|
||||
self.advance()
|
||||
if not self.is_at_end():
|
||||
self.advance()
|
||||
self.add_token(TokenType.COMMENT)
|
||||
@@ -11,12 +11,23 @@ class TokenType(Enum):
|
||||
RIGHT_PAREN = auto()
|
||||
LEFT_BRACKET = auto()
|
||||
RIGHT_BRACKET = auto()
|
||||
LEFT_BRACE = auto()
|
||||
RIGHT_BRACE = auto()
|
||||
COLON = auto()
|
||||
COMMA = auto()
|
||||
UNDERSCORE = auto()
|
||||
|
||||
# Operators
|
||||
PLUS = auto()
|
||||
MINUS = auto()
|
||||
STAR = auto()
|
||||
SLASH = auto()
|
||||
GREATER = auto()
|
||||
GREATER_EQUAL = auto()
|
||||
LESS = auto()
|
||||
LESS_EQUAL = auto()
|
||||
EQUAL = auto()
|
||||
EQUAL_EQUAL = auto()
|
||||
|
||||
# Literals
|
||||
IDENTIFIER = auto()
|
||||
@@ -25,6 +36,11 @@ class TokenType(Enum):
|
||||
FALSE = auto()
|
||||
NONE = auto()
|
||||
|
||||
# Keywords
|
||||
TYPE = auto()
|
||||
OP = auto()
|
||||
CONSTRAINT = auto()
|
||||
|
||||
# Misc
|
||||
COMMENT = auto()
|
||||
WHITESPACE = auto()
|
||||
@@ -35,6 +51,7 @@ class TokenType(Enum):
|
||||
@dataclass(frozen=True)
|
||||
class Token:
|
||||
"""A scanned token"""
|
||||
|
||||
type: TokenType
|
||||
lexeme: str
|
||||
value: Any
|
||||
|
||||
Reference in New Issue
Block a user