188 lines
6.0 KiB
Python
188 lines
6.0 KiB
Python
from typing import Optional, Any
|
|
|
|
from src.keyword import KEYWORDS
|
|
from src.position import Position
|
|
from src.token import Token, TokenType
|
|
|
|
|
|
class Lexer:
|
|
def __init__(self):
|
|
self.path: str = "<main>"
|
|
self.source: str = ""
|
|
self.tokens: list[Token] = []
|
|
self.start: int = 0
|
|
self.idx: int = 0
|
|
self.length: int = 0
|
|
self.line: int = 1
|
|
self.column: int = 1
|
|
self.start_pos: Position = self.get_position()
|
|
|
|
def error(self, msg: str):
|
|
raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")
|
|
|
|
def process(self, source: str, path: Optional[str] = None) -> list[Token]:
|
|
self.path = path or "<main>"
|
|
self.source = source
|
|
self.tokens = []
|
|
self.start = 0
|
|
self.idx = 0
|
|
self.length = len(self.source)
|
|
self.line = 1
|
|
self.column = 1
|
|
|
|
while not self.is_at_end():
|
|
self.start_pos = self.get_position()
|
|
self.start = self.idx
|
|
self.scan_token()
|
|
|
|
self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
|
|
|
|
return self.tokens
|
|
|
|
def is_at_end(self) -> bool:
|
|
return self.idx >= self.length
|
|
|
|
def get_position(self) -> Position:
|
|
return Position(self.path, self.line, self.column)
|
|
|
|
def peek(self) -> str:
|
|
if self.idx < self.length:
|
|
return self.source[self.idx]
|
|
return ""
|
|
|
|
def peek_next(self) -> str:
|
|
if self.idx + 1 < self.length:
|
|
return self.source[self.idx + 1]
|
|
return ""
|
|
|
|
def advance(self) -> str:
|
|
char: str = self.peek()
|
|
self.idx += 1
|
|
self.column += 1
|
|
if char == "\n":
|
|
self.newline()
|
|
return char
|
|
|
|
def newline(self):
|
|
self.line += 1
|
|
self.column = 1
|
|
|
|
def match(self, expected: str) -> bool:
|
|
if self.peek() == expected:
|
|
self.advance()
|
|
return True
|
|
return False
|
|
|
|
def add_token(self, token_type: TokenType, value: Optional[Any] = None):
|
|
lexeme: str = self.source[self.start:self.idx]
|
|
self.tokens.append(
|
|
Token(
|
|
position=self.start_pos,
|
|
type=token_type,
|
|
lexeme=lexeme,
|
|
value=value
|
|
)
|
|
)
|
|
|
|
def scan_token(self):
|
|
char: str = self.advance()
|
|
match char:
|
|
case "(":
|
|
self.add_token(TokenType.LEFT_PAREN)
|
|
case ")":
|
|
self.add_token(TokenType.RIGHT_PAREN)
|
|
case "{":
|
|
self.add_token(TokenType.LEFT_BRACE)
|
|
case "}":
|
|
self.add_token(TokenType.RIGHT_BRACE)
|
|
case ",":
|
|
self.add_token(TokenType.COMMA)
|
|
case ".":
|
|
self.add_token(TokenType.DOT)
|
|
case ";":
|
|
self.add_token(TokenType.SEMICOLON)
|
|
case ":":
|
|
self.add_token(TokenType.COLON)
|
|
case "+":
|
|
self.add_token(TokenType.PLUS_EQUAL if self.match("=") else TokenType.PLUS)
|
|
case "-":
|
|
self.add_token(TokenType.MINUS_EQUAL if self.match("=") else TokenType.MINUS)
|
|
case "*":
|
|
self.add_token(TokenType.STAR_EQUAL if self.match("=") else TokenType.STAR)
|
|
case "=":
|
|
self.add_token(TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL)
|
|
case "!":
|
|
self.add_token(TokenType.BANG_EQUAL if self.match("=") else TokenType.BANG)
|
|
case ">":
|
|
self.add_token(TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER)
|
|
case "<":
|
|
self.add_token(TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS)
|
|
case "/":
|
|
if self.match("/"):
|
|
self.scan_comment()
|
|
elif self.match("*"):
|
|
self.scan_comment_multiline()
|
|
else:
|
|
self.add_token(TokenType.SLASH_EQUAL if self.match("=") else TokenType.SLASH)
|
|
case "\n":
|
|
self.add_token(TokenType.NEWLINE)
|
|
case " " | "\r" | "\t":
|
|
while self.peek().isspace() and self.peek() != "\n" and not self.is_at_end():
|
|
self.advance()
|
|
self.add_token(TokenType.WHITESPACE)
|
|
case '"':
|
|
self.scan_string()
|
|
case _:
|
|
if char.isdigit():
|
|
self.scan_number()
|
|
elif char.isalpha():
|
|
self.scan_identifier()
|
|
else:
|
|
self.error("Unexpected character")
|
|
return None
|
|
|
|
def scan_string(self):
|
|
while self.peek() != '"' and not self.is_at_end():
|
|
self.advance()
|
|
|
|
if self.is_at_end():
|
|
self.error("Unterminated string")
|
|
|
|
self.advance()
|
|
value: str = self.source[self.start + 1:self.idx - 1]
|
|
self.add_token(TokenType.STRING, value)
|
|
|
|
def scan_number(self):
|
|
while self.peek().isdigit():
|
|
self.advance()
|
|
|
|
if self.peek() == "." and self.peek_next().isdigit():
|
|
self.advance()
|
|
while self.peek().isdigit():
|
|
self.advance()
|
|
|
|
value: float = float(self.source[self.start:self.idx])
|
|
self.add_token(TokenType.NUMBER, value)
|
|
|
|
def scan_identifier(self):
|
|
while self.peek().isalnum() or self.peek() == "_":
|
|
self.advance()
|
|
|
|
lexeme: str = self.source[self.start:self.idx]
|
|
token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
|
|
self.add_token(token_type)
|
|
|
|
def scan_comment(self):
|
|
while self.peek() != "\n" and not self.is_at_end():
|
|
self.advance()
|
|
self.add_token(TokenType.COMMENT)
|
|
|
|
def scan_comment_multiline(self):
|
|
while not (self.peek() == "*" and self.peek_next() == "/") and not self.is_at_end():
|
|
self.advance()
|
|
if not self.is_at_end():
|
|
self.advance()
|
|
if not self.is_at_end():
|
|
self.advance()
|
|
self.add_token(TokenType.COMMENT)
|