# Source file: pebble/src/lexer.py
# (file-viewer metadata: 188 lines, 6.0 KiB, Python)

from typing import Optional, Any
from src.keyword import KEYWORDS
from src.position import Position
from src.token import Token, TokenType
class Lexer:
def __init__(self):
self.path: str = "<main>"
self.source: str = ""
self.tokens: list[Token] = []
self.start: int = 0
self.idx: int = 0
self.length: int = 0
self.line: int = 1
self.column: int = 1
self.start_pos: Position = self.get_position()
def error(self, msg: str):
raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")
def process(self, source: str, path: Optional[str] = None) -> list[Token]:
self.path = path or "<main>"
self.source = source
self.tokens = []
self.start = 0
self.idx = 0
self.length = len(self.source)
self.line = 1
self.column = 1
while not self.is_at_end():
self.start_pos = self.get_position()
self.start = self.idx
self.scan_token()
self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
return self.tokens
def is_at_end(self) -> bool:
return self.idx >= self.length
def get_position(self) -> Position:
return Position(self.path, self.line, self.column)
def peek(self) -> str:
if self.idx < self.length:
return self.source[self.idx]
return ""
def peek_next(self) -> str:
if self.idx + 1 < self.length:
return self.source[self.idx + 1]
return ""
def advance(self) -> str:
char: str = self.peek()
self.idx += 1
self.column += 1
if char == "\n":
self.newline()
return char
def newline(self):
self.line += 1
self.column = 1
def match(self, expected: str) -> bool:
if self.peek() == expected:
self.advance()
return True
return False
def add_token(self, token_type: TokenType, value: Optional[Any] = None):
lexeme: str = self.source[self.start:self.idx]
self.tokens.append(
Token(
position=self.start_pos,
type=token_type,
lexeme=lexeme,
value=value
)
)
def scan_token(self):
char: str = self.advance()
match char:
case "(":
self.add_token(TokenType.LEFT_PAREN)
case ")":
self.add_token(TokenType.RIGHT_PAREN)
case "{":
self.add_token(TokenType.LEFT_BRACE)
case "}":
self.add_token(TokenType.RIGHT_BRACE)
case ",":
self.add_token(TokenType.COMMA)
case ".":
self.add_token(TokenType.DOT)
case ";":
self.add_token(TokenType.SEMICOLON)
case ":":
self.add_token(TokenType.COLON)
case "+":
self.add_token(TokenType.PLUS_EQUAL if self.match("=") else TokenType.PLUS)
case "-":
self.add_token(TokenType.MINUS_EQUAL if self.match("=") else TokenType.MINUS)
case "*":
self.add_token(TokenType.STAR_EQUAL if self.match("=") else TokenType.STAR)
case "=":
self.add_token(TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL)
case "!":
self.add_token(TokenType.BANG_EQUAL if self.match("=") else TokenType.BANG)
case ">":
self.add_token(TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER)
case "<":
self.add_token(TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS)
case "/":
if self.match("/"):
self.scan_comment()
elif self.match("*"):
self.scan_comment_multiline()
else:
self.add_token(TokenType.SLASH_EQUAL if self.match("=") else TokenType.SLASH)
case "\n":
self.add_token(TokenType.NEWLINE)
case " " | "\r" | "\t":
while self.peek().isspace() and self.peek() != "\n" and not self.is_at_end():
self.advance()
self.add_token(TokenType.WHITESPACE)
case '"':
self.scan_string()
case _:
if char.isdigit():
self.scan_number()
elif char.isalpha():
self.scan_identifier()
else:
self.error("Unexpected character")
return None
def scan_string(self):
while self.peek() != '"' and not self.is_at_end():
self.advance()
if self.is_at_end():
self.error("Unterminated string")
self.advance()
value: str = self.source[self.start + 1:self.idx - 1]
self.add_token(TokenType.STRING, value)
def scan_number(self):
while self.peek().isdigit():
self.advance()
if self.peek() == "." and self.peek_next().isdigit():
self.advance()
while self.peek().isdigit():
self.advance()
value: float = float(self.source[self.start:self.idx])
self.add_token(TokenType.NUMBER, value)
def scan_identifier(self):
while self.peek().isalnum() or self.peek() == "_":
self.advance()
lexeme: str = self.source[self.start:self.idx]
token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER)
self.add_token(token_type)
def scan_comment(self):
while self.peek() != "\n" and not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)
def scan_comment_multiline(self):
while not (self.peek() == "*" and self.peek_next() == "/") and not self.is_at_end():
self.advance()
if not self.is_at_end():
self.advance()
if not self.is_at_end():
self.advance()
self.add_token(TokenType.COMMENT)