# Reconstructed from a whitespace-collapsed git patch.
#
# The patch created three modules, shown here together in dependency order:
#   * lexer/position.py -> Position
#   * lexer/token.py    -> TokenType, Token
#   * lexer/base.py     -> Lexer
# (lexer/__init__.py was added empty.)  Because the definitions are
# co-located below, the intra-package imports used by the patch
# (`from lexer.position import Position`,
# `from lexer.token import Token, TokenType`) are not needed here.
#
# The same patch also extended .gitignore with the entries:
#   .vscode, __pycache__, .env, venv, .venv, *.pyc

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from typing import Any, Callable, NoReturn, Optional


@dataclass(frozen=True)
class Position:
    """A simple structure to store the position of a token"""

    file: Optional[str]
    line: int
    column: int

    def __repr__(self) -> str:
        # No separator is inserted between the file identifier and "L";
        # a missing file is rendered as an empty prefix.
        return f"{self.file or ''}L{self.line}:{self.column}"


class TokenType(Enum):
    """The kinds of token a lexer can produce"""

    # Punctuation
    LEFT_PAREN = auto()
    RIGHT_PAREN = auto()
    COLON = auto()
    COMMA = auto()
    UNDERSCORE = auto()

    # Operators
    PLUS = auto()

    # Literals
    IDENTIFIER = auto()
    NUMBER = auto()
    TRUE = auto()
    FALSE = auto()
    NONE = auto()

    # Misc
    COMMENT = auto()
    WHITESPACE = auto()
    EOF = auto()
    NEWLINE = auto()


@dataclass(frozen=True)
class Token:
    """A scanned token"""

    type: TokenType
    lexeme: str
    value: Any
    position: Position


class Lexer(ABC):
    """An abstract lexer which provides methods to easily extend it into a concrete one

    This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom,
    more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble)

    [1]: https://craftinginterpreters.com/
    """

    def __init__(self, source: str, file: Optional[str] = None) -> None:
        """Create a new lexer to scan for tokens in the given source

        Args:
            source (str): the source to scan
            file (Optional[str], optional): the path of the given source.
                Can be a file path or any string identifier. Defaults to None.
        """
        self.source: str = source
        self.file: Optional[str] = file
        self.tokens: list[Token] = []
        # Index of the first character of the lexeme being scanned
        self.start: int = 0
        # Index of the character that `peek` returns / `advance` consumes
        self.idx: int = 0
        self.length: int = len(self.source)
        # Line and column both start at 1
        self.line: int = 1
        self.column: int = 1
        self.start_pos: Position = self.get_position()

    def error(self, msg: str) -> NoReturn:
        """Raise a syntax error located at the start of the current lexeme

        Args:
            msg (str): the error message

        Raises:
            SyntaxError: always
        """
        raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")

    def process(self) -> list[Token]:
        """Scan tokens out of the source text

        An EOF token carrying the final position is appended after scanning.

        Returns:
            list[Token]: all the tokens that could be scanned

        Raises:
            SyntaxError: if a syntax error is found
        """
        self.scan_tokens()
        self.tokens.append(
            Token(
                type=TokenType.EOF,
                lexeme="",
                value=None,
                position=self.get_position(),
            )
        )
        return self.tokens

    def is_at_end(self) -> bool:
        """Whether the lexer is at the end of the source

        Returns:
            bool: True if the current index is at the end of the source
        """
        return self.idx >= self.length

    def get_position(self) -> Position:
        """Get the current position

        Returns:
            Position: the current position
        """
        return Position(file=self.file, line=self.line, column=self.column)

    def peek(self) -> str:
        """Get the current character without advancing, if any

        Returns:
            str: the current character, or an empty string if at EOF
        """
        if self.idx < self.length:
            return self.source[self.idx]
        return ""

    def peek_next(self) -> str:
        """Get the character after the current one without advancing, if any

        Returns:
            str: the next character, or an empty string if there is none
        """
        if self.idx + 1 < self.length:
            return self.source[self.idx + 1]
        return ""

    def advance(self) -> str:
        """Consume the current character and move to the next one

        At EOF nothing is consumed and the cursor does not move, so the
        index and column can never drift past the end of the source.

        Returns:
            str: the consumed character, or an empty string if at EOF
        """
        if self.is_at_end():
            return ""
        char: str = self.source[self.idx]
        self.idx += 1
        # The column is bumped before `newline` runs, so `newline` is the
        # single place that decides what the position is after a "\n".
        self.column += 1
        if char == "\n":
            self.newline()
        return char

    def newline(self) -> None:
        """Update the current position after encountering a newline character"""
        self.line += 1
        self.column = 1

    def match(self, expected: str) -> bool:
        """Consume the current character if it matches the given value

        Args:
            expected (str): the expected character

        Returns:
            bool: whether a character was matched and consumed
        """
        # `peek` returns "" at EOF; the explicit end check keeps an empty
        # `expected` from "matching" when there is nothing left to consume.
        if self.is_at_end() or self.peek() != expected:
            return False
        self.advance()
        return True

    def update_start(self) -> None:
        """Update the starting position of the current lexeme

        The cursor marking the start of the lexeme currently being scanned is
        moved to the current position
        """
        self.start_pos = self.get_position()
        self.start = self.idx

    def add_token(self, token_type: TokenType, value: Optional[Any] = None) -> None:
        """Add the current lexeme to the list of scanned tokens

        The lexeme is the source text between the start cursor and the
        current index; the token is positioned at the start of the lexeme.

        Args:
            token_type (TokenType): the type of token to add
            value (Optional[Any], optional): the value of the token
                (useful for numbers or constants). Defaults to None.
        """
        lexeme: str = self.source[self.start : self.idx]
        self.tokens.append(
            Token(position=self.start_pos, type=token_type, lexeme=lexeme, value=value)
        )

    def scan_tokens(self, condition: Optional[Callable[[], bool]] = None) -> None:
        """Scan tokens until EOF is reached or the given condition becomes False

        Args:
            condition (Optional[Callable[[], bool]], optional): the condition to
                continue scanning tokens. It is evaluated before every token.
                If None, tokens are scanned until EOF is reached. Defaults to None.
        """
        # The condition (when given) is checked before the EOF test.
        while (condition is None or condition()) and not self.is_at_end():
            self.update_start()
            self.scan_token()

    @abstractmethod
    def scan_token(self) -> None:
        """Scan a token

        This function should (at least) consume the current character and
        produce the appropriate token(s), using `add_token`
        """