feat(parser): add base lexer class

the lexer and token structures were adapted from another project (see docstring on the Lexer class)
2026-05-13 19:17:55 +02:00
parent 9b59306604
commit fedc582e16
5 changed files with 224 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
-.vscode
+.vscode
+__pycache__
+.env
+venv
+.venv
+*.pyc
--- a/lexer/__init__.py
+++ b/lexer/__init__.py
--- a/lexer/base.py
+++ b/lexer/base.py
@@ -0,0 +1,166 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Optional
+
+from lexer.position import Position
+from lexer.token import Token, TokenType
+
+
+class Lexer(ABC):
+    """An abstract lexer which provides methods to easily extend it into a concrete one
+
+    This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom,
+    more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble)
+
+    [1]: https://craftinginterpreters.com/
+    """
+
+    def __init__(self, source: str, file: Optional[str] = None) -> None:
+        """Create a new lexer to scan for tokens in the given source
+
+        Args:
+            source (str): the source to scan
+            file (Optional[str], optional): the path of the given source. Can be a file path or any string identifier. Defaults to None.
+        """
+        self.source: str = source
+        self.file: Optional[str] = file
+        self.tokens: list[Token] = []
+        self.start: int = 0
+        self.idx: int = 0
+        self.length: int = len(self.source)
+        self.line: int = 1
+        self.column: int = 1
+        self.start_pos: Position = self.get_position()
+
+    def error(self, msg: str):
+        """Raise a syntax error located at the start of the lexeme currently being scanned
+
+        Args:
+            msg (str): the error message
+
+        Raises:
+            SyntaxError
+        """
+        raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")
+
+    def process(self) -> list[Token]:
+        """Scan tokens out of the source text and terminate the list with an EOF token
+
+        Returns:
+            list[Token]: all the tokens that could be scanned
+
+        Raises:
+            SyntaxError: if a syntax error is found
+        """
+        self.scan_tokens()
+        self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
+        return self.tokens
+
+    def is_at_end(self) -> bool:
+        """Whether the lexer is at the end of the source
+
+        Returns:
+            bool: True if the current index is at the end of the source
+        """
+        return self.idx >= self.length
+
+    def get_position(self) -> Position:
+        """Get the current position
+
+        Returns:
+            Position: the current position
+        """
+        return Position(file=self.file, line=self.line, column=self.column)
+
+    def peek(self) -> str:
+        """Get the current character without advancing, if any
+
+        Returns:
+            str: the current character, or an empty string if at EOF
+        """
+        if self.idx < self.length:
+            return self.source[self.idx]
+        return ""
+
+    def peek_next(self) -> str:
+        """Get the next character without advancing, if any
+
+        Returns:
+            str: the next character, or an empty string if at EOF
+        """
+        if self.idx + 1 < self.length:
+            return self.source[self.idx + 1]
+        return ""
+
+    def advance(self) -> str:
+        """Consume the current character, if any, and move the cursor past it
+
+        Returns:
+            str: the consumed character, or an empty string if at EOF (the cursor is then left untouched)
+        """
+        char: str = self.peek()
+        if not char:  # EOF: nothing was consumed, so index and column must not drift
+            return char
+        self.idx += 1
+        self.column += 1
+        if char == "\n":
+            self.newline()
+        return char
+
+    def newline(self) -> None:
+        """Update the current position after encountering a newline character"""
+        self.line += 1
+        self.column = 1
+
+    def match(self, expected: str) -> bool:
+        """Consume the current character if it matches the given value
+
+        Args:
+            expected (str): the expected character
+
+        Returns:
+            bool: whether a character was matched and consumed (always False at EOF)
+        """
+        if not self.is_at_end() and self.peek() == expected:
+            self.advance()
+            return True
+        return False
+
+    def update_start(self) -> None:
+        """Update the starting position of the current lexeme
+
+        The cursor marking the start of the lexeme currently being scanned is
+        moved to the current position
+        """
+        self.start_pos = self.get_position()
+        self.start = self.idx
+
+    def add_token(self, token_type: TokenType, value: Optional[Any] = None) -> None:
+        """Add the current lexeme (source text from the lexeme start to the cursor) to the scanned tokens
+
+        Args:
+            token_type (TokenType): the type of token to add
+            value (Optional[Any], optional): the value of the token (useful for numbers or constants). Defaults to None.
+        """
+        lexeme: str = self.source[self.start : self.idx]
+        self.tokens.append(
+            Token(position=self.start_pos, type=token_type, lexeme=lexeme, value=value)
+        )
+
+    def scan_tokens(self, condition: Optional[Callable[[], bool]] = None) -> None:
+        """Scan tokens until EOF is reached or the given condition becomes False
+
+        Args:
+            condition (Optional[Callable[[], bool]], optional): the condition to continue scanning tokens.
+                If None, it is treated as always True, effectively scanning tokens until EOF is reached. Defaults to None.
+        """
+        while (condition is None or condition()) and not self.is_at_end():
+            self.update_start()
+            self.scan_token()
+
+    @abstractmethod
+    def scan_token(self) -> None:
+        """Scan a token
+
+        This function should (at least) consume the current character and produce the appropriate token(s), using `add_token`
+        """
+        pass
--- a/lexer/position.py
+++ b/lexer/position.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class Position:
+    """An immutable position of a token, displayed as `file:Lline:column` (or `Lline:column` without a file)"""
+    file: Optional[str]  # source identifier (a path or any label); None when there is none
+    line: int  # line number
+    column: int  # column number within the line
+
+    def __repr__(self) -> str:
+        return f"{self.file + ':' if self.file else ''}L{self.line}:{self.column}"
--- a/lexer/token.py
+++ b/lexer/token.py
@@ -0,0 +1,39 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any
+
+from lexer.position import Position
+
+
+class TokenType(Enum):  # the kinds of token a lexer can emit; member values are generated by auto()
+    # Punctuation
+    LEFT_PAREN = auto()
+    RIGHT_PAREN = auto()
+    COLON = auto()
+    COMMA = auto()
+    UNDERSCORE = auto()
+
+    # Operators
+    PLUS = auto()
+
+    # Literals
+    IDENTIFIER = auto()
+    NUMBER = auto()
+    TRUE = auto()
+    FALSE = auto()
+    NONE = auto()
+
+    # Misc
+    COMMENT = auto()
+    WHITESPACE = auto()
+    EOF = auto()  # appended by Lexer.process() once scanning is done
+    NEWLINE = auto()
+
+
+@dataclass(frozen=True)
+class Token:
+    """A scanned token (immutable once created)"""
+    type: TokenType  # the kind of token
+    lexeme: str  # the source text of the token (Lexer.add_token slices it from the source; empty for EOF)
+    value: Any  # optional payload, e.g. for numbers or constants (Lexer.add_token defaults it to None)
+    position: Position  # where the token was found (Lexer.add_token passes the start of the lexeme)