feat(parser): add base lexer class

the lexer and token structures were adapted from another project (see docstring on the Lexer class)
2026-05-13 19:17:55 +02:00
parent 9b59306604
commit fedc582e16
5 changed files with 224 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 .vscode
 __pycache__
 .env
 venv
 .venv
 *.pyc
--- a/lexer/__init__.py
+++ b/lexer/__init__.py
--- a/lexer/base.py
+++ b/lexer/base.py
@@ -0,0 +1,166 @@
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Optional
 from lexer.position import Position
 from lexer.token import Token, TokenType
 class Lexer(ABC):
     """An abstract lexer which provides methods to easily extend it into a concrete one

     Subclasses only have to implement `scan_token`; cursor handling, position
     tracking and token bookkeeping are provided here.

     This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom,
     more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble)

     [1]: https://craftinginterpreters.com/
     """

     def __init__(self, source: str, file: Optional[str] = None) -> None:
         """Create a new lexer to scan for tokens in the given source

         Args:
             source (str): the source to scan
             file (Optional[str], optional): the path of the given source. Can be a file path or any string identifier. Defaults to None.
         """
         self.source: str = source
         self.file: Optional[str] = file
         self.tokens: list[Token] = []

         # `start` marks the first character of the lexeme being scanned,
         # `idx` is the cursor on the next character to read.
         self.start: int = 0
         self.idx: int = 0
         self.length: int = len(self.source)

         # Line and column both start at 1.
         self.line: int = 1
         self.column: int = 1
         self.start_pos: Position = self.get_position()

     def error(self, msg: str):
         """Raise a syntax error located at the start of the current lexeme

         Args:
             msg (str): the error message

         Raises:
             SyntaxError: always
         """
         raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")

     def process(self) -> list[Token]:
         """Scan tokens out of the source text

         An EOF token carrying the final position is appended once scanning is done.

         Returns:
             list[Token]: all the tokens that could be scanned

         Raises:
             SyntaxError: if a syntax error is found
         """
         self.scan_tokens()
         self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
         return self.tokens

     def is_at_end(self) -> bool:
         """Whether the lexer is at the end of the source

         Returns:
             bool: True if the current index is at the end of the source
         """
         return self.idx >= self.length

     def get_position(self) -> Position:
         """Get the current position

         Returns:
             Position: the current position (file, line and column of the cursor)
         """
         return Position(file=self.file, line=self.line, column=self.column)

     def peek(self) -> str:
         """Get the current character without advancing, if any

         Returns:
             str: the current character, or an empty string if at EOF
         """
         if self.idx < self.length:
             return self.source[self.idx]
         return ""

     def peek_next(self) -> str:
         """Get the next character without advancing, if any

         Returns:
             str: the next character, or an empty string if at EOF
         """
         if self.idx + 1 < self.length:
             return self.source[self.idx + 1]
         return ""

     def advance(self) -> str:
         """Get the current character and advance past it

         At EOF nothing is consumed: the cursor and the line/column counters are
         left untouched so that positions reported afterwards stay accurate.

         Returns:
             str: the current character, before advancing, or an empty string if at EOF
         """
         if self.is_at_end():
             # Nothing left to consume; moving the cursor here would push
             # `idx` and `column` past the end of the source.
             return ""

         char: str = self.source[self.idx]
         self.idx += 1
         self.column += 1
         if char == "\n":
             # newline() resets the column that was just incremented
             self.newline()
         return char

     def newline(self) -> None:
         """Update the current position after encountering a newline character"""
         self.line += 1
         self.column = 1

     def match(self, expected: str) -> bool:
         """Consume the current character if it matches the given value

         Args:
             expected (str): the expected character

         Returns:
             bool: whether a character was matched and consumed. Always False at EOF.
         """
         # peek() returns "" at EOF, which must never count as a match
         if self.is_at_end():
             return False
         if self.peek() == expected:
             self.advance()
             return True
         return False

     def update_start(self) -> None:
         """Update the starting position of the current lexeme

         The cursor marking the start of the lexeme currently being scanned is
         moved to the current position
         """
         self.start_pos = self.get_position()
         self.start = self.idx

     def add_token(self, token_type: TokenType, value: Optional[Any] = None) -> None:
         """Add the current lexeme to the list of scanned tokens

         The lexeme is the source text between the last `update_start` call and
         the current cursor.

         Args:
             token_type (TokenType): the type of token to add
             value (Optional[Any], optional): the value of the token (useful for numbers or constants). Defaults to None.
         """
         lexeme: str = self.source[self.start : self.idx]
         self.tokens.append(
             Token(position=self.start_pos, type=token_type, lexeme=lexeme, value=value)
         )

     def scan_tokens(self, condition: Optional[Callable[[], bool]] = None) -> None:
         """Scan tokens until EOF is reached or the given condition becomes False

         Args:
             condition (Optional[Callable[[], bool]], optional): the condition to continue scanning tokens.
                 If None, defaults to always being True, effectively scanning tokens until EOF is reached. Defaults to None.
         """
         if condition is None:
             condition = lambda: True  # noqa: E731

         # The condition is evaluated before the EOF check on every iteration.
         while condition() and not self.is_at_end():
             self.update_start()
             self.scan_token()

     @abstractmethod
     def scan_token(self) -> None:
         """Scan a token

         This function should (at least) consume the current character and produce the appropriate token(s), using `add_token`
         """
         pass
--- a/lexer/position.py
+++ b/lexer/position.py
@@ -0,0 +1,13 @@
 from dataclasses import dataclass
 from typing import Optional
@dataclass(frozen=True)
class Position:
    """A simple structure to store the position of a token

    Instances are immutable (frozen dataclass).
    """

    # Path or any other string identifying the source, None when unknown
    file: Optional[str]
    # Line number in the source
    line: int
    # Column number within the line
    column: int

    def __repr__(self) -> str:
        """Format the position as `<file>L<line>:<column>`

        The file part is omitted when `file` is None or empty.
        """
        return f"{self.file or ''}L{self.line}:{self.column}"
--- a/lexer/token.py
+++ b/lexer/token.py
@@ -0,0 +1,39 @@
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Any
 from lexer.position import Position
 class TokenType(Enum):
     """All the kinds of token a lexer can produce

     Values are assigned with `auto()`, so they follow the declaration order
     below; only the member identity is meant to be relied upon.
     """

     # --- Punctuation ---
     LEFT_PAREN = auto()
     RIGHT_PAREN = auto()
     COLON = auto()
     COMMA = auto()
     UNDERSCORE = auto()

     # --- Operators ---
     PLUS = auto()

     # --- Literals ---
     IDENTIFIER = auto()
     NUMBER = auto()
     TRUE = auto()
     FALSE = auto()
     NONE = auto()

     # --- Misc ---
     COMMENT = auto()
     WHITESPACE = auto()
     EOF = auto()
     NEWLINE = auto()
@dataclass(frozen=True)
class Token:
    """A scanned token

    Instances are immutable (frozen dataclass). The field order is part of the
    interface: tokens may be built positionally as
    `Token(type, lexeme, value, position)`.
    """

    # Kind of token
    type: TokenType
    # Source text the token was scanned from
    lexeme: str
    # Optional value attached to the token (None when there is none)
    value: Any
    # Position associated with the token
    position: Position