feat: add basic lexer for punctuation

2026-02-05 02:59:20 +01:00
parent 27481e36bc
commit 225c467b5a
5 changed files with 189 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,12 @@
 from src.lexer import Lexer
 from src.token import Token
 def main():
     """Lex a hard-coded punctuation sample and print the resulting tokens."""
     sample = "(),{;:}.."
     print(Lexer().process(sample))
 # Run the demo only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
    main()
--- a/src/__init__.py
+++ b/src/__init__.py
--- a/src/lexer.py
+++ b/src/lexer.py
@@ -0,0 +1,91 @@
 from typing import Optional, Any
 from src.position import Position
 from src.token import Token, TokenType
 class Lexer:
     """Scanner that converts source text into a flat list of tokens.

     Only the single-character punctuation tokens are recognised so far; any
     other character, whitespace included, raises ``SyntaxError``. Line and
     column numbers are zero-based. An instance can be reused: every call to
     ``process`` starts again from a clean state.
     """

     # Single-character lexemes and the token type each one produces.
     _PUNCTUATION: dict[str, TokenType] = {
         "(": TokenType.LEFT_PAREN,
         ")": TokenType.RIGHT_PAREN,
         "{": TokenType.LEFT_BRACE,
         "}": TokenType.RIGHT_BRACE,
         ",": TokenType.COMMA,
         ".": TokenType.DOT,
         ";": TokenType.SEMICOLON,
         ":": TokenType.COLON,
     }

     def __init__(self):
         """Create a lexer holding an empty source named ``"<main>"``."""
         self._reset("", None)

     def _reset(self, source: str, path: Optional[str]) -> None:
         """Install ``source`` and rewind every piece of scanning state.

         Shared by ``__init__`` and ``process`` so the two cannot drift apart.
         """
         self.path: str = path or "<main>"
         self.source: str = source
         self.tokens: list[Token] = []
         self.start: int = 0   # index of the first character of the current lexeme
         self.idx: int = 0     # index of the next character to consume
         self.length: int = len(source)
         self.line: int = 0
         self.column: int = 0
         # Must come last: get_position() reads path, line and column.
         self.start_pos: Position = self.get_position()

     def process(self, source: str, path: Optional[str] = None) -> list[Token]:
         """Tokenize ``source`` and return the tokens, terminated by an EOF token.

         Args:
             source: Text to scan.
             path: Name recorded in every token position; falls back to
                 ``"<main>"`` when omitted or empty.

         Returns:
             The lexer's own ``tokens`` list; its last element is always an
             ``EOF`` token positioned at the end of the input.

         Raises:
             SyntaxError: If a character that starts no known token is found.
         """
         self._reset(source, path)

         while not self.is_at_end():
             # Remember where the lexeme begins before consuming anything.
             self.start_pos = self.get_position()
             self.start = self.idx
             self.scan_token()

         self.tokens.append(
             Token(
                 type=TokenType.EOF,
                 lexeme="",
                 value=None,
                 position=self.get_position(),
             )
         )
         return self.tokens

     def is_at_end(self) -> bool:
         """Return True once every character of the source has been consumed."""
         return self.idx >= self.length

     def get_position(self) -> Position:
         """Return the current path/line/column as a ``Position``."""
         return Position(self.path, self.line, self.column)

     def peek(self) -> str:
         """Return the next character without consuming it, or ``""`` at the end."""
         if self.idx < self.length:
             return self.source[self.idx]
         return ""

     def advance(self) -> str:
         """Consume and return the next character, updating line and column.

         At the end of the input this returns ``""`` and leaves all counters
         untouched, so the index and column never run past the source.
         """
         if self.is_at_end():
             return ""

         char: str = self.source[self.idx]
         self.idx += 1
         if char == "\n":
             self.line += 1
             self.column = 0
         else:
             self.column += 1
         return char

     def match(self, expected: str) -> bool:
         """Consume the next character only if it equals ``expected``.

         Returns True when the character was consumed. Always returns False at
         the end of the input, where there is nothing left to consume.
         """
         if self.is_at_end() or self.source[self.idx] != expected:
             return False
         self.advance()
         return True

     def add_token(self, token_type: TokenType, value: Optional[Any] = None):
         """Append a token covering the source between ``start`` and ``idx``."""
         lexeme: str = self.source[self.start:self.idx]
         self.tokens.append(
             Token(
                 position=self.start_pos,
                 type=token_type,
                 lexeme=lexeme,
                 value=value
             )
         )

     def scan_token(self) -> None:
         """Consume one character and emit the punctuation token it denotes.

         Raises:
             SyntaxError: If the character is not a known punctuation mark.
         """
         char: str = self.advance()
         token_type = self._PUNCTUATION.get(char)
         if token_type is None:
             # Name the offending character so the error is actionable.
             self.error(f"Unexpected character {char!r}")
         else:
             self.add_token(token_type)

     def error(self, msg: str):
         """Raise ``SyntaxError`` for ``msg`` at the start of the current lexeme."""
         raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")
--- a/src/position.py
+++ b/src/position.py
@@ -0,0 +1,11 @@
 from dataclasses import dataclass
@dataclass(frozen=True)
class Position:
    """Immutable location in a source text: path, line and column."""

    path: str
    line: int
    column: int

    def __repr__(self):
        """Render the position as ``path#line:column``."""
        return "{}#{}:{}".format(self.path, self.line, self.column)
--- a/src/token.py
+++ b/src/token.py
@@ -0,0 +1,75 @@
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Any
 from src.position import Position
 class TokenType(Enum):
     """Every kind of token the language defines.

     Members take consecutive ``auto()`` values in declaration order, so the
     order below is significant.
     """

     # --- Punctuation: single-character delimiters ---
     LEFT_PAREN = auto()
     RIGHT_PAREN = auto()
     LEFT_BRACE = auto()
     RIGHT_BRACE = auto()
     COMMA = auto()
     DOT = auto()
     SEMICOLON = auto()
     COLON = auto()

     # --- Operators: each symbol is followed by its ``=``-suffixed form ---
     PLUS = auto()
     PLUS_EQUAL = auto()
     MINUS = auto()
     MINUS_EQUAL = auto()
     SLASH = auto()
     SLASH_EQUAL = auto()
     STAR = auto()
     STAR_EQUAL = auto()
     EQUAL = auto()
     EQUAL_EQUAL = auto()
     BANG = auto()
     BANG_EQUAL = auto()
     GREATER = auto()
     GREATER_EQUAL = auto()
     LESS = auto()
     LESS_EQUAL = auto()

     # --- Literals ---
     IDENTIFIER = auto()
     STRING = auto()
     NUMBER = auto()
     TRUE = auto()
     FALSE = auto()

     # --- Keywords ---
     LET = auto()
     AND = auto()
     OR = auto()
     IF = auto()
     ELSE = auto()
     FOR = auto()
     WHILE = auto()
     FROM = auto()
     TO = auto()
     BY = auto()

     # --- Miscellaneous ---
     PRINT = auto()
     COMMENT = auto()
     WHITESPACE = auto()
     EOF = auto()
@dataclass
class Token:
    """One lexical unit: its kind, source text, optional value and position."""

    type: TokenType
    lexeme: str
    value: Any
    position: Position

    def __repr__(self) -> str:
        """Return ``[NAME]``, or ``[NAME (value)]`` when a value is attached."""
        if self.value is None:
            return f"[{self.type.name}]"
        return f"[{self.type.name} ({self.value})]"