feat: add basic lexer for punctuation

This commit is contained in:
2026-02-05 02:59:20 +01:00
parent 27481e36bc
commit 225c467b5a
5 changed files with 189 additions and 0 deletions

12
main.py Normal file
View File

@@ -0,0 +1,12 @@
from src.lexer import Lexer
from src.token import Token
def main():
    """Demo driver: lex a sample string of punctuation and print the tokens."""
    text: str = """(),{;:}.."""
    scanner: Lexer = Lexer()
    result: list[Token] = scanner.process(text)
    print(result)


if __name__ == '__main__':
    main()

0
src/__init__.py Normal file
View File

91
src/lexer.py Normal file
View File

@@ -0,0 +1,91 @@
from typing import Optional, Any
from src.position import Position
from src.token import Token, TokenType
class Lexer:
    """Single-pass scanner that turns raw source text into a list of Tokens."""

    # Single-character punctuation lexemes and the token types they produce.
    _PUNCTUATION = {
        "(": TokenType.LEFT_PAREN,
        ")": TokenType.RIGHT_PAREN,
        "{": TokenType.LEFT_BRACE,
        "}": TokenType.RIGHT_BRACE,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        ";": TokenType.SEMICOLON,
        ":": TokenType.COLON,
    }

    def __init__(self):
        self.path: str = "<main>"
        self.source: str = ""
        self.tokens: list[Token] = []
        self.start: int = 0          # index where the current lexeme begins
        self.idx: int = 0            # index of the next unread character
        self.length: int = 0
        self.line: int = 0           # 0-based line of the cursor
        self.column: int = 0         # 0-based column of the cursor
        self.start_pos: Position = self.get_position()

    def process(self, source: str, path: Optional[str] = None) -> list[Token]:
        """Scan *source* and return its tokens, terminated by an EOF token.

        Raises SyntaxError on the first character that is not recognized.
        """
        self.path = path or "<main>"
        self.source = source
        self.tokens = []
        self.start = 0
        self.idx = 0
        self.length = len(source)
        self.line = 0
        self.column = 0
        while not self.is_at_end():
            # Snapshot where the next lexeme begins before consuming it.
            self.start_pos = self.get_position()
            self.start = self.idx
            self.scan_token()
        self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
        return self.tokens

    def is_at_end(self) -> bool:
        """True once every character of the source has been consumed."""
        return self.idx >= self.length

    def get_position(self) -> Position:
        """Snapshot the cursor location as an immutable Position."""
        return Position(self.path, self.line, self.column)

    def peek(self) -> str:
        """Return the current character without consuming it ('' at EOF)."""
        return self.source[self.idx] if self.idx < self.length else ""

    def advance(self) -> str:
        """Consume and return the current character, updating line/column."""
        char = self.peek()
        self.idx += 1
        self.column += 1
        if char == "\n":
            self.line += 1
            self.column = 0
        return char

    def match(self, expected: str) -> bool:
        """Consume the current character only if it equals *expected*."""
        if self.peek() != expected:
            return False
        self.advance()
        return True

    def add_token(self, token_type: TokenType, value: Optional[Any] = None):
        """Append a token whose lexeme spans self.start..self.idx."""
        self.tokens.append(
            Token(
                position=self.start_pos,
                type=token_type,
                lexeme=self.source[self.start:self.idx],
                value=value,
            )
        )

    def scan_token(self):
        """Consume one character and emit the matching punctuation token."""
        char = self.advance()
        kind = self._PUNCTUATION.get(char)
        if kind is None:
            self.error("Unexpected character")
        self.add_token(kind)
        return None

    def error(self, msg: str):
        """Abort scanning with a SyntaxError anchored at the lexeme start."""
        raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")

11
src/position.py Normal file
View File

@@ -0,0 +1,11 @@
from dataclasses import dataclass
@dataclass(frozen=True)
class Position:
    """Immutable location of a character within a source file."""

    path: str    # file (or pseudo-file) the position refers to
    line: int    # 0-based line index
    column: int  # 0-based column index

    def __repr__(self):
        return "{}#{}:{}".format(self.path, self.line, self.column)

75
src/token.py Normal file
View File

@@ -0,0 +1,75 @@
from dataclasses import dataclass
from enum import Enum, auto
from typing import Any
from src.position import Position
class TokenType(Enum):
    """Closed set of lexical token categories emitted by the lexer.

    Members use auto(), so their numeric values follow declaration order —
    do not reorder or insert members without considering serialized values.
    """

    # Punctuation (the only group the lexer currently produces)
    LEFT_PAREN = auto()
    RIGHT_PAREN = auto()
    LEFT_BRACE = auto()
    RIGHT_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    SEMICOLON = auto()
    COLON = auto()
    # Operators (declared ahead of lexer support)
    PLUS = auto()
    PLUS_EQUAL = auto()
    MINUS = auto()
    MINUS_EQUAL = auto()
    SLASH = auto()
    SLASH_EQUAL = auto()
    STAR = auto()
    STAR_EQUAL = auto()
    EQUAL = auto()
    EQUAL_EQUAL = auto()
    BANG = auto()
    BANG_EQUAL = auto()
    GREATER = auto()
    GREATER_EQUAL = auto()
    LESS = auto()
    LESS_EQUAL = auto()
    # Literals
    IDENTIFIER = auto()
    STRING = auto()
    NUMBER = auto()
    TRUE = auto()
    FALSE = auto()
    # Keywords
    LET = auto()
    AND = auto()
    OR = auto()
    IF = auto()
    ELSE = auto()
    FOR = auto()
    WHILE = auto()
    FROM = auto()
    TO = auto()
    BY = auto()
    # Misc
    PRINT = auto()
    COMMENT = auto()
    WHITESPACE = auto()
    EOF = auto()
@dataclass
class Token:
    """A single lexeme paired with its category, literal value, and position."""

    type: TokenType      # lexical category of this token
    lexeme: str          # exact source text the token covers
    value: Any           # literal value, if any (None for pure punctuation)
    position: Position   # where the lexeme starts in the source

    def __repr__(self) -> str:
        # Render as "[NAME]" or "[NAME (value)]" when a literal is attached.
        parts = [f"[{self.type.name}"]
        if self.value is not None:
            parts.append(f" ({self.value})")
        parts.append("]")
        return "".join(parts)