feat: add basic lexer for punctuation

This commit is contained in:
2026-02-05 02:59:20 +01:00
parent 27481e36bc
commit 225c467b5a
5 changed files with 189 additions and 0 deletions

12
main.py Normal file
View File

@@ -0,0 +1,12 @@
from src.lexer import Lexer
from src.token import Token
def main():
    """Demo driver: lex a sample string of punctuation and print the tokens."""
    text: str = """(),{;:}.."""
    scanner: Lexer = Lexer()
    result: list[Token] = scanner.process(text)
    print(result)


if __name__ == '__main__':
    main()

0
src/__init__.py Normal file
View File

91
src/lexer.py Normal file
View File

@@ -0,0 +1,91 @@
from typing import Optional, Any
from src.position import Position
from src.token import Token, TokenType
class Lexer:
    """Single-pass scanner that turns raw source text into a list of Tokens."""

    # Single-character punctuation lexemes and the token types they produce.
    _PUNCTUATION = {
        "(": TokenType.LEFT_PAREN,
        ")": TokenType.RIGHT_PAREN,
        "{": TokenType.LEFT_BRACE,
        "}": TokenType.RIGHT_BRACE,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        ";": TokenType.SEMICOLON,
        ":": TokenType.COLON,
    }

    def __init__(self):
        self.path: str = "<main>"
        self.source: str = ""
        self.tokens: list[Token] = []
        self.start: int = 0          # index where the current lexeme begins
        self.idx: int = 0            # index of the next unread character
        self.length: int = 0
        self.line: int = 0           # 0-based line of the cursor
        self.column: int = 0         # 0-based column of the cursor
        self.start_pos: Position = self.get_position()

    def process(self, source: str, path: Optional[str] = None) -> list[Token]:
        """Scan *source* and return its tokens, terminated by an EOF token.

        Raises SyntaxError on the first character that is not recognized.
        """
        self.path = path or "<main>"
        self.source = source
        self.tokens = []
        self.start = 0
        self.idx = 0
        self.length = len(source)
        self.line = 0
        self.column = 0
        while not self.is_at_end():
            # Snapshot where the next lexeme begins before consuming it.
            self.start_pos = self.get_position()
            self.start = self.idx
            self.scan_token()
        self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
        return self.tokens

    def is_at_end(self) -> bool:
        """True once every character of the source has been consumed."""
        return self.idx >= self.length

    def get_position(self) -> Position:
        """Snapshot the cursor location as an immutable Position."""
        return Position(self.path, self.line, self.column)

    def peek(self) -> str:
        """Return the current character without consuming it ('' at EOF)."""
        return self.source[self.idx] if self.idx < self.length else ""

    def advance(self) -> str:
        """Consume and return the current character, updating line/column."""
        char = self.peek()
        self.idx += 1
        self.column += 1
        if char == "\n":
            self.line += 1
            self.column = 0
        return char

    def match(self, expected: str) -> bool:
        """Consume the current character only if it equals *expected*."""
        if self.peek() != expected:
            return False
        self.advance()
        return True

    def add_token(self, token_type: TokenType, value: Optional[Any] = None):
        """Append a token whose lexeme spans self.start..self.idx."""
        self.tokens.append(
            Token(
                position=self.start_pos,
                type=token_type,
                lexeme=self.source[self.start:self.idx],
                value=value,
            )
        )

    def scan_token(self):
        """Consume one character and emit the matching punctuation token."""
        char = self.advance()
        kind = self._PUNCTUATION.get(char)
        if kind is None:
            self.error("Unexpected character")
        self.add_token(kind)
        return None

    def error(self, msg: str):
        """Abort scanning with a SyntaxError anchored at the lexeme start."""
        raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")

11
src/position.py Normal file
View File

@@ -0,0 +1,11 @@
from dataclasses import dataclass
@dataclass(frozen=True)
class Position:
    """Immutable location of a character within a source file."""

    path: str    # file (or pseudo-file) the position refers to
    line: int    # 0-based line index
    column: int  # 0-based column index

    def __repr__(self):
        return "{}#{}:{}".format(self.path, self.line, self.column)

75
src/token.py Normal file
View File

@@ -0,0 +1,75 @@
from dataclasses import dataclass
from enum import Enum, auto
from typing import Any
from src.position import Position
class TokenType(Enum):
    """Closed set of lexical token categories emitted by the lexer.

    Members use auto(), so their numeric values follow declaration order —
    do not reorder or insert members without considering serialized values.
    """

    # Punctuation (the only group the lexer currently produces)
    LEFT_PAREN = auto()
    RIGHT_PAREN = auto()
    LEFT_BRACE = auto()
    RIGHT_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    SEMICOLON = auto()
    COLON = auto()
    # Operators (declared ahead of lexer support)
    PLUS = auto()
    PLUS_EQUAL = auto()
    MINUS = auto()
    MINUS_EQUAL = auto()
    SLASH = auto()
    SLASH_EQUAL = auto()
    STAR = auto()
    STAR_EQUAL = auto()
    EQUAL = auto()
    EQUAL_EQUAL = auto()
    BANG = auto()
    BANG_EQUAL = auto()
    GREATER = auto()
    GREATER_EQUAL = auto()
    LESS = auto()
    LESS_EQUAL = auto()
    # Literals
    IDENTIFIER = auto()
    STRING = auto()
    NUMBER = auto()
    TRUE = auto()
    FALSE = auto()
    # Keywords
    LET = auto()
    AND = auto()
    OR = auto()
    IF = auto()
    ELSE = auto()
    FOR = auto()
    WHILE = auto()
    FROM = auto()
    TO = auto()
    BY = auto()
    # Misc
    PRINT = auto()
    COMMENT = auto()
    WHITESPACE = auto()
    EOF = auto()
@dataclass
class Token:
    """A single lexeme paired with its category, literal value, and position."""

    type: TokenType      # lexical category of this token
    lexeme: str          # exact source text the token covers
    value: Any           # literal value, if any (None for pure punctuation)
    position: Position   # where the lexeme starts in the source

    def __repr__(self) -> str:
        # Render as "[NAME]" or "[NAME (value)]" when a literal is attached.
        parts = [f"[{self.type.name}"]
        if self.value is not None:
            parts.append(f" ({self.value})")
        parts.append("]")
        return "".join(parts)