diff --git a/main.py b/main.py
new file mode 100644
index 0000000..173ca3c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,16 @@
+"""Entry point: run the lexer over a small punctuation-only sample."""
+
+from src.lexer import Lexer
+from src.token import Token
+
+
+def main() -> None:
+    """Tokenise a hard-coded sample string and print the resulting tokens."""
+    source: str = """(),{;:}.."""
+    lexer: Lexer = Lexer()
+    tokens: list[Token] = lexer.process(source)
+    print(tokens)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/lexer.py b/src/lexer.py
new file mode 100644
index 0000000..3ae9fe3
--- /dev/null
+++ b/src/lexer.py
@@ -0,0 +1,138 @@
+"""Single-pass lexer that converts source text into a list of tokens."""
+
+from typing import Optional, Any
+
+from src.position import Position
+from src.token import Token, TokenType
+
+# NOTE(review): the placeholder path literal was unreadable in the original
+# patch; "<unknown>" is a stand-in -- confirm the intended value.
+DEFAULT_PATH: str = "<unknown>"
+
+# Characters that always form a complete token on their own.
+SINGLE_CHAR_TOKENS: dict[str, TokenType] = {
+    "(": TokenType.LEFT_PAREN,
+    ")": TokenType.RIGHT_PAREN,
+    "{": TokenType.LEFT_BRACE,
+    "}": TokenType.RIGHT_BRACE,
+    ",": TokenType.COMMA,
+    ".": TokenType.DOT,
+    ";": TokenType.SEMICOLON,
+    ":": TokenType.COLON,
+}
+
+
+class Lexer:
+    """Scan a source string one character at a time and emit tokens.
+
+    Lines and columns are tracked zero-based.  An instance can be reused:
+    every call to ``process`` resets the scanning state.
+    """
+
+    def __init__(self):
+        self.path: str = DEFAULT_PATH
+        self.source: str = ""
+        self.tokens: list[Token] = []
+        self.start: int = 0  # first character of the current lexeme
+        self.idx: int = 0  # next character to read
+        self.length: int = 0
+        self.line: int = 0
+        self.column: int = 0
+        self.start_pos: Position = self.get_position()
+
+    def process(self, source: str, path: Optional[str] = None) -> list[Token]:
+        """Tokenise ``source`` and return the tokens, ending with an EOF token.
+
+        Args:
+            source: Text to tokenise.
+            path: Name reported in token positions; falls back to
+                ``DEFAULT_PATH`` when omitted or empty.
+
+        Raises:
+            SyntaxError: If a character does not start any known token.
+        """
+        self.path = path or DEFAULT_PATH
+        self.source = source
+        self.tokens = []
+        self.start = 0
+        self.idx = 0
+        self.length = len(self.source)
+        self.line = 0
+        self.column = 0
+
+        while not self.is_at_end():
+            # Remember where the lexeme begins before consuming anything.
+            self.start_pos = self.get_position()
+            self.start = self.idx
+            self.scan_token()
+
+        self.tokens.append(Token(TokenType.EOF, "", None, self.get_position()))
+
+        return self.tokens
+
+    def is_at_end(self) -> bool:
+        """Return True once the whole source has been consumed."""
+        return self.idx >= self.length
+
+    def get_position(self) -> Position:
+        """Return the current path, line and column as a ``Position``."""
+        return Position(self.path, self.line, self.column)
+
+    def peek(self) -> str:
+        """Return the next character unconsumed, or an empty string at the end."""
+        if self.is_at_end():
+            return ""
+        return self.source[self.idx]
+
+    def advance(self) -> str:
+        """Consume and return the next character, updating line and column.
+
+        Returns an empty string and leaves the cursor untouched when the input
+        is exhausted, so the index and column never run past the end.
+        """
+        if self.is_at_end():
+            return ""
+        char: str = self.source[self.idx]
+        self.idx += 1
+        if char == "\n":
+            self.line += 1
+            self.column = 0
+        else:
+            self.column += 1
+        return char
+
+    def match(self, expected: str) -> bool:
+        """Consume the next character only if it equals ``expected``."""
+        if self.peek() == expected:
+            self.advance()
+            return True
+        return False
+
+    def add_token(self, token_type: TokenType, value: Optional[Any] = None):
+        """Append a token whose lexeme is the text scanned since ``start``."""
+        lexeme: str = self.source[self.start:self.idx]
+        self.tokens.append(
+            Token(
+                position=self.start_pos,
+                type=token_type,
+                lexeme=lexeme,
+                value=value
+            )
+        )
+
+    def scan_token(self):
+        """Consume one character and emit the matching token.
+
+        Raises:
+            SyntaxError: If the character does not start any known token.
+        """
+        char: str = self.advance()
+        token_type: Optional[TokenType] = SINGLE_CHAR_TOKENS.get(char)
+        if token_type is None:
+            self.error(f"Unexpected character {char!r}")
+        else:
+            self.add_token(token_type)
+
+    def error(self, msg: str):
+        """Raise a SyntaxError located at the current lexeme's start."""
+        raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}")
diff --git a/src/position.py b/src/position.py
new file mode 100644
index 0000000..bc199dd
--- /dev/null
+++ b/src/position.py
@@ -0,0 +1,20 @@
+"""Source location attached to tokens and error messages."""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Position:
+    """Immutable location within a source text.
+
+    ``line`` and ``column`` are stored exactly as given; nothing here
+    converts or validates them.
+    """
+
+    path: str
+    line: int
+    column: int
+
+    def __repr__(self) -> str:
+        """Format as ``path#line:column``."""
+        return f"{self.path}#{self.line}:{self.column}"
diff --git a/src/token.py b/src/token.py
new file mode 100644
index 0000000..3985f3c
--- /dev/null
+++ b/src/token.py
@@ -0,0 +1,80 @@
+"""Token kinds and the token record produced by the lexer."""
+
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any
+
+from src.position import Position
+
+
+class TokenType(Enum):
+    """Kinds of token, grouped by category."""
+
+    # Punctuation
+    LEFT_PAREN = auto()
+    RIGHT_PAREN = auto()
+    LEFT_BRACE = auto()
+    RIGHT_BRACE = auto()
+    COMMA = auto()
+    DOT = auto()
+    SEMICOLON = auto()
+    COLON = auto()
+
+    # Operators
+    PLUS = auto()
+    PLUS_EQUAL = auto()
+    MINUS = auto()
+    MINUS_EQUAL = auto()
+    SLASH = auto()
+    SLASH_EQUAL = auto()
+    STAR = auto()
+    STAR_EQUAL = auto()
+    EQUAL = auto()
+    EQUAL_EQUAL = auto()
+    BANG = auto()
+    BANG_EQUAL = auto()
+    GREATER = auto()
+    GREATER_EQUAL = auto()
+    LESS = auto()
+    LESS_EQUAL = auto()
+
+    # Literals
+    IDENTIFIER = auto()
+    STRING = auto()
+    NUMBER = auto()
+    TRUE = auto()
+    FALSE = auto()
+
+    # Keywords
+    LET = auto()
+    AND = auto()
+    OR = auto()
+    IF = auto()
+    ELSE = auto()
+    FOR = auto()
+    WHILE = auto()
+    FROM = auto()
+    TO = auto()
+    BY = auto()
+
+    # Misc
+    PRINT = auto()
+    COMMENT = auto()
+    WHITESPACE = auto()
+    EOF = auto()
+
+
+@dataclass
+class Token:
+    """A single lexeme with its kind, literal value and source position."""
+
+    type: TokenType
+    lexeme: str
+    value: Any
+    position: Position
+
+    def __repr__(self) -> str:
+        """Format as ``[NAME]``, or ``[NAME (value)]`` when a value is set."""
+        if self.value is None:
+            return f"[{self.type.name}]"
+        return f"[{self.type.name} ({self.value})]"