From 9af843e802e4679b177c42510a873ab266d0e82c Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Sat, 7 Feb 2026 17:56:59 +0100 Subject: [PATCH] feat: add format spec lexer --- examples/basic/23_format_spec.peb | 9 +++ src/core/format_spec/__init__.py | 0 src/core/format_spec/lexer.py | 129 ++++++++++++++++++++++++++++++ src/core/format_spec/token.py | 50 ++++++++++++ src/token/lexer.py | 12 ++- src/token/token.py | 1 + 6 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 examples/basic/23_format_spec.peb create mode 100644 src/core/format_spec/__init__.py create mode 100644 src/core/format_spec/lexer.py create mode 100644 src/core/format_spec/token.py diff --git a/examples/basic/23_format_spec.peb b/examples/basic/23_format_spec.peb new file mode 100644 index 0000000..92f46d5 --- /dev/null +++ b/examples/basic/23_format_spec.peb @@ -0,0 +1,9 @@ +let a = 42 +print(f"int: {a:d}; hex: {a:h}; HEX: {a:H}; oct: {a:o}; bin: {a:b}") +let b = 1234567890 +print(f"{b:,}") +print(f"{b:_}") + +let pts = 19 +let total = 22 +print(f"Correct answers: {pts/total:.2%}") \ No newline at end of file diff --git a/src/core/format_spec/__init__.py b/src/core/format_spec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/format_spec/lexer.py b/src/core/format_spec/lexer.py new file mode 100644 index 0000000..e7b697a --- /dev/null +++ b/src/core/format_spec/lexer.py @@ -0,0 +1,129 @@ +from typing import Optional, Any, Callable + +from src.core.position import Position +from src.core.format_spec.token import Token, TokenType + + +class FormatSpecLexer: + def __init__(self, source: str, path: Optional[str] = None): + self.path: str = path or "
" + self.source: str = source + self.tokens: list[Token] = [] + self.start: int = 0 + self.idx: int = 0 + self.length: int = len(self.source) + self.line: int = 1 + self.column: int = 1 + self.start_pos: Position = self.get_position() + + def error(self, msg: str): + raise SyntaxError(f"[ERROR] Format spect error at {self.start_pos}: {msg}") + + def process(self) -> list[Token]: + self.scan_tokens() + self.tokens.append(Token(TokenType.EOF, "", None, self.get_position())) + return self.tokens + + def is_at_end(self) -> bool: + return self.idx >= self.length + + def get_position(self) -> Position: + return Position(self.path, self.line, self.column) + + def peek(self) -> str: + if self.idx < self.length: + return self.source[self.idx] + return "" + + def peek_next(self) -> str: + if self.idx + 1 < self.length: + return self.source[self.idx + 1] + return "" + + def advance(self) -> str: + char: str = self.peek() + self.idx += 1 + self.column += 1 + if char == "\n": + self.newline() + return char + + def newline(self): + self.line += 1 + self.column = 1 + + def update_start(self): + self.start_pos = self.get_position() + self.start = self.idx + + def add_token(self, token_type: TokenType, value: Optional[Any] = None): + lexeme: str = self.source[self.start:self.idx] + self.tokens.append( + Token( + position=self.start_pos, + type=token_type, + lexeme=lexeme, + value=value + ) + ) + + def scan_tokens(self, condition: Optional[Callable[[], bool]] = None): + if condition is None: + condition = lambda: True + while condition() and not self.is_at_end(): + self.update_start() + self.scan_token() + + def scan_token(self): + char: str = self.advance() + match char: + case "+": + self.add_token(TokenType.PLUS) + case "-": + self.add_token(TokenType.MINUS) + case " ": + self.add_token(TokenType.SPACE) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "s": + self.add_token(TokenType.T_STR) + case "b": + 
self.add_token(TokenType.T_BIN) + case "d": + self.add_token(TokenType.T_DEC) + case "o": + self.add_token(TokenType.T_OCT) + case "h" | "H": + self.add_token(TokenType.T_HEX) + case "e": + self.add_token(TokenType.T_SCI) + case "f": + self.add_token(TokenType.T_FIX) + case "%": + self.add_token(TokenType.T_PCT) + case "." if self.peek().isdigit(): + self.scan_number(True) + case _: + if char.isdigit(): + self.scan_number() + else: + self.error("Unexpected character") + return None + + def scan_number(self, decimal_only: bool = False): + while self.peek().isdigit(): + self.advance() + + if not decimal_only: + if self.peek() == "." and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value_str: str = self.source[self.start:self.idx] + if decimal_only: + value_str = f"0{value_str}" + value: float = float(value_str) + self.add_token(TokenType.NUMBER, value) diff --git a/src/core/format_spec/token.py b/src/core/format_spec/token.py new file mode 100644 index 0000000..a9bdcba --- /dev/null +++ b/src/core/format_spec/token.py @@ -0,0 +1,50 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any + +from src.core.position import Position + + +class TokenType(Enum): + # Sign + PLUS = auto() + MINUS = auto() + SPACE = auto() + + # Grouping + COMMA = auto() + UNDERSCORE = auto() + + # Type + ## Str + T_STR = auto() + + ## Int + T_BIN = auto() + T_DEC = auto() + T_OCT = auto() + T_HEX = auto() + + ## Float + T_SCI = auto() + T_FIX = auto() + T_PCT = auto() + + # Misc + NUMBER = auto() + EOF = auto() + + +@dataclass(frozen=True) +class Token: + type: TokenType + lexeme: str + value: Any + position: Position + + def __repr__(self) -> str: + res: str = f"[{self.type.name}" + if self.value is not None: + res += f" ({self.value!r})" + res += "]" + return res diff --git a/src/token/lexer.py b/src/token/lexer.py index b88b5f2..5f8598c 100644 --- a/src/token/lexer.py +++ b/src/token/lexer.py @@ -1,5 
+1,6 @@ from typing import Optional, Any, Callable +from src.core.format_spec.lexer import FormatSpecLexer from src.token.keyword import KEYWORDS from src.core.position import Position from src.token.token import Token, TokenType @@ -172,10 +173,19 @@ class Lexer: self.update_start() def scan_fstring_embed(self): - self.scan_tokens(lambda: self.peek() != "}") + self.scan_tokens(lambda: self.peek() != "}" and self.peek() != ":") if self.is_at_end(): self.error("Unterminated f-string embed") self.update_start() + + if self.match(":"): + self.update_start() + while self.peek() != "}": + self.advance() + format_spec_str: str = self.source[self.start:self.idx] + format_spec: list = FormatSpecLexer(format_spec_str, self.path).process() + self.add_token(TokenType.FORMAT_SPEC, format_spec) + self.update_start() self.advance() self.add_token(TokenType.RIGHT_BRACE) self.update_start() diff --git a/src/token/token.py b/src/token/token.py index 9090516..b8880f5 100644 --- a/src/token/token.py +++ b/src/token/token.py @@ -70,6 +70,7 @@ class TokenType(Enum): WHITESPACE = auto() EOF = auto() NEWLINE = auto() + FORMAT_SPEC = auto() @dataclass(frozen=True)