From 9af843e802e4679b177c42510a873ab266d0e82c Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Sat, 7 Feb 2026 17:56:59 +0100 Subject: [PATCH] feat: add format spec lexer --- examples/basic/23_format_spec.peb | 9 +++ src/core/format_spec/__init__.py | 0 src/core/format_spec/lexer.py | 129 ++++++++++++++++++++++++++++++ src/core/format_spec/token.py | 50 ++++++++++++ src/token/lexer.py | 12 ++- src/token/token.py | 1 + 6 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 examples/basic/23_format_spec.peb create mode 100644 src/core/format_spec/__init__.py create mode 100644 src/core/format_spec/lexer.py create mode 100644 src/core/format_spec/token.py diff --git a/examples/basic/23_format_spec.peb b/examples/basic/23_format_spec.peb new file mode 100644 index 0000000..92f46d5 --- /dev/null +++ b/examples/basic/23_format_spec.peb @@ -0,0 +1,9 @@ +let a = 42 +print(f"int: {a:d}; hex: {a:h}; HEX: {a:H}; oct: {a:o}; bin: {a:b}") +let b = 1234567890 +print(f"{b:,}") +print(f"{b:_}") + +let pts = 19 +let total = 22 +print(f"Correct answers: {pts/total:.2%}") \ No newline at end of file diff --git a/src/core/format_spec/__init__.py b/src/core/format_spec/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/format_spec/lexer.py b/src/core/format_spec/lexer.py new file mode 100644 index 0000000..e7b697a --- /dev/null +++ b/src/core/format_spec/lexer.py @@ -0,0 +1,129 @@ +from typing import Optional, Any, Callable + +from src.core.position import Position +from src.core.format_spec.token import Token, TokenType + + +class FormatSpecLexer: + def __init__(self, source: str, path: Optional[str] = None): + self.path: str = path or "
" + self.source: str = source + self.tokens: list[Token] = [] + self.start: int = 0 + self.idx: int = 0 + self.length: int = len(self.source) + self.line: int = 1 + self.column: int = 1 + self.start_pos: Position = self.get_position() + + def error(self, msg: str): + raise SyntaxError(f"[ERROR] Format spect error at {self.start_pos}: {msg}") + + def process(self) -> list[Token]: + self.scan_tokens() + self.tokens.append(Token(TokenType.EOF, "", None, self.get_position())) + return self.tokens + + def is_at_end(self) -> bool: + return self.idx >= self.length + + def get_position(self) -> Position: + return Position(self.path, self.line, self.column) + + def peek(self) -> str: + if self.idx < self.length: + return self.source[self.idx] + return "" + + def peek_next(self) -> str: + if self.idx + 1 < self.length: + return self.source[self.idx + 1] + return "" + + def advance(self) -> str: + char: str = self.peek() + self.idx += 1 + self.column += 1 + if char == "\n": + self.newline() + return char + + def newline(self): + self.line += 1 + self.column = 1 + + def update_start(self): + self.start_pos = self.get_position() + self.start = self.idx + + def add_token(self, token_type: TokenType, value: Optional[Any] = None): + lexeme: str = self.source[self.start:self.idx] + self.tokens.append( + Token( + position=self.start_pos, + type=token_type, + lexeme=lexeme, + value=value + ) + ) + + def scan_tokens(self, condition: Optional[Callable[[], bool]] = None): + if condition is None: + condition = lambda: True + while condition() and not self.is_at_end(): + self.update_start() + self.scan_token() + + def scan_token(self): + char: str = self.advance() + match char: + case "+": + self.add_token(TokenType.PLUS) + case "-": + self.add_token(TokenType.MINUS) + case " ": + self.add_token(TokenType.SPACE) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "s": + self.add_token(TokenType.T_STR) + case "b": + 
self.add_token(TokenType.T_BIN) + case "d": + self.add_token(TokenType.T_DEC) + case "o": + self.add_token(TokenType.T_OCT) + case "h" | "H": + self.add_token(TokenType.T_HEX) + case "e": + self.add_token(TokenType.T_SCI) + case "f": + self.add_token(TokenType.T_FIX) + case "%": + self.add_token(TokenType.T_PCT) + case "." if self.peek().isdigit(): + self.scan_number(True) + case _: + if char.isdigit(): + self.scan_number() + else: + self.error("Unexpected character") + return None + + def scan_number(self, decimal_only: bool = False): + while self.peek().isdigit(): + self.advance() + + if not decimal_only: + if self.peek() == "." and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value_str: str = self.source[self.start:self.idx] + if decimal_only: + value_str = f"0{value_str}" + value: float = float(value_str) + self.add_token(TokenType.NUMBER, value) diff --git a/src/core/format_spec/token.py b/src/core/format_spec/token.py new file mode 100644 index 0000000..a9bdcba --- /dev/null +++ b/src/core/format_spec/token.py @@ -0,0 +1,50 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any + +from src.core.position import Position + + +class TokenType(Enum): + # Sign + PLUS = auto() + MINUS = auto() + SPACE = auto() + + # Grouping + COMMA = auto() + UNDERSCORE = auto() + + # Type + ## Str + T_STR = auto() + + ## Int + T_BIN = auto() + T_DEC = auto() + T_OCT = auto() + T_HEX = auto() + + ## Float + T_SCI = auto() + T_FIX = auto() + T_PCT = auto() + + # Misc + NUMBER = auto() + EOF = auto() + + +@dataclass(frozen=True) +class Token: + type: TokenType + lexeme: str + value: Any + position: Position + + def __repr__(self) -> str: + res: str = f"[{self.type.name}" + if self.value is not None: + res += f" ({self.value!r})" + res += "]" + return res diff --git a/src/token/lexer.py b/src/token/lexer.py index b88b5f2..5f8598c 100644 --- a/src/token/lexer.py +++ b/src/token/lexer.py @@ -1,5 
+1,6 @@ from typing import Optional, Any, Callable +from src.core.format_spec.lexer import FormatSpecLexer from src.token.keyword import KEYWORDS from src.core.position import Position from src.token.token import Token, TokenType @@ -172,10 +173,19 @@ class Lexer: self.update_start() def scan_fstring_embed(self): - self.scan_tokens(lambda: self.peek() != "}") + self.scan_tokens(lambda: self.peek() != "}" and self.peek() != ":") if self.is_at_end(): self.error("Unterminated f-string embed") self.update_start() + + if self.match(":"): + self.update_start() + while self.peek() != "}": + self.advance() + format_spec_str: str = self.source[self.start:self.idx] + format_spec: list = FormatSpecLexer(format_spec_str, self.path).process() + self.add_token(TokenType.FORMAT_SPEC, format_spec) + self.update_start() self.advance() self.add_token(TokenType.RIGHT_BRACE) self.update_start() diff --git a/src/token/token.py b/src/token/token.py index 9090516..b8880f5 100644 --- a/src/token/token.py +++ b/src/token/token.py @@ -70,6 +70,7 @@ class TokenType(Enum): WHITESPACE = auto() EOF = auto() NEWLINE = auto() + FORMAT_SPEC = auto() @dataclass(frozen=True)