From 3cf3011160eda0475e1fea9ee80d9a8c02360cb3 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 14:36:51 +0200 Subject: [PATCH 01/41] feat: add some syntax examples --- .../00_syntax_prototype/01_simple_types.py | 16 +++++++++ .../00_syntax_prototype/02_custom_types.midas | 24 +++++++++++++ .../00_syntax_prototype/02_custom_types.py | 34 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 examples/00_syntax_prototype/01_simple_types.py create mode 100644 examples/00_syntax_prototype/02_custom_types.midas create mode 100644 examples/00_syntax_prototype/02_custom_types.py diff --git a/examples/00_syntax_prototype/01_simple_types.py b/examples/00_syntax_prototype/01_simple_types.py new file mode 100644 index 0000000..cfb10aa --- /dev/null +++ b/examples/00_syntax_prototype/01_simple_types.py @@ -0,0 +1,16 @@ +# type: ignore +# ruff: disable[F821] +from __future__ import annotations + +# A simple data-frame with different column of various simple types +# Columns can be named and/or typed +df: Frame[ + verified: bool, + birth_year: int, + height: float, + name: str, + date: datetime, + float, # unnamed + unknown: _, # untyped + _ # unnamed and untyped +] diff --git a/examples/00_syntax_prototype/02_custom_types.midas b/examples/00_syntax_prototype/02_custom_types.midas new file mode 100644 index 0000000..ba8b758 --- /dev/null +++ b/examples/00_syntax_prototype/02_custom_types.midas @@ -0,0 +1,24 @@ +// Simple custom type derived from floats +type Latitude +type Longitude + +// Complex custom type, containing two values accessible through properties +type GeoLocation { + lat: Latitude + lon: Longitude +} + +type LatitudeDiff +type LongitudeDiff + +// Simple operation defined on our custom types +op - = +op - = + +// Simple custom type with a constraint +type Age + +// Predefined custom constraints that can be referenced in other definitions +constraint Positive = _ >= 0 +constraint StrictlyPositive = _ > 0 +constraint Even = _ % 2 
== 0 \ No newline at end of file diff --git a/examples/00_syntax_prototype/02_custom_types.py b/examples/00_syntax_prototype/02_custom_types.py new file mode 100644 index 0000000..0297058 --- /dev/null +++ b/examples/00_syntax_prototype/02_custom_types.py @@ -0,0 +1,34 @@ +# type: ignore +# ruff: disable[F821] +from __future__ import annotations + +# Prototype of custom type import to use valid Python syntax +import midas +midas.using("02_custom_types.midas") + +# A data-frame using a custom type +df: Frame[ + location: GeoLocation +] + +# Properties of a type can be used on a column of that type +lat: Column[GeoLocation] = df["location"].lat +lon: Column[GeoLocation] = df["location"].lon + +# Unregistered operations between types are not permitted +lat + lon # Invalid operation + +# Registered operations are permitted +lat1: Latitude = lat[0] +lat2: Latitude = lat[1] +lat_diff: LatitudeDiff = lat2 - lat1 # Valid operation + +# In addition to the type, a column can have one or more constraints, either defined inline or in a separate file +df2: Frame[ + age: int + (_ >= 0), + height: float + (_ >= 0), +] +df2_bis: Frame[ + age: int + Positive, + height: float + Positive, +] From 9b593066042f6a9bd5cde81d46930bf11eb5598e Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 14:37:43 +0200 Subject: [PATCH 02/41] feat: add vscode extension for basic syntax highlighting --- .gitignore | 1 + vscode-ext/language-configurations.json | 19 +++ vscode-ext/package.json | 33 ++++++ vscode-ext/syntaxes/midas.tmLanguage.json | 135 ++++++++++++++++++++++ 4 files changed, 188 insertions(+) create mode 100644 vscode-ext/language-configurations.json create mode 100644 vscode-ext/package.json create mode 100644 vscode-ext/syntaxes/midas.tmLanguage.json diff --git a/.gitignore b/.gitignore index e69de29..600d2d3 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +.vscode \ No newline at end of file diff --git a/vscode-ext/language-configurations.json 
b/vscode-ext/language-configurations.json new file mode 100644 index 0000000..ffd5219 --- /dev/null +++ b/vscode-ext/language-configurations.json @@ -0,0 +1,19 @@ +{ + "brackets": [ + ["{", "}"], + ["[", "]"], + ["<", ">"] + ], + "autoClosingPairs": [ + { "open": "{", "close": "}" }, + { "open": "[", "close": "]" }, + { "open": "(", "close": ")" }, + { "open": "<", "close": ">" } + ], + "surroundingPairs": [ + ["{", "}"], + ["[", "]"], + ["(", ")"], + ["<", ">"] + ] +} \ No newline at end of file diff --git a/vscode-ext/package.json b/vscode-ext/package.json new file mode 100644 index 0000000..bd2c40b --- /dev/null +++ b/vscode-ext/package.json @@ -0,0 +1,33 @@ +{ + "name": "midas", + "version": "0.1.0", + "engines": { + "vscode": "*" + }, + "categories": ["Programming Languages"], + "contributes": { + "languages": [ + { + "id": "midas", + "extensions": [ + ".mpy", + ".midas" + ], + "aliases": [ + "Midas" + ], + "configuration": "./language-configuration.json" + } + ], + "grammars": [ + { + "language": "midas", + "scopeName": "source.midas", + "path": "./syntaxes/midas.tmLanguage.json", + "embeddedLanguages": { + "meta.embedded.block.python": "python" + } + } + ] + } +} \ No newline at end of file diff --git a/vscode-ext/syntaxes/midas.tmLanguage.json b/vscode-ext/syntaxes/midas.tmLanguage.json new file mode 100644 index 0000000..44745b0 --- /dev/null +++ b/vscode-ext/syntaxes/midas.tmLanguage.json @@ -0,0 +1,135 @@ +{ + "$schema": "https://raw.githubusercontent.com/martinring/tmlanguage/master/tmlanguage.json", + "name": "Midas", + "scopeName": "source.midas", + "patterns": [{ "include": "#statement" }], + "repository": { + "comment": { + "begin": "(//)", + "end": "($)", + "name": "comment.line", + "beginCaptures": { + "1": { + "name": "comment.line.double-dash" + } + } + }, + "type-def": { + "begin": "\\b(type)\\s+([a-zA-Z_][a-zA-Z_\\d]*)", + "end": "$", + "beginCaptures": { + "1": { + "name": "keyword.control.type.midas" + }, + "2": { + "name" : "variable.name" 
+ } + }, + "patterns": [ + { "include": "#type-base" }, + { "include": "#type-body" } + ] + }, + "type-base": { + "begin": "<", + "end": ">", + "beginCaptures": { + "0": { + "name": "punctuation.definition.base.begin.midas" + } + }, + "endCaptures": { + "0": { + "name": "punctuation.definition.base.end.midas" + } + }, + "patterns": [ + {"include": "source.python"} + ] + }, + "type-body": { + "begin": "\\{", + "end": "\\}", + "beginCaptures": { + "0": { + "name": "punctuation.definition.type-body.begin.midas" + } + }, + "endCaptures": { + "0": { + "name": "punctuation.definition.type-body.end.midas" + } + }, + "patterns": [ + {"include": "#type-prop"} + ] + }, + "type-prop": { + "match": "([a-zA-Z_][a-zA-Z_\\d]*)(:)\\s*([a-zA-Z_][a-zA-Z_\\d]*)", + "captures": { + "1": { + "name": "variable.name" + }, + "2": { + "name": "punctuation.separator.annotation.midas" + }, + "3": { + "name": "meta.type.name" + } + } + }, + "op-def": { + "match": "\\b(op)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>\\s+(\\S+)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>\\s+(=)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>", + "captures": { + "1": { + "name": "keyword.control.op.midas" + }, + "2": { + "name" : "variable.name" + }, + "3": { + "name" : "keyword.operator" + }, + "4": { + "name" : "variable.name" + }, + "5": { + "name" : "keyword.operator.assignment" + }, + "6": { + "name" : "variable.name" + } + }, + "patterns": [ + { "include": "#type-base" }, + { "include": "#type-body" } + ] + }, + "constr-def": { + "begin": "(constraint)\\s+([a-zA-Z_][a-zA-Z_\\d]*)\\s*(=)", + "end": "$", + "beginCaptures": { + "1": { + "name": "keyword.control.constr.midas" + }, + "2": { + "name": "variable.name" + }, + "3": { + "name": "keyword.operator.assignment" + } + }, + "patterns": [ + { "include": "source.python" } + ] + }, + "statement": { + "patterns": [ + { "include": "#comment" }, + { "include": "#type-def" }, + { "include": "#op-def" }, + { "include": "#constr-def" } + ] + } + } +} \ No newline at end of file From 
fedc582e16a4055b7060728f5ff4f3f69f6a4147 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 19:17:55 +0200 Subject: [PATCH 03/41] feat(parser): add base lexer class the lexer and token structures were adapted from another project (see docstring on the Lexer class) --- .gitignore | 7 +- lexer/__init__.py | 0 lexer/base.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++ lexer/position.py | 13 ++++ lexer/token.py | 39 +++++++++++ 5 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 lexer/__init__.py create mode 100644 lexer/base.py create mode 100644 lexer/position.py create mode 100644 lexer/token.py diff --git a/.gitignore b/.gitignore index 600d2d3..f63541d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ -.vscode \ No newline at end of file +.vscode +__pycache__ +.env +venv +.venv +*.pyc \ No newline at end of file diff --git a/lexer/__init__.py b/lexer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lexer/base.py b/lexer/base.py new file mode 100644 index 0000000..1104e7a --- /dev/null +++ b/lexer/base.py @@ -0,0 +1,166 @@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Optional + +from lexer.position import Position +from lexer.token import Token, TokenType + + +class Lexer(ABC): + """An abstract lexer which provides methods to easily extend it into a concrete one + + This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom, + more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble) + + [1]: https://craftinginterpreters.com/ + """ + + def __init__(self, source: str, file: Optional[str] = None) -> None: + """Create a new lexer to scan for tokens in the given source + + Args: + source (str): the source to scan + file (Optional[str], optional): the path of the given source. Can be a file path or any string identifier. Defaults to None. 
+ """ + self.source: str = source + self.file: Optional[str] = file + self.tokens: list[Token] = [] + self.start: int = 0 + self.idx: int = 0 + self.length: int = len(self.source) + self.line: int = 1 + self.column: int = 1 + self.start_pos: Position = self.get_position() + + def error(self, msg: str): + """Raise a syntax error + + Args: + msg (str): the error message + + Raises: + SyntaxError + """ + raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}") + + def process(self) -> list[Token]: + """Scan tokens out of the source text + + Returns: + list[Token]: all the tokens that could be scanned + + Raises: + SyntaxError: if a syntax error is found + """ + self.scan_tokens() + self.tokens.append(Token(TokenType.EOF, "", None, self.get_position())) + return self.tokens + + def is_at_end(self) -> bool: + """Whether the lexer is at the end of the source + + Returns: + bool: True if the current index is at the end of the source + """ + return self.idx >= self.length + + def get_position(self) -> Position: + """Get the current position + + Returns: + Position: the current position + """ + return Position(file=self.file, line=self.line, column=self.column) + + def peek(self) -> str: + """Get the current character without advancing, if any + + Returns: + str: the current character, or an empty string if at EOF + """ + if self.idx < self.length: + return self.source[self.idx] + return "" + + def peek_next(self) -> str: + """Get the next character without advancing, if any + + Returns: + str: the next character, or an empty string if at EOF + """ + if self.idx + 1 < self.length: + return self.source[self.idx + 1] + return "" + + def advance(self) -> str: + """Get the new character and advance + + Returns: + str: the current character, before advancing + """ + char: str = self.peek() + self.idx += 1 + self.column += 1 + if char == "\n": + self.newline() + return char + + def newline(self): + """Update the current position after encountering a newline character""" + 
self.line += 1 + self.column = 1 + + def match(self, expected: str) -> bool: + """Consume the next character if it matches the given value + + Args: + expected (str): the expected character + + Returns: + bool: whether a character was matched and consumed + """ + if self.peek() == expected: + self.advance() + return True + return False + + def update_start(self): + """Update the starting position of the current lexeme + + The cursor marking the start of the lexeme currently being scanned is + moved to the current position + """ + self.start_pos = self.get_position() + self.start = self.idx + + def add_token(self, token_type: TokenType, value: Optional[Any] = None): + """Add the current lexeme to the list of scanned tokens + + Args: + token_type (TokenType): the type of token to add + value (Optional[Any], optional): the value of the token (useful for numbers or constants). Defaults to None. + """ + lexeme: str = self.source[self.start : self.idx] + self.tokens.append( + Token(position=self.start_pos, type=token_type, lexeme=lexeme, value=value) + ) + + def scan_tokens(self, condition: Optional[Callable[[], bool]] = None): + """Scan tokens until EOF is reached or the given condition becomes False + + Args: + condition (Optional[Callable[[], bool]], optional): the condition to continue scanning tokens. + If None, defaults to always being True, effectively scanning tokens until EOF is reached. Defaults to None. 
+ """ + if condition is None: + condition = lambda: True # noqa: E731 + while condition() and not self.is_at_end(): + self.update_start() + self.scan_token() + + @abstractmethod + def scan_token(self) -> None: + """Scan a token + + This function should (at least) consume the current character and produce the appropriate token(s), using `add_token` + """ + pass diff --git a/lexer/position.py b/lexer/position.py new file mode 100644 index 0000000..306e24d --- /dev/null +++ b/lexer/position.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass +from typing import Optional + + +@dataclass(frozen=True) +class Position: + """A simple structure to store the position of a token""" + file: Optional[str] + line: int + column: int + + def __repr__(self): + return f"{self.file or ''}L{self.line}:{self.column}" diff --git a/lexer/token.py b/lexer/token.py new file mode 100644 index 0000000..3d0b293 --- /dev/null +++ b/lexer/token.py @@ -0,0 +1,39 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any + +from lexer.position import Position + + +class TokenType(Enum): + # Punctuation + LEFT_PAREN = auto() + RIGHT_PAREN = auto() + COLON = auto() + COMMA = auto() + UNDERSCORE = auto() + + # Operators + PLUS = auto() + + # Literals + IDENTIFIER = auto() + NUMBER = auto() + TRUE = auto() + FALSE = auto() + NONE = auto() + + # Misc + COMMENT = auto() + WHITESPACE = auto() + EOF = auto() + NEWLINE = auto() + + +@dataclass(frozen=True) +class Token: + """A scanned token""" + type: TokenType + lexeme: str + value: Any + position: Position From 10ee4991c3799a25edc230cc6c4ed510fd83d6c3 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 19:26:09 +0200 Subject: [PATCH 04/41] feat(parser): add a basic lexer for annotations --- lexer/annotations.py | 81 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 lexer/annotations.py diff --git a/lexer/annotations.py b/lexer/annotations.py new file mode 
100644 index 0000000..b8c7cf7 --- /dev/null +++ b/lexer/annotations.py @@ -0,0 +1,81 @@ +from lexer.base import Lexer +from lexer.token import TokenType + + +class AnnotationLexer(Lexer): + def scan_token(self) -> None: + char: str = self.advance() + match char: + case "(": + self.add_token(TokenType.LEFT_PAREN) + case ")": + self.add_token(TokenType.RIGHT_PAREN) + case "[": + self.add_token(TokenType.LEFT_BRACKET) + case "]": + self.add_token(TokenType.RIGHT_BRACKET) + case ":": + self.add_token(TokenType.COLON) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "+": + self.add_token(TokenType.PLUS) + case "#": + self.scan_comment() + case "\n": + self.add_token(TokenType.NEWLINE) + case " " | "\r" | "\t": + # Consume all whitespace characters until EOL or EOF + while ( + self.peek().isspace() + and self.peek() != "\n" + and not self.is_at_end() + ): + self.advance() + self.add_token(TokenType.WHITESPACE) + case _: + if char.isdigit(): + self.scan_number() + elif char.isalpha(): + self.scan_identifier() + else: + self.error("Unexpected character") + return None + + def scan_number(self): + """Scan the rest of number and add it as a token + + This method handles both simple integers and floats. Scientific notation + and base prefixes (0x, 0b, 0o) are not supported + """ + while self.peek().isdigit(): + self.advance() + + if self.peek() == "." 
and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value: float = float(self.source[self.start : self.idx]) + self.add_token(TokenType.NUMBER, value) + + def scan_identifier(self): + """Scan the rest of an identifier and add it as a token + + An identifier starts with a letter, followed by any number of + alphanumerical characters or underscores + """ + while self.peek().isalnum() or self.peek() == "_": + self.advance() + self.add_token(TokenType.IDENTIFIER) + + def scan_comment(self): + """Scan the rest of a comment and add it as a token + + A comment starts with a '#' character and ends at the EOL/EOF + """ + while self.peek() != "\n" and not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) From fcbea218a434ae872e184ec3d3043962ba9020f9 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 21:31:28 +0200 Subject: [PATCH 05/41] feat(parser): add a test script for the annotation lexer --- lexer/token.py | 2 ++ test.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 test.py diff --git a/lexer/token.py b/lexer/token.py index 3d0b293..e06194c 100644 --- a/lexer/token.py +++ b/lexer/token.py @@ -9,6 +9,8 @@ class TokenType(Enum): # Punctuation LEFT_PAREN = auto() RIGHT_PAREN = auto() + LEFT_BRACKET = auto() + RIGHT_BRACKET = auto() COLON = auto() COMMA = auto() UNDERSCORE = auto() diff --git a/test.py b/test.py new file mode 100644 index 0000000..32c5d97 --- /dev/null +++ b/test.py @@ -0,0 +1,15 @@ +import importlib + +from lexer.annotations import AnnotationLexer +from lexer.token import Token + + +mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") + +annotation: str = mod.__annotations__["df"] +lexer: AnnotationLexer = AnnotationLexer(annotation, "01_simple_types.py") +tokens: list[Token] = lexer.process() +print([ + f"{t.type.name}('{t.lexeme}')" + for t in tokens +]) \ No newline at end of file From 
1fc842e23f9b35cc8eab590ba519cc61ff885a26 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 22:06:32 +0200 Subject: [PATCH 06/41] feat(parser): add basic lexer for type definitions --- .../00_syntax_prototype/02_custom_types.midas | 2 +- lexer/annotations.py | 8 +- lexer/keyword.py | 9 ++ lexer/midas.py | 126 ++++++++++++++++++ lexer/token.py | 17 +++ 5 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 lexer/keyword.py create mode 100644 lexer/midas.py diff --git a/examples/00_syntax_prototype/02_custom_types.midas b/examples/00_syntax_prototype/02_custom_types.midas index ba8b758..8248e16 100644 --- a/examples/00_syntax_prototype/02_custom_types.midas +++ b/examples/00_syntax_prototype/02_custom_types.midas @@ -21,4 +21,4 @@ type Age // Predefined custom constraints that can be referenced in other definitions constraint Positive = _ >= 0 constraint StrictlyPositive = _ > 0 -constraint Even = _ % 2 == 0 \ No newline at end of file +//constraint Even = _ % 2 == 0 \ No newline at end of file diff --git a/lexer/annotations.py b/lexer/annotations.py index b8c7cf7..3cc0431 100644 --- a/lexer/annotations.py +++ b/lexer/annotations.py @@ -46,7 +46,7 @@ class AnnotationLexer(Lexer): def scan_number(self): """Scan the rest of number and add it as a token - + This method handles both simple integers and floats. 
Scientific notation and base prefixes (0x, 0b, 0o) are not supported """ @@ -63,7 +63,7 @@ class AnnotationLexer(Lexer): def scan_identifier(self): """Scan the rest of an identifier and add it as a token - + An identifier starts with a letter, followed by any number of alphanumerical characters or underscores """ @@ -73,8 +73,8 @@ class AnnotationLexer(Lexer): def scan_comment(self): """Scan the rest of a comment and add it as a token - - A comment starts with a '#' character and ends at the EOL/EOF + + A comment starts with a `#` character and ends at the EOL/EOF """ while self.peek() != "\n" and not self.is_at_end(): self.advance() diff --git a/lexer/keyword.py b/lexer/keyword.py new file mode 100644 index 0000000..a4f03cf --- /dev/null +++ b/lexer/keyword.py @@ -0,0 +1,9 @@ +from lexer.token import TokenType + +KEYWORDS: dict[str, TokenType] = { + "type": TokenType.TYPE, + "op": TokenType.OP, + "constraint": TokenType.CONSTRAINT, + "true": TokenType.TRUE, + "false": TokenType.FALSE, +} diff --git a/lexer/midas.py b/lexer/midas.py new file mode 100644 index 0000000..16440da --- /dev/null +++ b/lexer/midas.py @@ -0,0 +1,126 @@ +from lexer.base import Lexer +from lexer.keyword import KEYWORDS +from lexer.token import TokenType + + +class MidasLexer(Lexer): + def scan_token(self) -> None: + char: str = self.advance() + match char: + case "(": + self.add_token(TokenType.LEFT_PAREN) + case ")": + self.add_token(TokenType.RIGHT_PAREN) + case "[": + self.add_token(TokenType.LEFT_BRACKET) + case "]": + self.add_token(TokenType.RIGHT_BRACKET) + case "{": + self.add_token(TokenType.LEFT_BRACE) + case "}": + self.add_token(TokenType.RIGHT_BRACE) + case "<": + self.add_token( + TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS + ) + case ">": + self.add_token( + TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER + ) + case "=": + self.add_token( + TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL + ) + case ":": + 
self.add_token(TokenType.COLON) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "+": + self.add_token(TokenType.PLUS) + case "-": + self.add_token(TokenType.MINUS) + case "*": + self.add_token(TokenType.STAR) + case "/": + if self.match("/"): + self.scan_comment() + elif self.match("*"): + self.scan_comment_multiline() + else: + self.add_token(TokenType.SLASH) + case "\n": + self.add_token(TokenType.NEWLINE) + case " " | "\r" | "\t": + # Consume all whitespace characters until EOL or EOF + while ( + self.peek().isspace() + and self.peek() != "\n" + and not self.is_at_end() + ): + self.advance() + self.add_token(TokenType.WHITESPACE) + case _: + if char.isdigit(): + self.scan_number() + elif char.isalpha(): + self.scan_identifier() + else: + self.error("Unexpected character") + return None + + def scan_number(self): + """Scan the rest of number and add it as a token + + This method handles both simple integers and floats. Scientific notation + and base prefixes (0x, 0b, 0o) are not supported + """ + while self.peek().isdigit(): + self.advance() + + if self.peek() == "." 
and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value: float = float(self.source[self.start : self.idx]) + self.add_token(TokenType.NUMBER, value) + + def scan_identifier(self): + """Scan the rest of an identifier and add it as a token + + An identifier starts with a letter, followed by any number of + alphanumerical characters or underscores + """ + while self.peek().isalnum() or self.peek() == "_": + self.advance() + + lexeme: str = self.source[self.start : self.idx] + token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + self.add_token(token_type) + + def scan_comment(self): + """Scan the rest of a comment and add it as a token + + A comment starts with `//` and ends at the EOL/EOF + """ + while self.peek() != "\n" and not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) + + def scan_comment_multiline(self): + """Scan the rest of a multiline comment and add it as a token + + A multiline comment starts with `/*` and ends with `*/` or at the EOF + """ + while ( + not (self.peek() == "*" and self.peek_next() == "/") + and not self.is_at_end() + ): + self.advance() + if not self.is_at_end(): + self.advance() + if not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) diff --git a/lexer/token.py b/lexer/token.py index e06194c..9b5bc13 100644 --- a/lexer/token.py +++ b/lexer/token.py @@ -11,12 +11,23 @@ class TokenType(Enum): RIGHT_PAREN = auto() LEFT_BRACKET = auto() RIGHT_BRACKET = auto() + LEFT_BRACE = auto() + RIGHT_BRACE = auto() COLON = auto() COMMA = auto() UNDERSCORE = auto() # Operators PLUS = auto() + MINUS = auto() + STAR = auto() + SLASH = auto() + GREATER = auto() + GREATER_EQUAL = auto() + LESS = auto() + LESS_EQUAL = auto() + EQUAL = auto() + EQUAL_EQUAL = auto() # Literals IDENTIFIER = auto() @@ -25,6 +36,11 @@ class TokenType(Enum): FALSE = auto() NONE = auto() + # Keywords + TYPE = auto() + OP = auto() + CONSTRAINT = auto() + # Misc COMMENT = 
auto() WHITESPACE = auto() @@ -35,6 +51,7 @@ class TokenType(Enum): @dataclass(frozen=True) class Token: """A scanned token""" + type: TokenType lexeme: str value: Any From cc4b5dabf218f4b42f46e46af4f910562038ba36 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 22:07:23 +0200 Subject: [PATCH 07/41] feat(parser): add midas lexer to test script --- test.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test.py b/test.py index 32c5d97..abc3fc9 100644 --- a/test.py +++ b/test.py @@ -1,9 +1,12 @@ import importlib +from pathlib import Path from lexer.annotations import AnnotationLexer +from lexer.midas import MidasLexer from lexer.token import Token +# Frame annotation mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") annotation: str = mod.__annotations__["df"] @@ -12,4 +15,14 @@ tokens: list[Token] = lexer.process() print([ f"{t.type.name}('{t.lexeme}')" for t in tokens -]) \ No newline at end of file +]) + +# Midas type definitions +path: Path = Path("examples") / "00_syntax_prototype" / "02_custom_types.midas" +definitions: str = path.read_text() +midas_lexer: MidasLexer = MidasLexer(definitions, path.name) +tokens = midas_lexer.process() +print([ + f"{t.type.name}('{t.lexeme}')" + for t in tokens +]) From 8252f452f2d4d78c4ee0bac104832b38a66b8706 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Wed, 13 May 2026 22:38:41 +0200 Subject: [PATCH 08/41] feat(parser): add base parser class the parser was adapted from another project (see docstring on the Parser class) --- parser/base.py | 163 +++++++++++++++++++++++++++++++++++++++++++++++ parser/errors.py | 2 + 2 files changed, 165 insertions(+) create mode 100644 parser/base.py create mode 100644 parser/errors.py diff --git a/parser/base.py b/parser/base.py new file mode 100644 index 0000000..2195f72 --- /dev/null +++ b/parser/base.py @@ -0,0 +1,163 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing 
import Generic, TypeVar + +from lexer.token import Token, TokenType +from parser.errors import ParsingError + + +@dataclass(frozen=True) +class TokenError: + """A parsing error linked to a particular token""" + + token: Token + message: str + + def get_report(self) -> str: + """Get a detailed error message + + Returns: + str: the complete error message + """ + where: str = f"'{self.token.lexeme}'" + if self.token.type == TokenType.EOF: + where = "end" + return f"({self.token.position}) Error at {where}: {self.message}" + + +T = TypeVar("T") + + +class Parser(ABC, Generic[T]): + """An abstract parser which provides methods to easily extend it into a concrete one + + This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom, + more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble) + + [1]: https://craftinginterpreters.com/ + """ + + IGNORE: set[TokenType] = { + TokenType.WHITESPACE, + TokenType.COMMENT, + TokenType.NEWLINE, + } + + def __init__(self, tokens: list[Token]) -> None: + """Create a new parser to parse the given tokens + + Args: + tokens (list[Token]): the tokens to parse + """ + self.tokens: list[Token] = list( + filter(lambda t: t.type not in self.IGNORE, tokens) + ) + self.current: int = 0 + self.length: int = len(self.tokens) + self.errors: list[TokenError] + + def error(self, token: Token, message: str): + """Record an error + + Args: + token (Token): the token at which the error was detected + message (str): a message explaining the error + + Returns: + ParsingError: the parsing error to raise + """ + self.errors.append(TokenError(token=token, message=message)) + return ParsingError() + + @abstractmethod + def parse(self) -> T: + """Parse the tokens + + Returns: + T: the parsed element(s) + """ + pass + + def is_at_end(self) -> bool: + """Whether the parser is at the end of the token list + + Returns: + bool: True if the current index is at the end of the token list + """ + return 
self.peek().type == TokenType.EOF + + def peek(self) -> Token: + """Get the current token without advancing + + Returns: + Token: the current token + """ + return self.tokens[self.current] + + def previous(self) -> Token: + """Get the previous token + + This function is unsafe and will raise an IndexError if called when + the parser is at the begin of the token list + + Returns: + Token: the previous token + """ + return self.tokens[self.current - 1] + + def check(self, token_type: TokenType) -> bool: + """Check whether the current token is of the given type + + This function always returns False if the parser is at the EOF token + + Args: + token_type (TokenType): the type of token to check + + Returns: + bool: True if the current token is of the given type and not EOF + """ + if self.is_at_end(): + return False + return self.peek().type == token_type + + def advance(self) -> Token: + """Consume and return the current token, if not at the EOF + + Returns: + Token: the current token, before advancing + """ + if not self.is_at_end(): + self.current += 1 + return self.previous() + + def match(self, *types: TokenType) -> bool: + """Consume the next token if it matches one of the given types + + Returns: + bool: whether a token was matched and consumed + """ + for token_type in types: + if self.check(token_type): + self.advance() + return True + return False + + def consume(self, token_type: TokenType, error_msg: str) -> Token: + """Consume the current token if it matches the given type or raise an error + + If the current token doesn't match the given type, an error is raised + with the provided message + + Args: + token_type (TokenType): the expected token type + error_msg (str): the error message if the token doesn't match + + Raises: + SyntaxError: if the current token doesn't match the given type + + Returns: + Token: the current token which matched the given type + """ + if self.check(token_type): + return self.advance() + raise self.error(self.peek(), error_msg) 
diff --git a/parser/errors.py b/parser/errors.py new file mode 100644 index 0000000..e8e65fb --- /dev/null +++ b/parser/errors.py @@ -0,0 +1,2 @@ +class ParsingError(RuntimeError): + pass From 721ed812df1607a0951e51fb7464bc58f568d62d Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 00:07:46 +0200 Subject: [PATCH 09/41] feat(parser): add a basic annotation parser --- core/ast/annotations.py | 53 ++++++++++++++++++++++++++++++++++ parser/annotations.py | 64 +++++++++++++++++++++++++++++++++++++++++ parser/base.py | 22 +++++++++++++- 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 core/ast/annotations.py create mode 100644 parser/annotations.py diff --git a/core/ast/annotations.py b/core/ast/annotations.py new file mode 100644 index 0000000..78a7ce6 --- /dev/null +++ b/core/ast/annotations.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Generic, Optional, TypeVar + +from lexer.token import Token + +T = TypeVar("T") + + +@dataclass(frozen=True) +class Expr(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_type_expr(self, expr: TypeExpr) -> T: ... + + @abstractmethod + def visit_schema_expr(self, expr: SchemaExpr) -> T: ... + + @abstractmethod + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> T: ... 
+ + +@dataclass(frozen=True) +class TypeExpr(Expr): + name: Token + schema: Optional[SchemaExpr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_expr(self) + + +@dataclass(frozen=True) +class SchemaExpr(Expr): + left: Token + elements: list[Expr] + right: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_schema_expr(self) + + +@dataclass(frozen=True) +class SchemaElementExpr(Expr): + name: Optional[Token] + type: Optional[Expr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_schema_element_expr(self) diff --git a/parser/annotations.py b/parser/annotations.py new file mode 100644 index 0000000..4b1228d --- /dev/null +++ b/parser/annotations.py @@ -0,0 +1,64 @@ +from typing import Optional + +from core.ast.annotations import Expr, SchemaElementExpr, SchemaExpr, TypeExpr +from lexer.token import Token, TokenType +from parser.base import Parser +from parser.errors import ParsingError + + +class AnnotationParser(Parser): + SYNC_BOUNDARY: set[TokenType] = set() + + def parse(self) -> Optional[Expr]: + expression: Optional[Expr] = self.annotation() + if not self.is_at_end(): + self.error(self.peek(), "Extra tokens") + return expression + + def synchronize(self): + self.advance() + while not self.is_at_end(): + if self.peek().type in self.SYNC_BOUNDARY: + return + self.advance() + + def annotation(self) -> Optional[Expr]: + try: + return self.type() + except ParsingError: + self.synchronize() + return None + + def type(self) -> TypeExpr: + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type identifier") + schema: Optional[SchemaExpr] = None + if self.match(TokenType.LEFT_BRACKET): + schema = self.schema() + return TypeExpr(name=name, schema=schema) + + def schema(self) -> SchemaExpr: + left: Token = self.previous() + elements: list[Expr] = [] + while not self.check(TokenType.RIGHT_BRACKET) and not self.is_at_end(): + elements.append(self.schema_element()) + if not 
self.check(TokenType.RIGHT_BRACKET): + self.consume(TokenType.COMMA, "Expected ',' between schema elements") + + right: Token = self.consume(TokenType.RIGHT_BRACKET, "Unclosed schema") + return SchemaExpr(left=left, elements=elements, right=right) + + def schema_element(self) -> Expr: + if self.match(TokenType.UNDERSCORE): + return SchemaElementExpr(name=None, type=None) + + if not self.check(TokenType.IDENTIFIER): + raise self.error(self.peek(), "Expected schema element") + + name: Optional[Token] = None + type: Optional[TypeExpr] = None + if self.check_next(TokenType.COLON): + name = self.advance() + self.advance() + if not self.match(TokenType.UNDERSCORE): + type = self.type() + return SchemaElementExpr(name=name, type=type) diff --git a/parser/base.py b/parser/base.py index 2195f72..74962db 100644 --- a/parser/base.py +++ b/parser/base.py @@ -54,7 +54,7 @@ class Parser(ABC, Generic[T]): ) self.current: int = 0 self.length: int = len(self.tokens) - self.errors: list[TokenError] + self.errors: list[TokenError] = [] def error(self, token: Token, message: str): """Record an error @@ -120,6 +120,26 @@ class Parser(ABC, Generic[T]): return False return self.peek().type == token_type + def check_next(self, token_type: TokenType) -> bool: + """Check whether the next token is of the given type + + This function always returns False if the parser is at the EOF token + + Args: + token_type (TokenType): the type of token to check + + Returns: + bool: True if the current token is of the given type and not EOF + """ + if self.is_at_end(): + return False + if self.current + 1 >= self.length: + return False + token: Token = self.tokens[self.current + 1] + if token.type == TokenType.EOF: + return False + return token.type == token_type + def advance(self) -> Token: """Consume and return the current token, if not at the EOF From c420e5e254703290179eb7ef7e6e4e641ca923d5 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 00:55:31 +0200 Subject: [PATCH 10/41] 
feat(parser): add an annotation AST printer --- core/ast/printer.py | 109 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 core/ast/printer.py diff --git a/core/ast/printer.py b/core/ast/printer.py new file mode 100644 index 0000000..c9d6841 --- /dev/null +++ b/core/ast/printer.py @@ -0,0 +1,109 @@ +from typing import Optional + +from core.ast.annotations import Expr, TypeExpr, SchemaExpr, SchemaElementExpr + + +class AnnotationAstPrinter(Expr.Visitor[str]): + LAST_CHILD = "└── " + CHILD = "├── " + VERTICAL = "│ " + EMPTY = " " + + def __init__(self): + self.level: int = 0 + self.idx: Optional[int] = None + self.last: bool = False + self.levels: list[int] = [] + + def print(self, expr: Expr): + return expr.accept(self) + + def print_line(self, text: str) -> str: + indent: str = "" + for enabled in self.levels[:-1]: + if enabled: + indent += self.VERTICAL + else: + indent += self.EMPTY + + if len(self.levels) > 0: + if self.levels[-1] == 2: + indent += self.LAST_CHILD + self.levels[-1] = 0 + else: + indent += self.CHILD + if self.idx is not None: + text = f"[{self.idx}] {text}" + self.idx = None + return indent + text + "\n" + + def visit_type_expr(self, expr: TypeExpr) -> str: + res: str = self.print_line("TypeExpr") + self.levels.append(1) + res += self.print_line(f'name: "{expr.name.lexeme}"') + self.levels[-1] = 2 + if expr.schema is None: + res += self.print_line("schema: None") + else: + res += self.print_line("schema") + self.levels.append(2) + res += expr.schema.accept(self) + self.levels.pop() + self.levels.pop() + return res + + def visit_schema_expr(self, expr: SchemaExpr) -> str: + res: str = self.print_line("SchemaExpr") + self.levels.append(1) + for i, elmt in enumerate(expr.elements): + self.idx = i + if i == len(expr.elements) - 1: + self.levels[-1] = 2 + res += elmt.accept(self) + self.levels.pop() + return res + + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: + res: str = 
self.print_line("SchemaElementExpr") + self.levels.append(1) + res += self.print_line( + "name: " + ("None" if expr.name is None else f'"{expr.name.lexeme}"') + ) + self.levels[-1] = 2 + if expr.type is None: + res += self.print_line("type: None") + else: + res += self.print_line("type") + self.levels.append(2) + res += expr.type.accept(self) + self.levels.pop() + self.levels.pop() + return res + + +class AnnotationPrinter(Expr.Visitor[str]): + def print(self, expr: Expr): + return expr.accept(self) + + def visit_type_expr(self, expr: TypeExpr) -> str: + schema: str = "" + if expr.schema is not None: + schema = expr.schema.accept(self) + return f"{expr.name.lexeme}{schema}" + + def visit_schema_expr(self, expr: SchemaExpr) -> str: + res: str = expr.left.lexeme + res += ", ".join(elmt.accept(self) for elmt in expr.elements) + res += expr.right.lexeme + return res + + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: + parts: list[str] = [] + if expr.name is not None: + parts.append(expr.name.lexeme) + + if expr.type is None: + parts.append("_") + else: + parts.append(expr.type.accept(self)) + return ": ".join(parts) From 052339ad3a4eedcbddda6b3cd2135f6988b26eb7 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 01:11:37 +0200 Subject: [PATCH 11/41] refactor(parser): improve AST printer refactored the messy AST printer implementation with Claude to use a context manager, an enum and extract common functions Co-authored-by: Claude --- core/ast/printer.py | 139 ++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/core/ast/printer.py b/core/ast/printer.py index c9d6841..c624637 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -1,84 +1,97 @@ -from typing import Optional +from contextlib import contextmanager +from enum import Enum, auto +import io +from typing import Generator, Optional from core.ast.annotations import Expr, TypeExpr, SchemaExpr, SchemaElementExpr -class
AnnotationAstPrinter(Expr.Visitor[str]): +class _Level(Enum): + EMPTY = auto() + ACTIVE = auto() + LAST = auto() + + +class AnnotationAstPrinter(Expr.Visitor[None]): LAST_CHILD = "└── " CHILD = "├── " VERTICAL = "│ " EMPTY = " " def __init__(self): - self.level: int = 0 - self.idx: Optional[int] = None - self.last: bool = False - self.levels: list[int] = [] + self._levels: list[_Level] = [] + self._idx: Optional[int] = None + self._buf: io.StringIO = io.StringIO() def print(self, expr: Expr): - return expr.accept(self) + self._buf = io.StringIO() + expr.accept(self) + return self._buf.getvalue() - def print_line(self, text: str) -> str: - indent: str = "" - for enabled in self.levels[:-1]: - if enabled: - indent += self.VERTICAL + @contextmanager + def _child_level(self, last: bool = False) -> Generator[None, None, None]: + self._levels.append(_Level.LAST if last else _Level.ACTIVE) + try: + yield + finally: + self._levels.pop() + + def _mark_last(self): + if self._levels: + self._levels[-1] = _Level.LAST + + def _write_line(self, text: str): + indent: str = self._build_indent() + if self._idx is not None: + text = f"[{self._idx}] {text}" + self._idx = None + self._buf.write(indent + text + "\n") + + def _build_indent(self) -> str: + parts: list[str] = [] + for level in self._levels[:-1]: + parts.append(self.EMPTY if level == _Level.EMPTY else self.VERTICAL) + if self._levels: + if self._levels[-1] == _Level.LAST: + parts.append(self.LAST_CHILD) + self._levels[-1] = _Level.EMPTY else: - indent += self.EMPTY + parts.append(self.CHILD) + return "".join(parts) - if len(self.levels) > 0: - if self.levels[-1] == 2: - indent += self.LAST_CHILD - self.levels[-1] = 0 - else: - indent += self.CHILD - if self.idx is not None: - text = f"[{self.idx}] {text}" - self.idx = None - return indent + text + "\n" - - def visit_type_expr(self, expr: TypeExpr) -> str: - res: str = self.print_line("TypeExpr") - self.levels.append(1) - res += self.print_line(f'name: 
"{expr.name.lexeme}"') - self.levels[-1] = 2 - if expr.schema is None: - res += self.print_line("schema: None") + def _write_optional_child( + self, label: str, child: Optional[Expr], *, last: bool = False + ): + if last: + self._mark_last() + if child is None: + self._write_line(f"{label}: None") else: - res += self.print_line("schema") - self.levels.append(2) - res += expr.schema.accept(self) - self.levels.pop() - self.levels.pop() - return res + self._write_line(label) + with self._child_level(last=True): + child.accept(self) - def visit_schema_expr(self, expr: SchemaExpr) -> str: - res: str = self.print_line("SchemaExpr") - self.levels.append(1) - for i, elmt in enumerate(expr.elements): - self.idx = i - if i == len(expr.elements) - 1: - self.levels[-1] = 2 - res += elmt.accept(self) - self.levels.pop() - return res + def visit_type_expr(self, expr: TypeExpr): + self._write_line("TypeExpr") + with self._child_level(): + self._write_line(f'name: "{expr.name.lexeme}"') + self._write_optional_child("schema", expr.schema, last=True) - def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: - res: str = self.print_line("SchemaElementExpr") - self.levels.append(1) - res += self.print_line( - "name: " + ("None" if expr.name is None else f'"{expr.name.lexeme}"') - ) - self.levels[-1] = 2 - if expr.type is None: - res += self.print_line("type: None") - else: - res += self.print_line("type") - self.levels.append(2) - res += expr.type.accept(self) - self.levels.pop() - self.levels.pop() - return res + def visit_schema_expr(self, expr: SchemaExpr): + self._write_line("SchemaExpr") + with self._child_level(): + for i, elmt in enumerate(expr.elements): + self._idx = i + if i == len(expr.elements) - 1: + self._mark_last() + elmt.accept(self) + + def visit_schema_element_expr(self, expr: SchemaElementExpr): + self._write_line("SchemaElementExpr") + with self._child_level(): + name_text: str = "None" if expr.name is None else f'"{expr.name.lexeme}"' + 
self._write_line(f"name: {name_text}") + self._write_optional_child("type", expr.type, last=True) class AnnotationPrinter(Expr.Visitor[str]): From 6d885a044956bd9c9f73304e3363dc4570df5092 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 01:12:05 +0200 Subject: [PATCH 12/41] feat(parser): use AST printer in test script --- test.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test.py b/test.py index abc3fc9..551e69a 100644 --- a/test.py +++ b/test.py @@ -1,10 +1,11 @@ import importlib from pathlib import Path +from core.ast.printer import AnnotationAstPrinter from lexer.annotations import AnnotationLexer from lexer.midas import MidasLexer from lexer.token import Token - +from parser.annotations import AnnotationParser # Frame annotation mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") @@ -12,17 +13,20 @@ mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") annotation: str = mod.__annotations__["df"] lexer: AnnotationLexer = AnnotationLexer(annotation, "01_simple_types.py") tokens: list[Token] = lexer.process() -print([ - f"{t.type.name}('{t.lexeme}')" - for t in tokens -]) +# print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) + +parser = AnnotationParser(tokens) +parsed = parser.parse() +print(parsed) +for err in parser.errors: + print(err.get_report()) +printer = AnnotationAstPrinter() +if parsed is not None: + print(printer.print(parsed)) # Midas type definitions path: Path = Path("examples") / "00_syntax_prototype" / "02_custom_types.midas" definitions: str = path.read_text() midas_lexer: MidasLexer = MidasLexer(definitions, path.name) tokens = midas_lexer.process() -print([ - f"{t.type.name}('{t.lexeme}')" - for t in tokens -]) +# print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) From 4d25b43a4e55b060f7e187987a6971344c17b3de Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:00:52 +0200 Subject: [PATCH 13/41] 
fix(parser): prepare printer for midas printer --- core/ast/printer.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/core/ast/printer.py b/core/ast/printer.py index c624637..4394834 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -1,9 +1,11 @@ +from __future__ import annotations + from contextlib import contextmanager from enum import Enum, auto import io -from typing import Generator, Optional +from typing import Generator, Generic, Optional, Protocol, TypeVar -from core.ast.annotations import Expr, TypeExpr, SchemaExpr, SchemaElementExpr +import core.ast.annotations as a class _Level(Enum): @@ -12,7 +14,14 @@ class _Level(Enum): LAST = auto() -class AnnotationAstPrinter(Expr.Visitor[None]): +class Expr(Protocol): + def accept(self, printer: AstPrinter) -> None: ... + + +T = TypeVar("T", bound=Expr) + + +class AstPrinter(Generic[T]): LAST_CHILD = "└── " CHILD = "├── " VERTICAL = "│ " @@ -23,7 +32,7 @@ class AnnotationAstPrinter(Expr.Visitor[None]): self._idx: Optional[int] = None self._buf: io.StringIO = io.StringIO() - def print(self, expr: Expr): + def print(self, expr: T): self._buf = io.StringIO() expr.accept(self) return self._buf.getvalue() @@ -60,7 +69,7 @@ class AnnotationAstPrinter(Expr.Visitor[None]): return "".join(parts) def _write_optional_child( - self, label: str, child: Optional[Expr], *, last: bool = False + self, label: str, child: Optional[T], *, last: bool = False ): if last: self._mark_last() @@ -71,13 +80,15 @@ class AnnotationAstPrinter(Expr.Visitor[None]): with self._child_level(last=True): child.accept(self) - def visit_type_expr(self, expr: TypeExpr): + +class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None]): + def visit_type_expr(self, expr: a.TypeExpr): self._write_line("TypeExpr") with self._child_level(): self._write_line(f'name: "{expr.name.lexeme}"') self._write_optional_child("schema", expr.schema, last=True) - def visit_schema_expr(self, expr: 
SchemaExpr): + def visit_schema_expr(self, expr: a.SchemaExpr): self._write_line("SchemaExpr") with self._child_level(): for i, elmt in enumerate(expr.elements): @@ -86,7 +97,7 @@ class AnnotationAstPrinter(Expr.Visitor[None]): self._mark_last() elmt.accept(self) - def visit_schema_element_expr(self, expr: SchemaElementExpr): + def visit_schema_element_expr(self, expr: a.SchemaElementExpr): self._write_line("SchemaElementExpr") with self._child_level(): name_text: str = "None" if expr.name is None else f'"{expr.name.lexeme}"' @@ -94,23 +105,23 @@ class AnnotationAstPrinter(Expr.Visitor[None]): self._write_optional_child("type", expr.type, last=True) -class AnnotationPrinter(Expr.Visitor[str]): - def print(self, expr: Expr): +class AnnotationPrinter(a.Expr.Visitor[str]): + def print(self, expr: a.Expr): return expr.accept(self) - def visit_type_expr(self, expr: TypeExpr) -> str: + def visit_type_expr(self, expr: a.TypeExpr) -> str: schema: str = "" if expr.schema is not None: schema = expr.schema.accept(self) return f"{expr.name.lexeme}{schema}" - def visit_schema_expr(self, expr: SchemaExpr) -> str: + def visit_schema_expr(self, expr: a.SchemaExpr) -> str: res: str = expr.left.lexeme res += ", ".join(elmt.accept(self) for elmt in expr.elements) res += expr.right.lexeme return res - def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: + def visit_schema_element_expr(self, expr: a.SchemaElementExpr) -> str: parts: list[str] = [] if expr.name is not None: parts.append(expr.name.lexeme) From 6482e06bcaa42cae3e18d52b0fbe9ebf0fb50ef0 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:19:50 +0200 Subject: [PATCH 14/41] feat(parser): add base Midas parser (incomplete) --- core/ast/midas.py | 86 +++++++++++++++++++++++++++++++++++++++++++++ core/ast/printer.py | 50 ++++++++++++++++++++++++++ parser/midas.py | 67 +++++++++++++++++++++++++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 core/ast/midas.py create mode 100644 
parser/midas.py diff --git a/core/ast/midas.py b/core/ast/midas.py new file mode 100644 index 0000000..1c85f66 --- /dev/null +++ b/core/ast/midas.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Generic, Optional, TypeVar + +from lexer.token import Token + +T = TypeVar("T") + + +# Statements + + +@dataclass(frozen=True) +class Stmt(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_type_stmt(self, stmt: TypeStmt) -> T: ... + + @abstractmethod + def visit_property_stmt(self, stmt: PropertyStmt) -> T: ... + + +@dataclass(frozen=True) +class TypeStmt(Stmt): + name: Token + bases: list[TypeExpr] + body: Optional[TypeBodyExpr] + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_type_stmt(self) + + +@dataclass(frozen=True) +class PropertyStmt(Stmt): + name: Token + type: TypeExpr + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_property_stmt(self) + + +# Expressions + + +@dataclass(frozen=True) +class Expr(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_type_expr(self, expr: TypeExpr) -> T: ... + + @abstractmethod + def visit_constraint_expr(self, expr: ConstraintExpr) -> T: ... + + @abstractmethod + def visit_type_body_expr(self, expr: TypeBodyExpr) -> T: ... 
+ + +@dataclass(frozen=True) +class TypeExpr(Expr): + name: Token + constraints: list[ConstraintExpr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_expr(self) + + +@dataclass(frozen=True) +class ConstraintExpr(Expr): + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_constraint_expr(self) + + +@dataclass(frozen=True) +class TypeBodyExpr(Expr): + properties: list[PropertyStmt] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_body_expr(self) diff --git a/core/ast/printer.py b/core/ast/printer.py index 4394834..8c873eb 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -6,6 +6,7 @@ import io from typing import Generator, Generic, Optional, Protocol, TypeVar import core.ast.annotations as a +import core.ast.midas as m class _Level(Enum): @@ -131,3 +132,52 @@ class AnnotationPrinter(a.Expr.Visitor[str]): else: parts.append(expr.type.accept(self)) return ": ".join(parts) + + +class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): + def visit_type_stmt(self, stmt: m.TypeStmt): + self._write_line("TypeStmt") + with self._child_level(): + self._write_line(f'name: "{stmt.name.lexeme}"') + self._write_line("bases") + with self._child_level(): + for i, base in enumerate(stmt.bases): + self._idx = i + if i == len(stmt.bases) - 1: + self._mark_last() + base.accept(self) + self._write_optional_child("body", stmt.body, last=True) + + def visit_property_stmt(self, stmt: m.PropertyStmt): + self._write_line("PropertyStmt") + with self._child_level(): + self._write_line(f'name: "{stmt.name.lexeme}"') + self._write_line("type") + with self._child_level(): + stmt.type.accept(self) + + def visit_type_expr(self, expr: m.TypeExpr): + self._write_line("TypeExpr") + with self._child_level(): + self._write_line(f'name: "{expr.name.lexeme}"') + self._write_line("constraints") + with self._child_level(): + for i, constraint in enumerate(expr.constraints): + self._idx = i 
+ if i == len(expr.constraints) - 1: + self._mark_last() + constraint.accept(self) + + def visit_constraint_expr(self, expr: m.ConstraintExpr): + self._write_line("ConstraintExpr") + + def visit_type_body_expr(self, expr: m.TypeBodyExpr): + self._write_line("TypeBodyExpr") + with self._child_level(): + self._write_line("properties") + with self._child_level(): + for i, property in enumerate(expr.properties): + self._idx = i + if i == len(expr.properties) - 1: + self._mark_last() + property.accept(self) diff --git a/parser/midas.py b/parser/midas.py new file mode 100644 index 0000000..fd02c04 --- /dev/null +++ b/parser/midas.py @@ -0,0 +1,67 @@ +from typing import Optional + +from core.ast.midas import ConstraintExpr, Stmt, TypeBodyExpr, TypeExpr, TypeStmt +from lexer.token import Token, TokenType +from parser.base import Parser +from parser.errors import ParsingError + + +class MidasParser(Parser): + SYNC_BOUNDARY: set[TokenType] = {TokenType.TYPE, TokenType.OP, TokenType.CONSTRAINT} + + def parse(self) -> list[Stmt]: + statements: list[Stmt] = [] + while not self.is_at_end(): + stmt: Optional[Stmt] = self.declaration() + if stmt is None: + print("Early stop") + break + statements.append(stmt) + return statements + + def synchronize(self): + self.advance() + while not self.is_at_end(): + if self.previous().type == TokenType.NEWLINE: + return + if self.peek().type in self.SYNC_BOUNDARY: + return + self.advance() + + def declaration(self) -> Optional[Stmt]: + try: + if self.match(TokenType.TYPE): + return self.type_declaration() + # if self.match(TokenType.OP): + # return self.op_declaration() + # if self.match(TokenType.CONSTRAINT): + # return self.constraint_declaration() + except ParsingError: + self.synchronize() + return None + + def type_declaration(self) -> TypeStmt: + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + self.consume(TokenType.LESS, "Expected '<' after type name") + bases: list[TypeExpr] = [] + while not 
self.check(TokenType.GREATER) and not self.is_at_end(): + bases.append(self.type_expr()) + if not self.check(TokenType.GREATER): + self.consume(TokenType.COMMA, "Expected ',' between type bases") + self.consume(TokenType.GREATER, "Expected '>' after base type") + + body: Optional[TypeBodyExpr] = None + return TypeStmt(name=name, bases=bases, body=body) + + def type_expr(self) -> TypeExpr: + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + constraints: list[ConstraintExpr] = [] + + while not self.is_at_end() and self.match(TokenType.PLUS): + constraints.append(self.constraint_expr()) + + return TypeExpr(name=name, constraints=constraints) + + def constraint_expr(self) -> ConstraintExpr: + # TODO + return ConstraintExpr() From 3b40abaa2b4bee9c6d2ad51deca55a009d280f7e Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:29:08 +0200 Subject: [PATCH 15/41] feat(parser): parse type body --- core/ast/printer.py | 11 +++++++---- parser/midas.py | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/core/ast/printer.py b/core/ast/printer.py index 8c873eb..46c45da 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -50,7 +50,9 @@ class AstPrinter(Generic[T]): if self._levels: self._levels[-1] = _Level.LAST - def _write_line(self, text: str): + def _write_line(self, text: str, *, last: bool = False): + if last: + self._mark_last() indent: str = self._build_indent() if self._idx is not None: text = f"[{self._idx}] {text}" @@ -152,15 +154,16 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): self._write_line("PropertyStmt") with self._child_level(): self._write_line(f'name: "{stmt.name.lexeme}"') - self._write_line("type") + self._write_line("type", last=True) with self._child_level(): + self._mark_last() stmt.type.accept(self) def visit_type_expr(self, expr: m.TypeExpr): self._write_line("TypeExpr") with self._child_level(): self._write_line(f'name: 
"{expr.name.lexeme}"') - self._write_line("constraints") + self._write_line("constraints", last=True) with self._child_level(): for i, constraint in enumerate(expr.constraints): self._idx = i @@ -174,7 +177,7 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): def visit_type_body_expr(self, expr: m.TypeBodyExpr): self._write_line("TypeBodyExpr") with self._child_level(): - self._write_line("properties") + self._write_line("properties", last=True) with self._child_level(): for i, property in enumerate(expr.properties): self._idx = i diff --git a/parser/midas.py b/parser/midas.py index fd02c04..f03029d 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -1,6 +1,13 @@ from typing import Optional -from core.ast.midas import ConstraintExpr, Stmt, TypeBodyExpr, TypeExpr, TypeStmt +from core.ast.midas import ( + ConstraintExpr, + PropertyStmt, + Stmt, + TypeBodyExpr, + TypeExpr, + TypeStmt, +) from lexer.token import Token, TokenType from parser.base import Parser from parser.errors import ParsingError @@ -51,6 +58,9 @@ class MidasParser(Parser): self.consume(TokenType.GREATER, "Expected '>' after base type") body: Optional[TypeBodyExpr] = None + + if self.check(TokenType.LEFT_BRACE): + body = self.type_body_expr() return TypeStmt(name=name, bases=bases, body=body) def type_expr(self) -> TypeExpr: @@ -65,3 +75,17 @@ class MidasParser(Parser): def constraint_expr(self) -> ConstraintExpr: # TODO return ConstraintExpr() + + def type_body_expr(self) -> TypeBodyExpr: + self.consume(TokenType.LEFT_BRACE, "Expected '{' to start type body") + properties: list[PropertyStmt] = [] + while not self.check(TokenType.RIGHT_BRACE) and not self.is_at_end(): + properties.append(self.property_stmt()) + self.consume(TokenType.RIGHT_BRACE, "Unclosed type body") + return TypeBodyExpr(properties=properties) + + def property_stmt(self) -> PropertyStmt: + name: Token = self.consume(TokenType.IDENTIFIER, "Expected property name") + self.consume(TokenType.COLON, 
"Expected ':' after property name") + type: TypeExpr = self.type_expr() + return PropertyStmt(name=name, type=type) From 0af31a6f859574e98e3ce771d94d56fd0714154e Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:37:50 +0200 Subject: [PATCH 16/41] feat(parser): parse op statements --- core/ast/midas.py | 14 ++++++++++++++ core/ast/printer.py | 20 ++++++++++++++++++++ parser/midas.py | 24 ++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/core/ast/midas.py b/core/ast/midas.py index 1c85f66..6f64b57 100644 --- a/core/ast/midas.py +++ b/core/ast/midas.py @@ -24,6 +24,9 @@ class Stmt(ABC): @abstractmethod def visit_property_stmt(self, stmt: PropertyStmt) -> T: ... + @abstractmethod + def visit_op_stmt(self, stmt: OpStmt) -> T: ... + @dataclass(frozen=True) class TypeStmt(Stmt): @@ -44,6 +47,17 @@ class PropertyStmt(Stmt): return visitor.visit_property_stmt(self) +@dataclass(frozen=True) +class OpStmt(Stmt): + left: TypeExpr + op: Token + right: TypeExpr + result: TypeExpr + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_op_stmt(self) + + # Expressions diff --git a/core/ast/printer.py b/core/ast/printer.py index 46c45da..c745221 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -159,6 +159,26 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): self._mark_last() stmt.type.accept(self) + def visit_op_stmt(self, stmt: m.OpStmt) -> None: + self._write_line("OpStmt") + with self._child_level(): + self._write_line("left") + with self._child_level(): + self._mark_last() + stmt.left.accept(self) + + self._write_line(f'op: "{stmt.op.lexeme}"') + + self._write_line("right") + with self._child_level(): + self._mark_last() + stmt.right.accept(self) + + self._write_line("result", last=True) + with self._child_level(): + self._mark_last() + stmt.result.accept(self) + def visit_type_expr(self, expr: m.TypeExpr): self._write_line("TypeExpr") with 
self._child_level(): diff --git a/parser/midas.py b/parser/midas.py index f03029d..bcf1477 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -2,6 +2,7 @@ from typing import Optional from core.ast.midas import ( ConstraintExpr, + OpStmt, PropertyStmt, Stmt, TypeBodyExpr, @@ -39,8 +40,8 @@ class MidasParser(Parser): try: if self.match(TokenType.TYPE): return self.type_declaration() - # if self.match(TokenType.OP): - # return self.op_declaration() + if self.match(TokenType.OP): + return self.op_declaration() # if self.match(TokenType.CONSTRAINT): # return self.constraint_declaration() except ParsingError: @@ -89,3 +90,22 @@ class MidasParser(Parser): self.consume(TokenType.COLON, "Expected ':' after property name") type: TypeExpr = self.type_expr() return PropertyStmt(name=name, type=type) + + def op_declaration(self) -> OpStmt: + self.consume(TokenType.LESS, "Expected '<' before first type") + left: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after first type") + + op: Token = self.advance() + + self.consume(TokenType.LESS, "Expected '<' before second type") + right: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after second type") + + self.consume(TokenType.EQUAL, "Expected '=' after second type") + + self.consume(TokenType.LESS, "Expected '<' before result type") + result: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after result type") + + return OpStmt(left=left, op=op, right=right, result=result) From 4b715ed33aea100f086f9a7fc8252b969a52ee75 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:38:04 +0200 Subject: [PATCH 17/41] feat(parser): use midas parser in test script --- test.py | 62 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/test.py b/test.py index 551e69a..0476438 100644 --- a/test.py +++ b/test.py @@ -1,32 +1,52 @@ import importlib from pathlib import Path -from 
core.ast.printer import AnnotationAstPrinter +from core.ast.printer import AnnotationAstPrinter, MidasAstPrinter from lexer.annotations import AnnotationLexer from lexer.midas import MidasLexer from lexer.token import Token from parser.annotations import AnnotationParser +from parser.midas import MidasParser -# Frame annotation -mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") -annotation: str = mod.__annotations__["df"] -lexer: AnnotationLexer = AnnotationLexer(annotation, "01_simple_types.py") -tokens: list[Token] = lexer.process() -# print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) +def test_annotation(): + # Frame annotation + mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") -parser = AnnotationParser(tokens) -parsed = parser.parse() -print(parsed) -for err in parser.errors: - print(err.get_report()) -printer = AnnotationAstPrinter() -if parsed is not None: - print(printer.print(parsed)) + annotation: str = mod.__annotations__["df"] + lexer: AnnotationLexer = AnnotationLexer(annotation, "01_simple_types.py") + tokens: list[Token] = lexer.process() + # print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) -# Midas type definitions -path: Path = Path("examples") / "00_syntax_prototype" / "02_custom_types.midas" -definitions: str = path.read_text() -midas_lexer: MidasLexer = MidasLexer(definitions, path.name) -tokens = midas_lexer.process() -# print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) + parser = AnnotationParser(tokens) + parsed = parser.parse() + print(parsed) + for err in parser.errors: + print(err.get_report()) + printer = AnnotationAstPrinter() + if parsed is not None: + print(printer.print(parsed)) + + +def test_midas(): + # Midas type definitions + path: Path = Path("examples") / "00_syntax_prototype" / "02_custom_types.midas" + definitions: str = path.read_text() + midas_lexer: MidasLexer = MidasLexer(definitions, path.name) + tokens: list[Token] = midas_lexer.process() + # 
print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) + + parser = MidasParser(tokens) + parsed = parser.parse() + print(parsed) + for err in parser.errors: + print(err.get_report()) + printer = MidasAstPrinter() + for stmt in parsed: + if stmt is None: + print("None") + continue + print(printer.print(stmt)) + + +test_midas() From 61b36ee50fede921d5a399b85381c517838ec1be Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Thu, 14 May 2026 02:44:21 +0200 Subject: [PATCH 18/41] feat(parser): parse constraint statements --- core/ast/midas.py | 12 ++++++++++++ core/ast/printer.py | 9 +++++++++ parser/midas.py | 11 +++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/core/ast/midas.py b/core/ast/midas.py index 6f64b57..6d1b035 100644 --- a/core/ast/midas.py +++ b/core/ast/midas.py @@ -27,6 +27,9 @@ class Stmt(ABC): @abstractmethod def visit_op_stmt(self, stmt: OpStmt) -> T: ... + @abstractmethod + def visit_constraint_stmt(self, stmt: ConstraintStmt) -> T: ... + @dataclass(frozen=True) class TypeStmt(Stmt): @@ -58,6 +61,15 @@ class OpStmt(Stmt): return visitor.visit_op_stmt(self) +@dataclass(frozen=True) +class ConstraintStmt(Stmt): + name: Token + constraint: ConstraintExpr + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_constraint_stmt(self) + + # Expressions diff --git a/core/ast/printer.py b/core/ast/printer.py index c745221..859de7b 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -179,6 +179,15 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): self._mark_last() stmt.result.accept(self) + def visit_constraint_stmt(self, stmt: m.ConstraintStmt): + self._write_line("ConstraintStmt") + with self._child_level(): + self._write_line(f'name: "{stmt.name.lexeme}"') + self._write_line("constraint", last=True) + with self._child_level(): + self._mark_last() + stmt.constraint.accept(self) + def visit_type_expr(self, expr: m.TypeExpr): self._write_line("TypeExpr") with 
self._child_level(): diff --git a/parser/midas.py b/parser/midas.py index bcf1477..c056038 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -2,6 +2,7 @@ from typing import Optional from core.ast.midas import ( ConstraintExpr, + ConstraintStmt, OpStmt, PropertyStmt, Stmt, @@ -42,8 +43,8 @@ class MidasParser(Parser): return self.type_declaration() if self.match(TokenType.OP): return self.op_declaration() - # if self.match(TokenType.CONSTRAINT): - # return self.constraint_declaration() + if self.match(TokenType.CONSTRAINT): + return self.constraint_declaration() except ParsingError: self.synchronize() return None @@ -109,3 +110,9 @@ class MidasParser(Parser): self.consume(TokenType.GREATER, "Expected '>' after result type") return OpStmt(left=left, op=op, right=right, result=result) + + def constraint_declaration(self) -> ConstraintStmt: + name: Token = self.consume(TokenType.IDENTIFIER, "Expected constraint name") + self.consume(TokenType.EQUAL, "Expected '=' after constraint name") + constraint: ConstraintExpr = self.constraint_expr() + return ConstraintStmt(name=name, constraint=constraint) From 5831906f26deaae30e441ea391061caab2e27033 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 08:52:38 +0200 Subject: [PATCH 19/41] feat(parser): add documentation to annotation parser --- parser/annotations.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/parser/annotations.py b/parser/annotations.py index 4b1228d..dcd3a32 100644 --- a/parser/annotations.py +++ b/parser/annotations.py @@ -7,6 +7,8 @@ from parser.errors import ParsingError class AnnotationParser(Parser): + """A simple parser for custom type annotations""" + SYNC_BOUNDARY: set[TokenType] = set() def parse(self) -> Optional[Expr]: @@ -16,6 +18,11 @@ class AnnotationParser(Parser): return expression def synchronize(self): + """Skip tokens until a synchronization boundary is found + + This method allows gracefully recovering from a 
parse error + to a safe place and continue parsing + """ self.advance() while not self.is_at_end(): if self.peek().type in self.SYNC_BOUNDARY: @@ -23,6 +30,13 @@ class AnnotationParser(Parser): self.advance() def annotation(self) -> Optional[Expr]: + """Try and parse an annotation + + Any parsing error is caught and None is returned + + Returns: + Optional[Expr]: the parsed annotation expression, or None if a ParsingError was raised + """ try: return self.type() except ParsingError: @@ -30,6 +44,13 @@ class AnnotationParser(Parser): return None def type(self) -> TypeExpr: + """Parse a type definition + + `Type` or `Type[Schema]` + + Returns: + TypeExpr: the parsed type expression + """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected type identifier") schema: Optional[SchemaExpr] = None if self.match(TokenType.LEFT_BRACKET): @@ -37,6 +58,13 @@ class AnnotationParser(Parser): return TypeExpr(name=name, schema=schema) def schema(self) -> SchemaExpr: + """Parse a schema definition + + A comma separated list of schema elements + + Returns: + SchemaExpr: the parsed schema expression + """ left: Token = self.previous() elements: list[Expr] = [] while not self.check(TokenType.RIGHT_BRACKET) and not self.is_at_end(): @@ -47,7 +75,15 @@ class AnnotationParser(Parser): right: Token = self.consume(TokenType.RIGHT_BRACKET, "Unclosed schema") return SchemaExpr(left=left, elements=elements, right=right) - def schema_element(self) -> Expr: + def schema_element(self) -> SchemaElementExpr: + """Parse a schema element + + An anonymous element (`_`), a type, an untyped named column (`name: _`), + or a named column (`name: Type`) + + Returns: + SchemaElementExpr: the parsed schema element expression + """ if self.match(TokenType.UNDERSCORE): return SchemaElementExpr(name=None, type=None) From 539084f6d8007d683ae51b4e90b6e9a71fd58427 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 09:17:57 +0200 Subject: [PATCH 20/41] feat(parser): add documentation to 
Midas parser --- parser/midas.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/parser/midas.py b/parser/midas.py index c056038..bf7c1d7 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -16,6 +16,8 @@ from parser.errors import ParsingError class MidasParser(Parser): + """A simple parser for midas type definitions""" + SYNC_BOUNDARY: set[TokenType] = {TokenType.TYPE, TokenType.OP, TokenType.CONSTRAINT} def parse(self) -> list[Stmt]: @@ -29,6 +31,11 @@ class MidasParser(Parser): return statements def synchronize(self): + """Skip tokens until a synchronization boundary is found + + This method allows gracefully recovering from a parse error + to a safe place and continue parsing + """ self.advance() while not self.is_at_end(): if self.previous().type == TokenType.NEWLINE: @@ -38,6 +45,13 @@ class MidasParser(Parser): self.advance() def declaration(self) -> Optional[Stmt]: + """Try and parse a declaration + + Any parsing error is caught and None is returned + + Returns: + Optional[Stmt]: the parsed Midas statement, or None if a ParsingError was raised + """ try: if self.match(TokenType.TYPE): return self.type_declaration() @@ -50,6 +64,13 @@ class MidasParser(Parser): return None def type_declaration(self) -> TypeStmt: + """Parse a type declaration + + A type declaration is written `type Name` optionally followed by a brace-wrapped body + + Returns: + TypeStmt: the parsed type declaration statement + """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") self.consume(TokenType.LESS, "Expected '<' after type name") bases: list[TypeExpr] = [] @@ -66,6 +87,11 @@ class MidasParser(Parser): return TypeStmt(name=name, bases=bases, body=body) def type_expr(self) -> TypeExpr: + """Parse a type expression + + Returns: + TypeExpr: the parsed type expression + """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") constraints: list[ConstraintExpr] = [] @@ -75,10 +101,23 @@ class 
MidasParser(Parser): return TypeExpr(name=name, constraints=constraints) def constraint_expr(self) -> ConstraintExpr: + """Parse a type constraint + + Returns: + ConstraintExpr: the parsed type constraint expression + """ # TODO return ConstraintExpr() def type_body_expr(self) -> TypeBodyExpr: + """Parse a type definition body + + A type definition body is a set of whitespace-separated + property statements enclosed in curly braces + + Returns: + TypeBodyExpr: the parsed type body expression + """ self.consume(TokenType.LEFT_BRACE, "Expected '{' to start type body") properties: list[PropertyStmt] = [] while not self.check(TokenType.RIGHT_BRACE) and not self.is_at_end(): @@ -87,12 +126,26 @@ class MidasParser(Parser): return TypeBodyExpr(properties=properties) def property_stmt(self) -> PropertyStmt: + """Parse a property statement + + A type property statement is written `name: Type` + + Returns: + PropertyStmt: the parsed property statement + """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected property name") self.consume(TokenType.COLON, "Expected ':' after property name") type: TypeExpr = self.type_expr() return PropertyStmt(name=name, type=type) def op_declaration(self) -> OpStmt: + """Parse an operation definition + + An operation is written `op operator = ` where `operator` can be any single token + + Returns: + OpStmt: the parsed operation statement + """ self.consume(TokenType.LESS, "Expected '<' before first type") left: TypeExpr = self.type_expr() self.consume(TokenType.GREATER, "Expected '>' after first type") @@ -112,6 +165,13 @@ class MidasParser(Parser): return OpStmt(left=left, op=op, right=right, result=result) def constraint_declaration(self) -> ConstraintStmt: + """Parse a type constraint declaration + + A constraint is written `constraint Name = constraint_expression` + + Returns: + ConstraintStmt: the parsed constraint declaration statement + """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected constraint name") 
self.consume(TokenType.EQUAL, "Expected '=' after constraint name") constraint: ConstraintExpr = self.constraint_expr() From 4fe495620bb4e6b86e2bbd4707acd3d17dfcead6 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 09:51:17 +0200 Subject: [PATCH 21/41] feat: add annotations EBNF --- syntax/annotations.ebnf | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 syntax/annotations.ebnf diff --git a/syntax/annotations.ebnf b/syntax/annotations.ebnf new file mode 100644 index 0000000..73caf4f --- /dev/null +++ b/syntax/annotations.ebnf @@ -0,0 +1,20 @@ +identifier ::= '[a-zA-Z][a-zA-Z_]*' + +integer ::= '\d+' +number ::= integer ["." integer] +boolean ::= "False" | "True" +none ::= "None" + +value ::= number | boolean | none +lambda-value ::= "_" | value +lambda-operator ::= ">" | "<" | ">=" | "<=" | "==" | "!=" +lambda ::= lambda-value lambda-operator lambda-value + +constraint ::= identifier | "(" lambda ")" +base-type ::= identifier +type ::= base-type { "+" constraint } + +column-type ::= type | "_" +column-def ::= [ identifier ":" ] column-type + +frame-def ::= column-def { "," column-def } From 903179832e08013421e1e1ef76d4b980d4eba81e Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 10:02:10 +0200 Subject: [PATCH 22/41] feat: add annotations railroad diagrams --- syntax/annotations.typ | 74 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 syntax/annotations.typ diff --git a/syntax/annotations.typ b/syntax/annotations.typ new file mode 100644 index 0000000..8909c80 --- /dev/null +++ b/syntax/annotations.typ @@ -0,0 +1,74 @@ +#import "@preview/fervojo:0.1.1": render + +#let value = ``` +{[`value` < + [`number` 'digit' * ! 
], + [`boolean` <"False", "True">], + [`none` "None"] +>]} +``` + +#let constraint = ``` +{[`constraint` "(" <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'> ")"]} +``` + +#let type-with-constraints = ``` +{[`type-with-constraints` 'identifier' ]} +``` + +#let column-def = ``` +{[`column-def` <"_", 'type-with-constraints'>]} +``` + +#let frame-def = ``` +{[`frame-def` 'column-def' * ","]} +``` + +#let annotation = ``` +{[`annotation` 'identifier' ]} +``` + +#let rules = ( + value, + constraint, + type-with-constraints, + column-def, + frame-def, + annotation, +) + +#set text(font: "Source Sans 3") + += Type annotation syntax + +#for rule in rules { + render(rule) +} + +/* +#let by-name = ( + annotation: annotation, + frame-def: frame-def, + column-def: column-def, + type-with-constraints: type-with-constraints, + constraint: constraint, + value: value, +) + +#let substitute(base-rule) = { + let new-rule = base-rule + for (key, rule) in by-name.pairs() { + new-rule = new-rule.replace("'" + key + "'", rule.text.slice(1, -1)) + } + if new-rule != base-rule { + new-rule = substitute(new-rule) + } + return new-rule +} + +#let combined = raw(substitute(annotation.text)) + + +#set page(flipped: true) +#render(combined) +*/ \ No newline at end of file From a3ba0ef35da04266285c084f77f2c9373c1eb31b Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 10:23:45 +0200 Subject: [PATCH 23/41] feat: add Midas EBNF --- syntax/midas.ebnf | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 syntax/midas.ebnf diff --git a/syntax/midas.ebnf b/syntax/midas.ebnf new file mode 100644 index 0000000..71b4740 --- /dev/null +++ b/syntax/midas.ebnf @@ -0,0 +1,26 @@ +identifier ::= '[a-zA-Z][a-zA-Z_]*' + +integer ::= '\d+' +number ::= integer ["." 
integer] +boolean ::= "False" | "True" +none ::= "None" + +value ::= number | boolean | none +lambda-value ::= "_" | value +lambda-operator ::= ">" | "<" | ">=" | "<=" | "==" | "!=" +lambda ::= lambda-value lambda-operator lambda-value + +constraint ::= identifier | "(" lambda ")" +base-type ::= identifier +type ::= base-type { "+" constraint } + +type-property ::= 'identifier' ":" 'type' +type-body ::= "{" { 'type-property' } "}" + +operation-type ::= "<" 'type' ">" + +type-statement ::= "type" 'identifier' "<" 'type' {"," 'type'} ">" ['type-body'] +operation-statement ::= "op" 'operation-type' 'operator' 'operation-type' "=" 'operation-type' +constraint-statement ::= "constraint" 'identifier' "=" 'lambda' + +statement ::= type-statement | operation-statement | constraint-statement \ No newline at end of file From 64d96bd94ef4cf01773156ff52a731a7d4fdf742 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 10:23:53 +0200 Subject: [PATCH 24/41] feat: add Midas railroad diagrams --- syntax/annotations.typ | 4 +- syntax/midas.typ | 97 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 syntax/midas.typ diff --git a/syntax/annotations.typ b/syntax/annotations.typ index 8909c80..8c66031 100644 --- a/syntax/annotations.typ +++ b/syntax/annotations.typ @@ -9,11 +9,11 @@ ``` #let constraint = ``` -{[`constraint` "(" <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'> ")"]} +{[`constraint` <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'>]} ``` #let type-with-constraints = ``` -{[`type-with-constraints` 'identifier' ]} +{[`type-with-constraints` 'identifier' ]} ``` #let column-def = ``` diff --git a/syntax/midas.typ b/syntax/midas.typ new file mode 100644 index 0000000..17db6d6 --- /dev/null +++ b/syntax/midas.typ @@ -0,0 +1,97 @@ +#import "@preview/fervojo:0.1.1": render + +#let value = ``` +{[`value` < + [`number` 'digit' * ! 
], + [`boolean` <"False", "True">], + [`none` "None"] +>]} +``` + +#let constraint = ``` +{[`constraint` <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'>]} +``` + +#let type-with-constraints = ``` +{[`type-with-constraints` 'identifier' ]} +``` + +#let type-property = ``` +{[`type-property` 'identifier' ":" 'type-with-constraints']} +``` + +#let type-body = ``` +{[`type-body` "{" "}"]} +``` + +#let operation-type = ``` +{[`operation-type` "<" 'type-with-constraints' ">"]} +``` + +#let type-statement = ``` +{[`type-statement` "type" 'identifier' "<" 'type-with-constraints'*"," ">" ]} +``` + +#let operation-statement = ``` +{[`operation-statement` "op" 'operation-type' "operator" 'operation-type' "=" 'operation-type']} +``` + +#let constraint-statement = ``` +{[`constraint-statement` "constraint" 'identifier' "=" 'constraint']} +``` + +#let statement = ``` +{[`statement` <'type-statement', 'operation-statement', 'constraint-statement'>]} +``` + +#let rules = ( + value, + constraint, + type-with-constraints, + type-property, + type-body, + operation-type, + type-statement, + operation-statement, + constraint-statement, + statement, +) + +#set text(font: "Source Sans 3") + += Midas type definition syntax + +#for rule in rules { + render(rule) +} + +/* +#let by-name = ( + value: value, + constraint: constraint, + type-with-constraints: type-with-constraints, + type-property: type-property, + type-body: type-body, + operation-type: operation-type, + type-statement: type-statement, + operation-statement: operation-statement, + constraint-statement: constraint-statement, +) + +#let substitute(base-rule) = { + let new-rule = base-rule + for (key, rule) in by-name.pairs() { + new-rule = new-rule.replace("'" + key + "'", rule.text.slice(1, -1)) + } + if new-rule != base-rule { + new-rule = substitute(new-rule) + } + return new-rule.replace(regex("`.*?`"), "") +} + +#let combined = raw(substitute(statement.text)) + + +#set page(flipped: true) +#render(combined) 
+*/ \ No newline at end of file From cbf0f2852ebf36f7434751ad916516228f79c643 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 11:01:39 +0200 Subject: [PATCH 25/41] feat(parser): add AnnotationStmt and ConstraintExpr --- core/ast/annotations.py | 30 ++++++++++++++++- parser/annotations.py | 72 ++++++++++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 24 deletions(-) diff --git a/core/ast/annotations.py b/core/ast/annotations.py index 78a7ce6..c5dce93 100644 --- a/core/ast/annotations.py +++ b/core/ast/annotations.py @@ -9,6 +9,25 @@ from lexer.token import Token T = TypeVar("T") +@dataclass(frozen=True) +class Stmt(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_annotation_stmt(self, stmt: AnnotationStmt) -> T: ... + + +@dataclass(frozen=True) +class AnnotationStmt(Stmt): + name: Token + schema: Optional[SchemaExpr] + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_annotation_stmt(self) + + @dataclass(frozen=True) class Expr(ABC): @abstractmethod @@ -17,6 +36,9 @@ class Expr(ABC): class Visitor(ABC, Generic[T]): @abstractmethod def visit_type_expr(self, expr: TypeExpr) -> T: ... + + @abstractmethod + def visit_constraint_expr(self, expr: ConstraintExpr) -> T: ... @abstractmethod def visit_schema_expr(self, expr: SchemaExpr) -> T: ... 
@@ -28,12 +50,18 @@ class Expr(ABC): @dataclass(frozen=True) class TypeExpr(Expr): name: Token - schema: Optional[SchemaExpr] + constraints: list[ConstraintExpr] def accept(self, visitor: Expr.Visitor[T]) -> T: return visitor.visit_type_expr(self) +@dataclass(frozen=True) +class ConstraintExpr(Expr): + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_constraint_expr(self) + + @dataclass(frozen=True) class SchemaExpr(Expr): left: Token diff --git a/parser/annotations.py b/parser/annotations.py index dcd3a32..e90cf43 100644 --- a/parser/annotations.py +++ b/parser/annotations.py @@ -1,6 +1,14 @@ from typing import Optional -from core.ast.annotations import Expr, SchemaElementExpr, SchemaExpr, TypeExpr +from core.ast.annotations import ( + AnnotationStmt, + ConstraintExpr, + Expr, + SchemaElementExpr, + SchemaExpr, + Stmt, + TypeExpr, +) from lexer.token import Token, TokenType from parser.base import Parser from parser.errors import ParsingError @@ -11,11 +19,15 @@ class AnnotationParser(Parser): SYNC_BOUNDARY: set[TokenType] = set() - def parse(self) -> Optional[Expr]: - expression: Optional[Expr] = self.annotation() + def parse(self) -> Optional[Stmt]: + stmt: Optional[Stmt] = None + try: + stmt = self.annotation() + except ParsingError: + self.synchronize() if not self.is_at_end(): self.error(self.peek(), "Extra tokens") - return expression + return stmt def synchronize(self): """Skip tokens until a synchronization boundary is found @@ -29,33 +41,47 @@ class AnnotationParser(Parser): return self.advance() - def annotation(self) -> Optional[Expr]: - """Try and parse an annotation + def annotation(self) -> AnnotationStmt: + """Parse an annotation - Any parsing error is caught and None is returned + An annotation is written as `Type` or `Type[Schema]` Returns: - Optional[Expr]: the parsed annotation expression, or None if a ParsingError was raised + AnnotationStmt: the parsed annotation statement """ - try: - return self.type() - except 
ParsingError: - self.synchronize() - return None - def type(self) -> TypeExpr: - """Parse a type definition - - `Type` or `Type[Schema]` - - Returns: - TypeExpr: the parsed type expression - """ name: Token = self.consume(TokenType.IDENTIFIER, "Expected type identifier") schema: Optional[SchemaExpr] = None if self.match(TokenType.LEFT_BRACKET): schema = self.schema() - return TypeExpr(name=name, schema=schema) + return AnnotationStmt(name=name, schema=schema) + + def type_expr(self) -> TypeExpr: + """Parse a type expression + + Returns: + TypeExpr: the parsed type expression + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + constraints: list[ConstraintExpr] = [] + + while not self.is_at_end() and self.match(TokenType.PLUS): + print(self.peek()) + print(self.tokens) + self.consume(TokenType.LEFT_PAREN, "Expected '(' before type constraint") + constraints.append(self.constraint_expr()) + self.consume(TokenType.RIGHT_PAREN, "Expected ')' after type constraint") + + return TypeExpr(name=name, constraints=constraints) + + def constraint_expr(self) -> ConstraintExpr: + """Parse a type constraint + + Returns: + ConstraintExpr: the parsed type constraint expression + """ + # TODO + return ConstraintExpr() def schema(self) -> SchemaExpr: """Parse a schema definition @@ -96,5 +122,5 @@ class AnnotationParser(Parser): name = self.advance() self.advance() if not self.match(TokenType.UNDERSCORE): - type = self.type() + type = self.type_expr() return SchemaElementExpr(name=name, type=type) From 340bcc65fda99a9f55e1dca715d35a373f71099d Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 11:02:06 +0200 Subject: [PATCH 26/41] feat(parser): update annotation printers --- core/ast/printer.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/core/ast/printer.py b/core/ast/printer.py index 859de7b..2b338c6 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -84,12 +84,28 
@@ class AstPrinter(Generic[T]): child.accept(self) -class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None]): +class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None]): + def visit_annotation_stmt(self, stmt: a.AnnotationStmt) -> None: + self._write_line("AnnotationStmt") + with self._child_level(): + self._write_line(f'name: "{stmt.name.lexeme}"') + self._write_optional_child("schema", stmt.schema, last=True) + def visit_type_expr(self, expr: a.TypeExpr): self._write_line("TypeExpr") with self._child_level(): self._write_line(f'name: "{expr.name.lexeme}"') - self._write_optional_child("schema", expr.schema, last=True) + self._write_line("constraints", last=True) + with self._child_level(): + for i, constraint in enumerate(expr.constraints): + self._idx = i + if i == len(expr.constraints) - 1: + self._mark_last() + constraint.accept(self) + + def visit_constraint_expr(self, expr: a.ConstraintExpr) -> None: + self._write_line("ConstraintExpr") + # TODO def visit_schema_expr(self, expr: a.SchemaExpr): self._write_line("SchemaExpr") @@ -108,15 +124,25 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None]): self._write_optional_child("type", expr.type, last=True) -class AnnotationPrinter(a.Expr.Visitor[str]): - def print(self, expr: a.Expr): +class AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): + def print(self, expr: a.Expr | a.Stmt): return expr.accept(self) - def visit_type_expr(self, expr: a.TypeExpr) -> str: + def visit_annotation_stmt(self, stmt: a.AnnotationStmt) -> str: schema: str = "" - if expr.schema is not None: - schema = expr.schema.accept(self) - return f"{expr.name.lexeme}{schema}" + if stmt.schema is not None: + schema = stmt.schema.accept(self) + return f"{stmt.name.lexeme}{schema}" + + def visit_type_expr(self, expr: a.TypeExpr) -> str: + parts: list[str] = [expr.name.lexeme] + for constraint in expr.constraints: + parts.append("(" + constraint.accept(self) + ")") + return " + ".join(parts) + + 
def visit_constraint_expr(self, expr: a.ConstraintExpr) -> str: + # TODO + return "" def visit_schema_expr(self, expr: a.SchemaExpr) -> str: res: str = expr.left.lexeme From be50a8db3512c30ed18ffe0ec5150ef2e674ac79 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 11:02:48 +0200 Subject: [PATCH 27/41] feat(parser): add operators to annotations lexer --- lexer/annotations.py | 17 +++++++++++++++++ lexer/midas.py | 5 +++++ lexer/token.py | 1 + 3 files changed, 23 insertions(+) diff --git a/lexer/annotations.py b/lexer/annotations.py index 3cc0431..f72f81a 100644 --- a/lexer/annotations.py +++ b/lexer/annotations.py @@ -14,6 +14,23 @@ class AnnotationLexer(Lexer): self.add_token(TokenType.LEFT_BRACKET) case "]": self.add_token(TokenType.RIGHT_BRACKET) + case "<": + self.add_token( + TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS + ) + case ">": + self.add_token( + TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER + ) + case "=": + self.add_token( + TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL + ) + case "!": + if self.peek() == "=": + self.add_token(TokenType.BANG_EQUAL) + else: + self.error("Unexpected single bang. Did you mean '!=' ?") case ":": self.add_token(TokenType.COLON) case ",": diff --git a/lexer/midas.py b/lexer/midas.py index 16440da..86bfafe 100644 --- a/lexer/midas.py +++ b/lexer/midas.py @@ -31,6 +31,11 @@ class MidasLexer(Lexer): self.add_token( TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL ) + case "!": + if self.peek() == "=": + self.add_token(TokenType.BANG_EQUAL) + else: + self.error("Unexpected single bang. 
Did you mean '!=' ?") case ":": self.add_token(TokenType.COLON) case ",": diff --git a/lexer/token.py b/lexer/token.py index 9b5bc13..70a7a1b 100644 --- a/lexer/token.py +++ b/lexer/token.py @@ -28,6 +28,7 @@ class TokenType(Enum): LESS_EQUAL = auto() EQUAL = auto() EQUAL_EQUAL = auto() + BANG_EQUAL = auto() # Literals IDENTIFIER = auto() From 8bc091851702d676a2360dcbdd771805f9d57bf6 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 11:27:52 +0200 Subject: [PATCH 28/41] feat(parser): parse annotation type constraints --- core/ast/annotations.py | 30 ++++++++++++++++++++++++++++-- core/ast/printer.py | 35 ++++++++++++++++++++++++++++++++--- parser/annotations.py | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 90 insertions(+), 7 deletions(-) diff --git a/core/ast/annotations.py b/core/ast/annotations.py index c5dce93..a885e29 100644 --- a/core/ast/annotations.py +++ b/core/ast/annotations.py @@ -2,7 +2,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Generic, Optional, TypeVar +from typing import Any, Generic, Optional, TypeVar from lexer.token import Token @@ -34,9 +34,15 @@ class Expr(ABC): def accept(self, visitor: Visitor[T]) -> T: ... class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_wildcard_expr(self, expr: WildcardExpr) -> T: ... + + @abstractmethod + def visit_literal_expr(self, expr: LiteralExpr) -> T: ... + @abstractmethod def visit_type_expr(self, expr: TypeExpr) -> T: ... - + @abstractmethod def visit_constraint_expr(self, expr: ConstraintExpr) -> T: ... @@ -47,6 +53,22 @@ class Expr(ABC): def visit_schema_element_expr(self, expr: SchemaElementExpr) -> T: ... 
+@dataclass(frozen=True) +class WildcardExpr(Expr): + token: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_wildcard_expr(self) + + +@dataclass(frozen=True) +class LiteralExpr(Expr): + value: Any + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_literal_expr(self) + + @dataclass(frozen=True) class TypeExpr(Expr): name: Token @@ -58,6 +80,10 @@ class TypeExpr(Expr): @dataclass(frozen=True) class ConstraintExpr(Expr): + left: Expr + op: Token + right: Expr + def accept(self, visitor: Expr.Visitor[T]) -> T: return visitor.visit_constraint_expr(self) diff --git a/core/ast/printer.py b/core/ast/printer.py index 2b338c6..4fdc7fb 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -105,7 +105,18 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None def visit_constraint_expr(self, expr: a.ConstraintExpr) -> None: self._write_line("ConstraintExpr") - # TODO + with self._child_level(): + self._write_line("left") + with self._child_level(): + self._mark_last() + expr.left.accept(self) + + self._write_line(f"operator: {expr.op.lexeme}") + + self._write_line("right", last=True) + with self._child_level(): + self._mark_last() + expr.right.accept(self) def visit_schema_expr(self, expr: a.SchemaExpr): self._write_line("SchemaExpr") @@ -122,6 +133,14 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None name_text: str = "None" if expr.name is None else f'"{expr.name.lexeme}"' self._write_line(f"name: {name_text}") self._write_optional_child("type", expr.type, last=True) + + def visit_wildcard_expr(self, expr: a.WildcardExpr) -> None: + self._write_line("WildcardExpr") + + def visit_literal_expr(self, expr: a.LiteralExpr) -> None: + self._write_line("LiteralExpr") + with self._child_level(): + self._write_line(f'value: {expr.value}', last=True) class AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): @@ -141,8 +160,12 @@ class 
AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): return " + ".join(parts) def visit_constraint_expr(self, expr: a.ConstraintExpr) -> str: - # TODO - return "" + parts: list[str] = [ + expr.left.accept(self), + expr.op.lexeme, + expr.right.accept(self) + ] + return " ".join(parts) def visit_schema_expr(self, expr: a.SchemaExpr) -> str: res: str = expr.left.lexeme @@ -161,6 +184,12 @@ class AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): parts.append(expr.type.accept(self)) return ": ".join(parts) + def visit_wildcard_expr(self, expr: a.WildcardExpr) -> str: + return "_" + + def visit_literal_expr(self, expr: a.LiteralExpr) -> str: + return str(expr.value) + class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): def visit_type_stmt(self, stmt: m.TypeStmt): diff --git a/parser/annotations.py b/parser/annotations.py index e90cf43..5b75762 100644 --- a/parser/annotations.py +++ b/parser/annotations.py @@ -4,10 +4,12 @@ from core.ast.annotations import ( AnnotationStmt, ConstraintExpr, Expr, + LiteralExpr, SchemaElementExpr, SchemaExpr, Stmt, TypeExpr, + WildcardExpr, ) from lexer.token import Token, TokenType from parser.base import Parser @@ -80,8 +82,34 @@ class AnnotationParser(Parser): Returns: ConstraintExpr: the parsed type constraint expression """ - # TODO - return ConstraintExpr() + + left: Expr = self.constraint_value() + op: Token = self.constraint_operator() + right: Expr = self.constraint_value() + return ConstraintExpr(left=left, op=op, right=right) + + def constraint_value(self) -> Expr: + if self.match(TokenType.UNDERSCORE): + return WildcardExpr(self.previous()) + return self.literal() + + def literal(self) -> LiteralExpr: + if self.match(TokenType.FALSE): + return LiteralExpr(False) + if self.match(TokenType.TRUE): + return LiteralExpr(True) + if self.match(TokenType.NONE): + return LiteralExpr(None) + + if self.match(TokenType.NUMBER): + return LiteralExpr(self.previous().value) + + raise 
self.error(self.peek(), "Expected literal") + + def constraint_operator(self) -> Token: + if self.match(TokenType.LESS, TokenType.LESS_EQUAL, TokenType.GREATER, TokenType.GREATER_EQUAL, TokenType.EQUAL_EQUAL, TokenType.BANG_EQUAL): + return self.previous() + raise self.error(self.peek(), "Expected constraint operator") def schema(self) -> SchemaExpr: """Parse a schema definition From e10d71a66b336a3385b9b7596725b4278de60ff1 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 11:33:33 +0200 Subject: [PATCH 29/41] feat(parser): split annotation and Midas keywords --- lexer/annotations.py | 6 +++++- lexer/keyword.py | 9 ++++++++- lexer/midas.py | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lexer/annotations.py b/lexer/annotations.py index f72f81a..3ee18af 100644 --- a/lexer/annotations.py +++ b/lexer/annotations.py @@ -1,4 +1,5 @@ from lexer.base import Lexer +from lexer.keyword import ANNOTATION_KEYWORDS from lexer.token import TokenType @@ -86,7 +87,10 @@ class AnnotationLexer(Lexer): """ while self.peek().isalnum() or self.peek() == "_": self.advance() - self.add_token(TokenType.IDENTIFIER) + + lexeme: str = self.source[self.start : self.idx] + token_type: TokenType = ANNOTATION_KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + self.add_token(token_type) def scan_comment(self): """Scan the rest of a comment and add it as a token diff --git a/lexer/keyword.py b/lexer/keyword.py index a4f03cf..b66f21a 100644 --- a/lexer/keyword.py +++ b/lexer/keyword.py @@ -1,9 +1,16 @@ from lexer.token import TokenType -KEYWORDS: dict[str, TokenType] = { +ANNOTATION_KEYWORDS: dict[str, TokenType] = { + "True": TokenType.TRUE, + "False": TokenType.FALSE, + "None": TokenType.NONE, +} + +MIDAS_KEYWORDS: dict[str, TokenType] = { "type": TokenType.TYPE, "op": TokenType.OP, "constraint": TokenType.CONSTRAINT, "true": TokenType.TRUE, "false": TokenType.FALSE, + "none": TokenType.NONE, } diff --git a/lexer/midas.py b/lexer/midas.py index 86bfafe..42d8e6b 
100644 --- a/lexer/midas.py +++ b/lexer/midas.py @@ -1,5 +1,5 @@ from lexer.base import Lexer -from lexer.keyword import KEYWORDS +from lexer.keyword import MIDAS_KEYWORDS from lexer.token import TokenType @@ -102,7 +102,7 @@ class MidasLexer(Lexer): self.advance() lexeme: str = self.source[self.start : self.idx] - token_type: TokenType = KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + token_type: TokenType = MIDAS_KEYWORDS.get(lexeme, TokenType.IDENTIFIER) self.add_token(token_type) def scan_comment(self): From 74ac9c5381f3a7bb69d4f07c24e66fa99084729a Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 12:18:41 +0200 Subject: [PATCH 30/41] feat(parser): parse Midas type constraints --- core/ast/midas.py | 28 +++++++++++++++++++++++++++- core/ast/printer.py | 36 ++++++++++++++++++++++++++++-------- parser/annotations.py | 2 -- parser/midas.py | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 13 deletions(-) diff --git a/core/ast/midas.py b/core/ast/midas.py index 6d1b035..4f2b03f 100644 --- a/core/ast/midas.py +++ b/core/ast/midas.py @@ -2,7 +2,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Generic, Optional, TypeVar +from typing import Any, Generic, Optional, TypeVar from lexer.token import Token @@ -79,6 +79,12 @@ class Expr(ABC): def accept(self, visitor: Visitor[T]) -> T: ... class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_wildcard_expr(self, expr: WildcardExpr) -> T: ... + + @abstractmethod + def visit_literal_expr(self, expr: LiteralExpr) -> T: ... + @abstractmethod def visit_type_expr(self, expr: TypeExpr) -> T: ... @@ -89,6 +95,22 @@ class Expr(ABC): def visit_type_body_expr(self, expr: TypeBodyExpr) -> T: ... 
+@dataclass(frozen=True) +class WildcardExpr(Expr): + token: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_wildcard_expr(self) + + +@dataclass(frozen=True) +class LiteralExpr(Expr): + value: Any + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_literal_expr(self) + + @dataclass(frozen=True) class TypeExpr(Expr): name: Token @@ -100,6 +122,10 @@ class TypeExpr(Expr): @dataclass(frozen=True) class ConstraintExpr(Expr): + left: Expr + op: Token + right: Expr + def accept(self, visitor: Expr.Visitor[T]) -> T: return visitor.visit_constraint_expr(self) diff --git a/core/ast/printer.py b/core/ast/printer.py index 4fdc7fb..1fd48de 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -102,7 +102,7 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None if i == len(expr.constraints) - 1: self._mark_last() constraint.accept(self) - + def visit_constraint_expr(self, expr: a.ConstraintExpr) -> None: self._write_line("ConstraintExpr") with self._child_level(): @@ -110,9 +110,9 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None with self._child_level(): self._mark_last() expr.left.accept(self) - + self._write_line(f"operator: {expr.op.lexeme}") - + self._write_line("right", last=True) with self._child_level(): self._mark_last() @@ -133,14 +133,14 @@ class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None name_text: str = "None" if expr.name is None else f'"{expr.name.lexeme}"' self._write_line(f"name: {name_text}") self._write_optional_child("type", expr.type, last=True) - + def visit_wildcard_expr(self, expr: a.WildcardExpr) -> None: self._write_line("WildcardExpr") - + def visit_literal_expr(self, expr: a.LiteralExpr) -> None: self._write_line("LiteralExpr") with self._child_level(): - self._write_line(f'value: {expr.value}', last=True) + self._write_line(f"value: {expr.value}", last=True) class 
AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): @@ -158,12 +158,12 @@ class AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]): for constraint in expr.constraints: parts.append("(" + constraint.accept(self) + ")") return " + ".join(parts) - + def visit_constraint_expr(self, expr: a.ConstraintExpr) -> str: parts: list[str] = [ expr.left.accept(self), expr.op.lexeme, - expr.right.accept(self) + expr.right.accept(self), ] return " ".join(parts) @@ -257,6 +257,18 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): def visit_constraint_expr(self, expr: m.ConstraintExpr): self._write_line("ConstraintExpr") + with self._child_level(): + self._write_line("left") + with self._child_level(): + self._mark_last() + expr.left.accept(self) + + self._write_line(f"operator: {expr.op.lexeme}") + + self._write_line("right", last=True) + with self._child_level(): + self._mark_last() + expr.right.accept(self) def visit_type_body_expr(self, expr: m.TypeBodyExpr): self._write_line("TypeBodyExpr") @@ -268,3 +280,11 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): if i == len(expr.properties) - 1: self._mark_last() property.accept(self) + + def visit_wildcard_expr(self, expr: m.WildcardExpr) -> None: + self._write_line("WildcardExpr") + + def visit_literal_expr(self, expr: m.LiteralExpr) -> None: + self._write_line("LiteralExpr") + with self._child_level(): + self._write_line(f"value: {expr.value}", last=True) diff --git a/parser/annotations.py b/parser/annotations.py index 5b75762..0bf99d6 100644 --- a/parser/annotations.py +++ b/parser/annotations.py @@ -68,8 +68,6 @@ class AnnotationParser(Parser): constraints: list[ConstraintExpr] = [] while not self.is_at_end() and self.match(TokenType.PLUS): - print(self.peek()) - print(self.tokens) self.consume(TokenType.LEFT_PAREN, "Expected '(' before type constraint") constraints.append(self.constraint_expr()) self.consume(TokenType.RIGHT_PAREN, "Expected ')' 
after type constraint") diff --git a/parser/midas.py b/parser/midas.py index bf7c1d7..631e51c 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -3,12 +3,15 @@ from typing import Optional from core.ast.midas import ( ConstraintExpr, ConstraintStmt, + Expr, + LiteralExpr, OpStmt, PropertyStmt, Stmt, TypeBodyExpr, TypeExpr, TypeStmt, + WildcardExpr, ) from lexer.token import Token, TokenType from parser.base import Parser @@ -96,7 +99,9 @@ class MidasParser(Parser): constraints: list[ConstraintExpr] = [] while not self.is_at_end() and self.match(TokenType.PLUS): + self.consume(TokenType.LEFT_PAREN, "Expected '(' before type constraint") constraints.append(self.constraint_expr()) + self.consume(TokenType.RIGHT_PAREN, "Expected ')' after type constraint") return TypeExpr(name=name, constraints=constraints) @@ -106,8 +111,34 @@ class MidasParser(Parser): Returns: ConstraintExpr: the parsed type constraint expression """ - # TODO - return ConstraintExpr() + + left: Expr = self.constraint_value() + op: Token = self.constraint_operator() + right: Expr = self.constraint_value() + return ConstraintExpr(left=left, op=op, right=right) + + def constraint_value(self) -> Expr: + if self.match(TokenType.UNDERSCORE): + return WildcardExpr(self.previous()) + return self.literal() + + def literal(self) -> LiteralExpr: + if self.match(TokenType.FALSE): + return LiteralExpr(False) + if self.match(TokenType.TRUE): + return LiteralExpr(True) + if self.match(TokenType.NONE): + return LiteralExpr(None) + + if self.match(TokenType.NUMBER): + return LiteralExpr(self.previous().value) + + raise self.error(self.peek(), "Expected literal") + + def constraint_operator(self) -> Token: + if self.match(TokenType.LESS, TokenType.LESS_EQUAL, TokenType.GREATER, TokenType.GREATER_EQUAL, TokenType.EQUAL_EQUAL, TokenType.BANG_EQUAL): + return self.previous() + raise self.error(self.peek(), "Expected constraint operator") def type_body_expr(self) -> TypeBodyExpr: """Parse a type definition body From 
3f199ff1342322c6ab2f319a487a22cec317f40f Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 12:43:05 +0200 Subject: [PATCH 31/41] feat(parser): add Midas pretty printer --- core/ast/printer.py | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/core/ast/printer.py b/core/ast/printer.py index 1fd48de..086c581 100644 --- a/core/ast/printer.py +++ b/core/ast/printer.py @@ -288,3 +288,73 @@ class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]): self._write_line("LiteralExpr") with self._child_level(): self._write_line(f"value: {expr.value}", last=True) + +class MidasPrinter(m.Expr.Visitor[str], m.Stmt.Visitor[str]): + def __init__(self, indent: int = 4): + self.indent: int = indent + self.level: int = 0 + + def indented(self, text: str) -> str: + return " " * (self.level * self.indent) + text + + def print(self, expr: m.Expr | m.Stmt): + self.level = 0 + return expr.accept(self) + + def visit_type_stmt(self, stmt: m.TypeStmt): + bases: list[str] = [ + b.accept(self) + for b in stmt.bases + ] + + res: str = self.indented(f"type {stmt.name.lexeme}<{', '.join(bases)}>") + if stmt.body is not None: + res += " {\n" + self.level += 1 + res += stmt.body.accept(self) + self.level -= 1 + res += "\n" + self.indented("}") + + return res + + def visit_property_stmt(self, stmt: m.PropertyStmt): + return f"{stmt.name.lexeme}: {stmt.type.accept(self)}" + + def visit_op_stmt(self, stmt: m.OpStmt): + left: str = stmt.left.accept(self) + op: str = stmt.op.lexeme + right: str = stmt.right.accept(self) + result: str = stmt.result.accept(self) + return self.indented(f"op <{left}> {op} <{right}> = <{result}>") + + def visit_constraint_stmt(self, stmt: m.ConstraintStmt): + name: str = stmt.name.lexeme + constraint: str = stmt.constraint.accept(self) + return self.indented(f"constraint {name} = {constraint}") + + def visit_type_expr(self, expr: m.TypeExpr): + parts: list[str] = [expr.name.lexeme] + for 
constraint in expr.constraints: + parts.append("(" + constraint.accept(self) + ")") + return " + ".join(parts) + + def visit_constraint_expr(self, expr: m.ConstraintExpr): + parts: list[str] = [ + expr.left.accept(self), + expr.op.lexeme, + expr.right.accept(self), + ] + return " ".join(parts) + + def visit_type_body_expr(self, expr: m.TypeBodyExpr): + properties: list[str] = [ + self.indented(prop.accept(self)) + for prop in expr.properties + ] + return "\n".join(properties) + + def visit_wildcard_expr(self, expr: m.WildcardExpr): + return "_" + + def visit_literal_expr(self, expr: m.LiteralExpr): + return str(expr.value) \ No newline at end of file From ee308fe22307effa41a27dcf7cc19448080ad235 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 12:47:21 +0200 Subject: [PATCH 32/41] fix(parser): update examples --- examples/00_syntax_prototype/01_simple_types.py | 2 +- examples/00_syntax_prototype/02_custom_types.midas | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/00_syntax_prototype/01_simple_types.py b/examples/00_syntax_prototype/01_simple_types.py index cfb10aa..725fdf4 100644 --- a/examples/00_syntax_prototype/01_simple_types.py +++ b/examples/00_syntax_prototype/01_simple_types.py @@ -7,7 +7,7 @@ from __future__ import annotations df: Frame[ verified: bool, birth_year: int, - height: float, + height: float + ( _ > 0 ) + ( _ < 250 ), name: str, date: datetime, float, # unnamed diff --git a/examples/00_syntax_prototype/02_custom_types.midas b/examples/00_syntax_prototype/02_custom_types.midas index 8248e16..017e40c 100644 --- a/examples/00_syntax_prototype/02_custom_types.midas +++ b/examples/00_syntax_prototype/02_custom_types.midas @@ -16,7 +16,7 @@ op - = op - = // Simple custom type with a constraint -type Age +type Age // Predefined custom constraints that can be referenced in other definitions constraint Positive = _ >= 0 From 453c72af6d2b225bea7d101618e196c1e23e6650 Mon Sep 17 00:00:00 2001 From: 
LordBaryhobal Date: Mon, 18 May 2026 13:11:00 +0200 Subject: [PATCH 33/41] tests(parser): add basic lexer test add a basic test for the annotation lexer to check punctuation tokens --- tests/lexer/test_annotation_lexer.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/lexer/test_annotation_lexer.py diff --git a/tests/lexer/test_annotation_lexer.py b/tests/lexer/test_annotation_lexer.py new file mode 100644 index 0000000..4d08f7f --- /dev/null +++ b/tests/lexer/test_annotation_lexer.py @@ -0,0 +1,26 @@ +import pytest + +from lexer.annotations import AnnotationLexer +from lexer.token import Token, TokenType + + +def scan(source: str) -> list[Token]: + return AnnotationLexer(source).process() + +def assert_n_tokens(tokens: list[Token], n: int): + assert len(tokens) == n + 1 + assert tokens[-1].type == TokenType.EOF + +@pytest.mark.parametrize("src,expected", [ + ("(", TokenType.LEFT_PAREN), + (")", TokenType.RIGHT_PAREN), + ("[", TokenType.LEFT_BRACKET), + ("]", TokenType.RIGHT_BRACKET), + (":", TokenType.COLON), + (",", TokenType.COMMA), + ("_", TokenType.UNDERSCORE), +]) +def test_punctuation(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected \ No newline at end of file From f41c7578384ec333ac163a0581e08cc9bf9cbc4d Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 13:21:23 +0200 Subject: [PATCH 34/41] tests(parser): complete simple annotation lexer tests --- tests/lexer/test_annotation_lexer.py | 85 ++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/tests/lexer/test_annotation_lexer.py b/tests/lexer/test_annotation_lexer.py index 4d08f7f..2e35d18 100644 --- a/tests/lexer/test_annotation_lexer.py +++ b/tests/lexer/test_annotation_lexer.py @@ -7,20 +7,85 @@ from lexer.token import Token, TokenType def scan(source: str) -> list[Token]: return AnnotationLexer(source).process() + def 
assert_n_tokens(tokens: list[Token], n: int): assert len(tokens) == n + 1 assert tokens[-1].type == TokenType.EOF -@pytest.mark.parametrize("src,expected", [ - ("(", TokenType.LEFT_PAREN), - (")", TokenType.RIGHT_PAREN), - ("[", TokenType.LEFT_BRACKET), - ("]", TokenType.RIGHT_BRACKET), - (":", TokenType.COLON), - (",", TokenType.COMMA), - ("_", TokenType.UNDERSCORE), -]) + +@pytest.mark.parametrize( + "src,expected", + [ + ("(", TokenType.LEFT_PAREN), + (")", TokenType.RIGHT_PAREN), + ("[", TokenType.LEFT_BRACKET), + ("]", TokenType.RIGHT_BRACKET), + (":", TokenType.COLON), + (",", TokenType.COMMA), + ("_", TokenType.UNDERSCORE), + ], +) def test_punctuation(src: str, expected: TokenType): tokens: list[Token] = scan(src) assert_n_tokens(tokens, 1) - assert tokens[0].type == expected \ No newline at end of file + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("+", TokenType.PLUS), + (">", TokenType.GREATER), + (">=", TokenType.GREATER_EQUAL), + ("<", TokenType.LESS), + ("<=", TokenType.LESS_EQUAL), + ("=", TokenType.EQUAL), + ("==", TokenType.EQUAL_EQUAL), + ("!=", TokenType.BANG_EQUAL), + ], +) +def test_operators(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("a", TokenType.IDENTIFIER), + ("foo", TokenType.IDENTIFIER), + ("foo1", TokenType.IDENTIFIER), + ("foo_", TokenType.IDENTIFIER), + ("foo_bar1_baz2", TokenType.IDENTIFIER), + ("FOO_BAR1_BAZ2", TokenType.IDENTIFIER), + ("0", TokenType.NUMBER), + ("0.0", TokenType.NUMBER), + ("1234.56", TokenType.NUMBER), + ("True", TokenType.TRUE), + ("False", TokenType.FALSE), + ("None", TokenType.NONE), + ], +) +def test_literals(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("#", TokenType.COMMENT), + ("# 
This is a comment", TokenType.COMMENT), + (" ", TokenType.WHITESPACE), + ("\t", TokenType.WHITESPACE), + ("\r", TokenType.WHITESPACE), + (" \t \t", TokenType.WHITESPACE), + ("\n", TokenType.NEWLINE), + ], +) +def test_misc(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected From 1b1fbb834edfea901b31d56a17f94da41f489229 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 13:22:11 +0200 Subject: [PATCH 35/41] fix(parser): fix bang equal consume equal token when matching bang-equal --- lexer/annotations.py | 2 +- lexer/midas.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lexer/annotations.py b/lexer/annotations.py index 3ee18af..ae9faae 100644 --- a/lexer/annotations.py +++ b/lexer/annotations.py @@ -28,7 +28,7 @@ class AnnotationLexer(Lexer): TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL ) case "!": - if self.peek() == "=": + if self.match("="): self.add_token(TokenType.BANG_EQUAL) else: self.error("Unexpected single bang. Did you mean '!=' ?") diff --git a/lexer/midas.py b/lexer/midas.py index 42d8e6b..ad29a68 100644 --- a/lexer/midas.py +++ b/lexer/midas.py @@ -32,7 +32,7 @@ class MidasLexer(Lexer): TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL ) case "!": - if self.peek() == "=": + if self.match("="): self.add_token(TokenType.BANG_EQUAL) else: self.error("Unexpected single bang. 
Did you mean '!=' ?") From 6f0c0ce3263ef213fef2f5b9b13223c1997f16d6 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 13:34:32 +0200 Subject: [PATCH 36/41] tests(parser): add literal value test --- tests/lexer/test_annotation_lexer.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/lexer/test_annotation_lexer.py b/tests/lexer/test_annotation_lexer.py index 2e35d18..2ab91dd 100644 --- a/tests/lexer/test_annotation_lexer.py +++ b/tests/lexer/test_annotation_lexer.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from lexer.annotations import AnnotationLexer @@ -59,15 +61,12 @@ def test_operators(src: str, expected: TokenType): ("foo_", TokenType.IDENTIFIER), ("foo_bar1_baz2", TokenType.IDENTIFIER), ("FOO_BAR1_BAZ2", TokenType.IDENTIFIER), - ("0", TokenType.NUMBER), - ("0.0", TokenType.NUMBER), - ("1234.56", TokenType.NUMBER), ("True", TokenType.TRUE), ("False", TokenType.FALSE), ("None", TokenType.NONE), ], ) -def test_literals(src: str, expected: TokenType): +def test_identifiers_keywords(src: str, expected: TokenType): tokens: list[Token] = scan(src) assert_n_tokens(tokens, 1) assert tokens[0].type == expected @@ -89,3 +88,18 @@ def test_misc(src: str, expected: TokenType): tokens: list[Token] = scan(src) assert_n_tokens(tokens, 1) assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected_type,expected_value", + [ + ("0", TokenType.NUMBER, 0), + ("0.0", TokenType.NUMBER, 0), + ("1234.56", TokenType.NUMBER, 1234.56), + ], +) +def test_literals(src: str, expected_type: TokenType, expected_value: Any): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected_type + assert tokens[0].value == expected_value From 7581a35be41b7239946247afb22813d6814d1ef3 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 13:43:12 +0200 Subject: [PATCH 37/41] tests(parser): add syntax error test --- tests/lexer/test_annotation_lexer.py 
| 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/lexer/test_annotation_lexer.py b/tests/lexer/test_annotation_lexer.py index 2ab91dd..33a83a1 100644 --- a/tests/lexer/test_annotation_lexer.py +++ b/tests/lexer/test_annotation_lexer.py @@ -103,3 +103,27 @@ def test_literals(src: str, expected_type: TokenType, expected_value: Any): assert_n_tokens(tokens, 1) assert tokens[0].type == expected_type assert tokens[0].value == expected_value + + +def test_single_bang_error(): + with pytest.raises(SyntaxError): + scan("!") + + +@pytest.mark.parametrize( + "src", + [ + "-", + "*", + "/", + "{", + "}", + "@", + '"', + "'", + ".", + ], +) +def test_unexpected_character(src: str): + with pytest.raises(SyntaxError): + scan(src) From ae02bab030bc9e9f86ee9ccf95639c35d5c15123 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 14:42:27 +0200 Subject: [PATCH 38/41] tests(parser): add tests for annotation parser --- tests/parser/test_annotation_parser.py | 130 +++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tests/parser/test_annotation_parser.py diff --git a/tests/parser/test_annotation_parser.py b/tests/parser/test_annotation_parser.py new file mode 100644 index 0000000..9c034dd --- /dev/null +++ b/tests/parser/test_annotation_parser.py @@ -0,0 +1,130 @@ +from typing import Optional + +import pytest + +from core.ast.annotations import ( + AnnotationStmt, + ConstraintExpr, + Expr, + LiteralExpr, + SchemaElementExpr, + SchemaExpr, + Stmt, + TypeExpr, + WildcardExpr, +) +from lexer.annotations import AnnotationLexer +from lexer.position import Position +from lexer.token import Token +from parser.annotations import AnnotationParser + + +class AstSerializer(Stmt.Visitor[str], Expr.Visitor[str]): + def serialize(self, stmt: Stmt): + return stmt.accept(self) + + def visit_annotation_stmt(self, stmt: AnnotationStmt) -> str: + schema: str = "" + if stmt.schema is not None: + schema = " " + 
stmt.schema.accept(self) + return f"(annotation {stmt.name.lexeme}{schema})" + + def visit_schema_expr(self, expr: SchemaExpr) -> str: + elements: list[str] = [elmt.accept(self) for elmt in expr.elements] + return f"(schema {' '.join(elements)})" + + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: + name: str = expr.name.lexeme if expr.name is not None else "_" + type: str = expr.type.accept(self) if expr.type is not None else "_" + return f"({name} {type})" + + def visit_type_expr(self, expr: TypeExpr) -> str: + res: str = f"({expr.name.lexeme}" + for constraint in expr.constraints: + res += " " + constraint.accept(self) + res += ")" + return res + + def visit_constraint_expr(self, expr: ConstraintExpr) -> str: + return f"(constraint {expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)})" + + def visit_wildcard_expr(self, expr: WildcardExpr) -> str: + return "(_)" + + def visit_literal_expr(self, expr: LiteralExpr) -> str: + return f"({expr.value})" + + +def parse(source: str) -> Optional[Stmt]: + tokens: list[Token] = AnnotationLexer(source).process() + return AnnotationParser(tokens).parse() + + +def must_parse(source: str) -> Stmt: + stmt: Optional[Stmt] = parse(source) + assert stmt is not None + return stmt + + +def ast_str(source: str) -> str: + stmt: Stmt = must_parse(source) + return AstSerializer().serialize(stmt) + + +@pytest.mark.parametrize( + "src,expected", + [ + ("Type", "(annotation Type)"), + ("Type[]", "(annotation Type (schema ))"), + ( + """ + Frame[ + verified: bool, + birth_year: int, + height: float + ( _ > 0 ) + ( _ < 250 ), + name: str, + date: datetime, + float, # unnamed + unknown: _, # untyped + _ # unnamed and untyped + ] + """, + "(annotation Frame (schema (verified (bool)) (birth_year (int)) (height (float (constraint (_) > (0.0)) (constraint (_) < (250.0)))) (name (str)) (date (datetime)) (_ (float)) (unknown _) (_ _)))", + ), + ], +) +def test_expressions(src: str, expected: str): + assert 
ast_str(src) == expected + + +@pytest.mark.parametrize( + "src,pos,should_fail", + [ + ("", (1, 1), True), + ("42", (1, 1), True), + ("True", (1, 1), True), + ("Type[", (1, 6), True), + ("Type[] Type2", (1, 8), False), + ("Type[bool:]", (1, 11), True), + ("Type[3]", (1, 6), True), + ("Type[bool float]", (1, 11), True), + ("Type[bool (_ < 2)]", (1, 11), True), + ("Type[bool + _ < 2)]", (1, 13), True), + ("Type[bool + (_ < 2]", (1, 19), True), + ("Type[bool + (< 2)]", (1, 14), True), + ("Type[bool + (_ + 2)]", (1, 16), True), + ("Type[bool + (Foo + Bar)]", (1, 14), True), + # ("Type[bool,]", (1, 11), True), # trailing comma is accepted, TODO: update parser or EBNF + ("Type[bool, Type[]]", (1, 16), True), + ("Type[foo: 3]", (1, 11), True), + ], +) +def test_parsing_error(src: str, pos: tuple[int, int], should_fail: bool): + tokens: list[Token] = AnnotationLexer(src).process() + parser: AnnotationParser = AnnotationParser(tokens) + stmt: Optional[Stmt] = parser.parse() + if should_fail: + assert stmt is None + assert len(parser.errors) != 0 + error_pos: Position = parser.errors[0].token.position + assert (error_pos.line, error_pos.column) == pos From 6922e49cdfaa4c0a2f9db5eb9c3f4f40e1ded8a8 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 18:43:03 +0200 Subject: [PATCH 39/41] tests(parser): add tests for midas parser --- tests/parser/test_midas_parser.py | 202 ++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 tests/parser/test_midas_parser.py diff --git a/tests/parser/test_midas_parser.py b/tests/parser/test_midas_parser.py new file mode 100644 index 0000000..28a6aa7 --- /dev/null +++ b/tests/parser/test_midas_parser.py @@ -0,0 +1,202 @@ +import textwrap + +import pytest + +from core.ast.midas import ( + ConstraintExpr, + ConstraintStmt, + Expr, + LiteralExpr, + OpStmt, + PropertyStmt, + Stmt, + TypeBodyExpr, + TypeExpr, + TypeStmt, + WildcardExpr, +) +from lexer.midas import MidasLexer +from lexer.position import 
Position +from lexer.token import Token +from parser.midas import MidasParser + + +class AstSerializer(Stmt.Visitor[str], Expr.Visitor[str]): + def serialize(self, stmt: Stmt): + return stmt.accept(self) + + def visit_type_stmt(self, stmt: TypeStmt) -> str: + res: str = f"(type_def {stmt.name.lexeme}" + for base in stmt.bases: + res += " " + base.accept(self) + if stmt.body is not None: + res += " " + stmt.body.accept(self) + res += ")" + return res + + def visit_type_expr(self, expr: TypeExpr) -> str: + res: str = f"({expr.name.lexeme}" + for constraint in expr.constraints: + res += " " + constraint.accept(self) + res += ")" + return res + + def visit_constraint_expr(self, expr: ConstraintExpr) -> str: + return f"(constraint {expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)})" + + def visit_wildcard_expr(self, expr: WildcardExpr) -> str: + return "(_)" + + def visit_literal_expr(self, expr: LiteralExpr) -> str: + return f"({expr.value})" + + def visit_type_body_expr(self, expr: TypeBodyExpr) -> str: + res: str = "(body" + for prop in expr.properties: + res += " " + prop.accept(self) + res += ")" + return res + + def visit_property_stmt(self, stmt: PropertyStmt) -> str: + return f"(property {stmt.name.lexeme} {stmt.type.accept(self)})" + + def visit_op_stmt(self, stmt: OpStmt) -> str: + left: str = stmt.left.accept(self) + right: str = stmt.right.accept(self) + result: str = stmt.result.accept(self) + return f"(op_def {left} {stmt.op.lexeme} {right} {result})" + + def visit_constraint_stmt(self, stmt: ConstraintStmt) -> str: + return f"(constraint_def {stmt.name.lexeme} {stmt.constraint.accept(self)})" + + +def parse(source: str) -> list[Stmt]: + tokens: list[Token] = MidasLexer(source).process() + return MidasParser(tokens).parse() + + +def ast_str(source: str) -> list[str]: + stmts: list[Stmt] = parse(source) + return [AstSerializer().serialize(stmt) for stmt in stmts] + + +@pytest.mark.parametrize( + "src,expected", + [ + ("type Foo<>", 
"(type_def Foo)"), + ("type Foo", "(type_def Foo (Bar))"), + ("type Foo", "(type_def Foo (Bar) (Baz))"), + ( + "type Foo", + "(type_def Foo (Bar (constraint (_) < (2.0))) (Baz))", + ), + ( + """ + type Foo<> { + foo: Bar + } + """, + "(type_def Foo (body (property foo (Bar))))", + ), + ( + """ + type Foo<> { + foo: Bar + (_ != none) + foo2: Bar2 + (0 <= _) + (_ <= 100) + } + """, + "(type_def Foo (body (property foo (Bar (constraint (_) != (None)))) (property foo2 (Bar2 (constraint (0.0) <= (_)) (constraint (_) <= (100.0))))))", + ), + ("op + = ", "(op_def (A) + (B) (C))"), + ( + "op + = ", + "(op_def (A (constraint (_) < (100.0))) + (B (constraint (_) < (100.0))) (C (constraint (_) < (200.0))))", + ), + ( + "constraint Positive = _ >= 0", + "(constraint_def Positive (constraint (_) >= (0.0)))", + ), + ], +) +def test_expressions(src: str, expected: str | list[str]): + if isinstance(expected, str): + expected = [expected] + assert ast_str(src) == expected + + +@pytest.mark.parametrize( + "src,pos", + [ + ### + # Misc + ### + ("42", (1, 1)), + ("true", (1, 1)), + ("foo", (1, 1)), + ### + # Type statements + ### + ("type", (1, 5)), + ("type true", (1, 6)), + ("type Foo", (1, 9)), + ("type Foo<1>", (1, 10)), + # ("type Foo", (1, 16)), # trailing comma is accepted, TODO: update parser or EBNF + ("type Foo", (1, 17)), + ("type Foo { 3 }", (1, 19)), + ( + """ + type Foo { + foo + } + """, + (4, 1), + ), + ( + """ + type Foo { + foo: 3 + } + """, + (3, 10), + ), + ### + # Operation statements + ### + ("op", (1, 3)), + ("op float", (1, 4)), + ("op <", (1, 5)), + ("op ", (1, 11)), + ("op +", (1, 13)), + ("op + float", (1, 14)), + ("op + <", (1, 15)), + ("op + + ", (1, 21)), + ("op + =", (1, 23)), + ("op + = float", (1, 24)), + ("op + = <", (1, 25)), + ("op + = + = ", (1, 13)), + ("op + = ", (1, 23)), + ("op + = ", (1, 33)), + ### + # Constraint statements + ### + ("constraint", (1, 11)), + ("constraint 3", (1, 12)), + ("constraint Foo", (1, 15)), + ("constraint Foo =", (1, 
17)), + ("constraint Foo = 3", (1, 19)), + ("constraint Foo = 3 <", (1, 21)), + ], +) +def test_parsing_error(src: str, pos: tuple[int, int]): + src = textwrap.dedent(src) + tokens: list[Token] = MidasLexer(src).process() + parser: MidasParser = MidasParser(tokens) + stmt: list[Stmt] = parser.parse() + assert len(stmt) == 0 + assert len(parser.errors) != 0 + error_pos: Position = parser.errors[0].token.position + assert (error_pos.line, error_pos.column) == pos From 7a831a1afce56bd9010d1874418f95b7bdc3731b Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 18:43:35 +0200 Subject: [PATCH 40/41] fix(parser): handle extra tokens in Midas parser --- parser/midas.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/parser/midas.py b/parser/midas.py index 631e51c..a49bc5e 100644 --- a/parser/midas.py +++ b/parser/midas.py @@ -62,6 +62,7 @@ class MidasParser(Parser): return self.op_declaration() if self.match(TokenType.CONSTRAINT): return self.constraint_declaration() + raise self.error(self.peek(), "Unexpected token") except ParsingError: self.synchronize() return None @@ -111,7 +112,7 @@ class MidasParser(Parser): Returns: ConstraintExpr: the parsed type constraint expression """ - + left: Expr = self.constraint_value() op: Token = self.constraint_operator() right: Expr = self.constraint_value() @@ -129,14 +130,21 @@ class MidasParser(Parser): return LiteralExpr(True) if self.match(TokenType.NONE): return LiteralExpr(None) - + if self.match(TokenType.NUMBER): return LiteralExpr(self.previous().value) - + raise self.error(self.peek(), "Expected literal") def constraint_operator(self) -> Token: - if self.match(TokenType.LESS, TokenType.LESS_EQUAL, TokenType.GREATER, TokenType.GREATER_EQUAL, TokenType.EQUAL_EQUAL, TokenType.BANG_EQUAL): + if self.match( + TokenType.LESS, + TokenType.LESS_EQUAL, + TokenType.GREATER, + TokenType.GREATER_EQUAL, + TokenType.EQUAL_EQUAL, + TokenType.BANG_EQUAL, + ): return self.previous() raise 
self.error(self.peek(), "Expected constraint operator") From 697f4d5003e252fe806bd4f955e4b722ce6e09c4 Mon Sep 17 00:00:00 2001 From: LordBaryhobal Date: Mon, 18 May 2026 15:17:51 +0200 Subject: [PATCH 41/41] tests(parser): add midas lexer tests --- tests/lexer/test_midas_lexer.py | 129 ++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 tests/lexer/test_midas_lexer.py diff --git a/tests/lexer/test_midas_lexer.py b/tests/lexer/test_midas_lexer.py new file mode 100644 index 0000000..9cffff2 --- /dev/null +++ b/tests/lexer/test_midas_lexer.py @@ -0,0 +1,129 @@ +from typing import Any + +import pytest + +from lexer.midas import MidasLexer +from lexer.token import Token, TokenType + + +def scan(source: str) -> list[Token]: + return MidasLexer(source).process() + + +def assert_n_tokens(tokens: list[Token], n: int): + assert len(tokens) == n + 1 + assert tokens[-1].type == TokenType.EOF + + +@pytest.mark.parametrize( + "src,expected", + [ + ("(", TokenType.LEFT_PAREN), + (")", TokenType.RIGHT_PAREN), + ("[", TokenType.LEFT_BRACKET), + ("]", TokenType.RIGHT_BRACKET), + ("{", TokenType.LEFT_BRACE), + ("}", TokenType.RIGHT_BRACE), + (":", TokenType.COLON), + (",", TokenType.COMMA), + ("_", TokenType.UNDERSCORE), + ], +) +def test_punctuation(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("+", TokenType.PLUS), + ("-", TokenType.MINUS), + ("*", TokenType.STAR), + ("/", TokenType.SLASH), + (">", TokenType.GREATER), + (">=", TokenType.GREATER_EQUAL), + ("<", TokenType.LESS), + ("<=", TokenType.LESS_EQUAL), + ("=", TokenType.EQUAL), + ("==", TokenType.EQUAL_EQUAL), + ("!=", TokenType.BANG_EQUAL), + ], +) +def test_operators(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + 
("a", TokenType.IDENTIFIER), + ("foo", TokenType.IDENTIFIER), + ("foo1", TokenType.IDENTIFIER), + ("foo_", TokenType.IDENTIFIER), + ("foo_bar1_baz2", TokenType.IDENTIFIER), + ("FOO_BAR1_BAZ2", TokenType.IDENTIFIER), + ("true", TokenType.TRUE), + ("false", TokenType.FALSE), + ("none", TokenType.NONE), + ], +) +def test_identifiers_keywords(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("// This is a comment", TokenType.COMMENT), + ("/* This is a comment */", TokenType.COMMENT), + (" ", TokenType.WHITESPACE), + ("\t", TokenType.WHITESPACE), + ("\r", TokenType.WHITESPACE), + (" \t \t", TokenType.WHITESPACE), + ("\n", TokenType.NEWLINE), + ], +) +def test_misc(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected_type,expected_value", + [ + ("0", TokenType.NUMBER, 0), + ("0.0", TokenType.NUMBER, 0), + ("1234.56", TokenType.NUMBER, 1234.56), + ], +) +def test_literals(src: str, expected_type: TokenType, expected_value: Any): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected_type + assert tokens[0].value == expected_value + + +def test_single_bang_error(): + with pytest.raises(SyntaxError): + scan("!") + + +@pytest.mark.parametrize( + "src", + [ + "@", + '"', + "'", + ".", + ], +) +def test_unexpected_character(src: str): + with pytest.raises(SyntaxError): + scan(src)