diff --git a/.gitignore b/.gitignore index e69de29..f63541d 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode +__pycache__ +.env +venv +.venv +*.pyc \ No newline at end of file diff --git a/core/ast/annotations.py b/core/ast/annotations.py new file mode 100644 index 0000000..a885e29 --- /dev/null +++ b/core/ast/annotations.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Generic, Optional, TypeVar + +from lexer.token import Token + +T = TypeVar("T") + + +@dataclass(frozen=True) +class Stmt(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_annotation_stmt(self, stmt: AnnotationStmt) -> T: ... + + +@dataclass(frozen=True) +class AnnotationStmt(Stmt): + name: Token + schema: Optional[SchemaExpr] + + def accept(self, visitor: Stmt.Visitor[T]) -> T: + return visitor.visit_annotation_stmt(self) + + +@dataclass(frozen=True) +class Expr(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_wildcard_expr(self, expr: WildcardExpr) -> T: ... + + @abstractmethod + def visit_literal_expr(self, expr: LiteralExpr) -> T: ... + + @abstractmethod + def visit_type_expr(self, expr: TypeExpr) -> T: ... + + @abstractmethod + def visit_constraint_expr(self, expr: ConstraintExpr) -> T: ... + + @abstractmethod + def visit_schema_expr(self, expr: SchemaExpr) -> T: ... + + @abstractmethod + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> T: ... 
+ + +@dataclass(frozen=True) +class WildcardExpr(Expr): + token: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_wildcard_expr(self) + + +@dataclass(frozen=True) +class LiteralExpr(Expr): + value: Any + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_literal_expr(self) + + +@dataclass(frozen=True) +class TypeExpr(Expr): + name: Token + constraints: list[ConstraintExpr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_expr(self) + + +@dataclass(frozen=True) +class ConstraintExpr(Expr): + left: Expr + op: Token + right: Expr + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_constraint_expr(self) + + +@dataclass(frozen=True) +class SchemaExpr(Expr): + left: Token + elements: list[Expr] + right: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_schema_expr(self) + + +@dataclass(frozen=True) +class SchemaElementExpr(Expr): + name: Optional[Token] + type: Optional[Expr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_schema_element_expr(self) diff --git a/core/ast/midas.py b/core/ast/midas.py new file mode 100644 index 0000000..4f2b03f --- /dev/null +++ b/core/ast/midas.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Generic, Optional, TypeVar + +from lexer.token import Token + +T = TypeVar("T") + + +# Statements + + +@dataclass(frozen=True) +class Stmt(ABC): + @abstractmethod + def accept(self, visitor: Visitor[T]) -> T: ... + + class Visitor(ABC, Generic[T]): + @abstractmethod + def visit_type_stmt(self, stmt: TypeStmt) -> T: ... + + @abstractmethod + def visit_property_stmt(self, stmt: PropertyStmt) -> T: ... + + @abstractmethod + def visit_op_stmt(self, stmt: OpStmt) -> T: ... + + @abstractmethod + def visit_constraint_stmt(self, stmt: ConstraintStmt) -> T: ... 
@dataclass(frozen=True)
class TypeStmt(Stmt):
    """A type definition: a name, its base types and an optional body."""

    name: Token
    bases: list[TypeExpr]
    # None when the definition has no body
    body: Optional[TypeBodyExpr]

    def accept(self, visitor: Stmt.Visitor[T]) -> T:
        return visitor.visit_type_stmt(self)


@dataclass(frozen=True)
class PropertyStmt(Stmt):
    """A named property and its type."""

    name: Token
    type: TypeExpr

    def accept(self, visitor: Stmt.Visitor[T]) -> T:
        return visitor.visit_property_stmt(self)


@dataclass(frozen=True)
class OpStmt(Stmt):
    """An operation definition: ``left`` ``op`` ``right`` yields ``result``."""

    left: TypeExpr
    op: Token
    right: TypeExpr
    result: TypeExpr

    def accept(self, visitor: Stmt.Visitor[T]) -> T:
        return visitor.visit_op_stmt(self)


@dataclass(frozen=True)
class ConstraintStmt(Stmt):
    """A named constraint definition."""

    name: Token
    constraint: ConstraintExpr

    def accept(self, visitor: Stmt.Visitor[T]) -> T:
        return visitor.visit_constraint_stmt(self)


# Expressions


@dataclass(frozen=True)
class Expr(ABC):
    """Base class of the expression nodes of the Midas AST."""

    @abstractmethod
    def accept(self, visitor: Visitor[T]) -> T:
        """Dispatch to the visitor method matching this node and return its result."""

    class Visitor(ABC, Generic[T]):
        """Visitor interface over Midas expressions, producing values of type ``T``."""

        @abstractmethod
        def visit_wildcard_expr(self, expr: WildcardExpr) -> T:
            """Handle a :class:`WildcardExpr`."""

        @abstractmethod
        def visit_literal_expr(self, expr: LiteralExpr) -> T:
            """Handle a :class:`LiteralExpr`."""

        @abstractmethod
        def visit_type_expr(self, expr: TypeExpr) -> T:
            """Handle a :class:`TypeExpr`."""

        @abstractmethod
        def visit_constraint_expr(self, expr: ConstraintExpr) -> T:
            """Handle a :class:`ConstraintExpr`."""

        @abstractmethod
        def visit_type_body_expr(self, expr: TypeBodyExpr) -> T:
            """Handle a :class:`TypeBodyExpr`."""
+ + +@dataclass(frozen=True) +class WildcardExpr(Expr): + token: Token + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_wildcard_expr(self) + + +@dataclass(frozen=True) +class LiteralExpr(Expr): + value: Any + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_literal_expr(self) + + +@dataclass(frozen=True) +class TypeExpr(Expr): + name: Token + constraints: list[ConstraintExpr] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_expr(self) + + +@dataclass(frozen=True) +class ConstraintExpr(Expr): + left: Expr + op: Token + right: Expr + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_constraint_expr(self) + + +@dataclass(frozen=True) +class TypeBodyExpr(Expr): + properties: list[PropertyStmt] + + def accept(self, visitor: Expr.Visitor[T]) -> T: + return visitor.visit_type_body_expr(self) diff --git a/core/ast/printer.py b/core/ast/printer.py new file mode 100644 index 0000000..086c581 --- /dev/null +++ b/core/ast/printer.py @@ -0,0 +1,360 @@ +from __future__ import annotations + +from contextlib import contextmanager +from enum import Enum, auto +import io +from typing import Generator, Generic, Optional, Protocol, TypeVar + +import core.ast.annotations as a +import core.ast.midas as m + + +class _Level(Enum): + EMPTY = auto() + ACTIVE = auto() + LAST = auto() + + +class Expr(Protocol): + def accept(self, printer: AstPrinter) -> None: ... 
T = TypeVar("T", bound=Expr)


class AstPrinter(Generic[T]):
    """Shared machinery for rendering an AST as an indented tree of text lines.

    Concrete printers supply the ``visit_*`` methods. Each node writes its own
    line with :meth:`_write_line` and opens a :meth:`_child_level` around the
    lines of its children.
    """

    LAST_CHILD = "└── "
    CHILD = "├── "
    # Continuation segments are as wide as the connectors above, so nested
    # levels stay aligned in the rendered tree.
    VERTICAL = "│   "
    EMPTY = "    "

    def __init__(self):
        self._levels: list[_Level] = []
        # Index to prefix to the next written line, if any
        self._idx: Optional[int] = None
        self._buf: io.StringIO = io.StringIO()

    def print(self, expr: T):
        """Render a node and everything below it.

        Args:
            expr (T): the root node to render

        Returns:
            str: the rendered tree, one node per line
        """
        # Start from a clean state so that a previous print interrupted by an
        # exception cannot leak a pending index or level into this output.
        self._levels = []
        self._idx = None
        self._buf = io.StringIO()
        expr.accept(self)
        return self._buf.getvalue()

    @contextmanager
    def _child_level(self, last: bool = False) -> Generator[None, None, None]:
        """Open a nesting level for the duration of the ``with`` block.

        Args:
            last (bool, optional): whether the first line written at this level
                is also its last one. Defaults to False.
        """
        self._levels.append(_Level.LAST if last else _Level.ACTIVE)
        try:
            yield
        finally:
            self._levels.pop()

    def _mark_last(self):
        """Flag the next line of the innermost level as that level's last child."""
        if self._levels:
            self._levels[-1] = _Level.LAST

    def _write_line(self, text: str, *, last: bool = False):
        """Write one line at the current nesting level.

        Args:
            text (str): the text of the line, without indentation
            last (bool, optional): whether this is the last line of the
                innermost level. Defaults to False.
        """
        if last:
            self._mark_last()
        indent: str = self._build_indent()
        if self._idx is not None:
            # A pending index is consumed by the first line written after it
            text = f"[{self._idx}] {text}"
            self._idx = None
        self._buf.write(indent + text + "\n")

    def _build_indent(self) -> str:
        """Build the indentation prefix of the next line.

        Drawing the connector of a level flagged as last also turns that level
        into an empty one, so lines nested below it get blank padding instead
        of a vertical bar.

        Returns:
            str: the prefix to put in front of the line text
        """
        parts: list[str] = [
            self.EMPTY if level == _Level.EMPTY else self.VERTICAL
            for level in self._levels[:-1]
        ]
        if self._levels:
            if self._levels[-1] == _Level.LAST:
                parts.append(self.LAST_CHILD)
                self._levels[-1] = _Level.EMPTY
            else:
                parts.append(self.CHILD)
        return "".join(parts)

    def _write_optional_child(
        self, label: str, child: Optional[T], *, last: bool = False
    ):
        """Write a labelled child, or ``<label>: None`` when it is missing.

        Args:
            label (str): the label line introducing the child
            child (Optional[T]): the child node, if any
            last (bool, optional): whether the label is the last line of the
                innermost level. Defaults to False.
        """
        if last:
            self._mark_last()
        if child is None:
            self._write_line(f"{label}: None")
        else:
            self._write_line(label)
            # The child is the only entry below its label
            with self._child_level(last=True):
                child.accept(self)

    def _write_indexed_children(self, children: list[T]):
        """Write a list of children one level down, each prefixed by its index.

        Args:
            children (list[T]): the nodes to render, in order
        """
        final: int = len(children) - 1
        with self._child_level():
            for position, child in enumerate(children):
                self._idx = position
                if position == final:
                    self._mark_last()
                child.accept(self)

    # The three node shapes below are identical in both AST modules, so the
    # concrete printers delegate to these helpers.

    def _write_type_expr(self, expr):
        """Write a type expression: its name, then its indexed constraints."""
        self._write_line("TypeExpr")
        with self._child_level():
            self._write_line(f'name: "{expr.name.lexeme}"')
            self._write_line("constraints", last=True)
            self._write_indexed_children(expr.constraints)

    def _write_constraint_expr(self, expr):
        """Write a constraint expression: left operand, operator, right operand."""
        self._write_line("ConstraintExpr")
        with self._child_level():
            self._write_optional_child("left", expr.left)
            self._write_line(f"operator: {expr.op.lexeme}")
            self._write_optional_child("right", expr.right, last=True)

    def _write_literal_expr(self, expr):
        """Write a literal expression and its value."""
        self._write_line("LiteralExpr")
        with self._child_level():
            self._write_line(f"value: {expr.value}", last=True)


class AnnotationAstPrinter(AstPrinter, a.Expr.Visitor[None], a.Stmt.Visitor[None]):
    """Render an annotation AST as a tree."""

    def visit_annotation_stmt(self, stmt: a.AnnotationStmt) -> None:
        self._write_line("AnnotationStmt")
        with self._child_level():
            self._write_line(f'name: "{stmt.name.lexeme}"')
            self._write_optional_child("schema", stmt.schema, last=True)

    def visit_type_expr(self, expr: a.TypeExpr):
        self._write_type_expr(expr)

    def visit_constraint_expr(self, expr: a.ConstraintExpr) -> None:
        self._write_constraint_expr(expr)

    def visit_schema_expr(self, expr: a.SchemaExpr):
        self._write_line("SchemaExpr")
        self._write_indexed_children(expr.elements)

    def visit_schema_element_expr(self, expr: a.SchemaElementExpr):
        self._write_line("SchemaElementExpr")
        with self._child_level():
            name_text: str = "None" if expr.name is None else f'"{expr.name.lexeme}"'
            self._write_line(f"name: {name_text}")
            self._write_optional_child("type", expr.type, last=True)

    def visit_wildcard_expr(self, expr: a.WildcardExpr) -> None:
        self._write_line("WildcardExpr")

    def visit_literal_expr(self, expr: a.LiteralExpr) -> None:
        self._write_literal_expr(expr)


class AnnotationPrinter(a.Expr.Visitor[str], a.Stmt.Visitor[str]):
    """Render an annotation AST back to annotation source text."""

    def print(self, expr: a.Expr | a.Stmt):
        """Return the source text of the given node."""
        return expr.accept(self)

    def visit_annotation_stmt(self, stmt: a.AnnotationStmt) -> str:
        schema: str = "" if stmt.schema is None else stmt.schema.accept(self)
        return f"{stmt.name.lexeme}{schema}"

    def visit_type_expr(self, expr: a.TypeExpr) -> str:
        parts: list[str] = [expr.name.lexeme]
        parts.extend(f"({constraint.accept(self)})" for constraint in expr.constraints)
        return " + ".join(parts)

    def visit_constraint_expr(self, expr: a.ConstraintExpr) -> str:
        return f"{expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)}"

    def visit_schema_expr(self, expr: a.SchemaExpr) -> str:
        elements: str = ", ".join(elmt.accept(self) for elmt in expr.elements)
        return f"{expr.left.lexeme}{elements}{expr.right.lexeme}"

    def visit_schema_element_expr(self, expr: a.SchemaElementExpr) -> str:
        parts: list[str] = []
        if expr.name is not None:
            parts.append(expr.name.lexeme)
        # An untyped element is written with the wildcard
        parts.append("_" if expr.type is None else expr.type.accept(self))
        return ": ".join(parts)

    def visit_wildcard_expr(self, expr: a.WildcardExpr) -> str:
        return "_"

    def visit_literal_expr(self, expr: a.LiteralExpr) -> str:
        return str(expr.value)


class MidasAstPrinter(AstPrinter, m.Expr.Visitor[None], m.Stmt.Visitor[None]):
    """Render a Midas AST as a tree."""

    def visit_type_stmt(self, stmt: m.TypeStmt):
        self._write_line("TypeStmt")
        with self._child_level():
            self._write_line(f'name: "{stmt.name.lexeme}"')
            self._write_line("bases")
            self._write_indexed_children(stmt.bases)
            self._write_optional_child("body", stmt.body, last=True)

    def visit_property_stmt(self, stmt: m.PropertyStmt):
        self._write_line("PropertyStmt")
        with self._child_level():
            self._write_line(f'name: "{stmt.name.lexeme}"')
            self._write_optional_child("type", stmt.type, last=True)

    def visit_op_stmt(self, stmt: m.OpStmt) -> None:
        self._write_line("OpStmt")
        with self._child_level():
            self._write_optional_child("left", stmt.left)
            self._write_line(f'op: "{stmt.op.lexeme}"')
            self._write_optional_child("right", stmt.right)
            self._write_optional_child("result", stmt.result, last=True)

    def visit_constraint_stmt(self, stmt: m.ConstraintStmt):
        self._write_line("ConstraintStmt")
        with self._child_level():
            self._write_line(f'name: "{stmt.name.lexeme}"')
            self._write_optional_child("constraint", stmt.constraint, last=True)

    def visit_type_expr(self, expr: m.TypeExpr):
        self._write_type_expr(expr)

    def visit_constraint_expr(self, expr: m.ConstraintExpr):
        self._write_constraint_expr(expr)

    def visit_type_body_expr(self, expr: m.TypeBodyExpr):
        self._write_line("TypeBodyExpr")
        with self._child_level():
            self._write_line("properties", last=True)
            self._write_indexed_children(expr.properties)

    def visit_wildcard_expr(self, expr: m.WildcardExpr) -> None:
        self._write_line("WildcardExpr")

    def visit_literal_expr(self, expr: m.LiteralExpr) -> None:
        self._write_literal_expr(expr)


class MidasPrinter(m.Expr.Visitor[str], m.Stmt.Visitor[str]):
    """Render a Midas AST back to Midas source text."""

    def __init__(self, indent: int = 4):
        """Create a printer.

        Args:
            indent (int, optional): number of spaces per nesting level. Defaults to 4.
        """
        self.indent: int = indent
        self.level: int = 0

    def indented(self, text: str) -> str:
        """Prefix ``text`` with the indentation of the current nesting level."""
        return " " * (self.level * self.indent) + text

    def print(self, expr: m.Expr | m.Stmt):
        """Return the source text of the given node, starting at nesting level 0."""
        self.level = 0
        return expr.accept(self)

    def visit_type_stmt(self, stmt: m.TypeStmt):
        bases: str = ", ".join(base.accept(self) for base in stmt.bases)
        header: str = self.indented(f"type {stmt.name.lexeme}<{bases}>")
        if stmt.body is None:
            return header

        self.level += 1
        try:
            body: str = stmt.body.accept(self)
        finally:
            # Restore the level even if rendering the body fails, so the
            # printer stays usable afterwards.
            self.level -= 1
        return header + " {\n" + body + "\n" + self.indented("}")

    def visit_property_stmt(self, stmt: m.PropertyStmt):
        # Not indented here: the enclosing type body indents each property
        return f"{stmt.name.lexeme}: {stmt.type.accept(self)}"

    def visit_op_stmt(self, stmt: m.OpStmt):
        left: str = stmt.left.accept(self)
        op: str = stmt.op.lexeme
        right: str = stmt.right.accept(self)
        result: str = stmt.result.accept(self)
        return self.indented(f"op <{left}> {op} <{right}> = <{result}>")

    def visit_constraint_stmt(self, stmt: m.ConstraintStmt):
        name: str = stmt.name.lexeme
        constraint: str = stmt.constraint.accept(self)
        return self.indented(f"constraint {name} = {constraint}")

    def visit_type_expr(self, expr: m.TypeExpr):
        parts: list[str] = [expr.name.lexeme]
        parts.extend(f"({constraint.accept(self)})" for constraint in expr.constraints)
        return " + ".join(parts)

    def visit_constraint_expr(self, expr: m.ConstraintExpr):
        return f"{expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)}"

    def visit_type_body_expr(self, expr: m.TypeBodyExpr):
        return "\n".join(self.indented(prop.accept(self)) for prop in expr.properties)

    def visit_wildcard_expr(self, expr: m.WildcardExpr):
        return "_"

    def visit_literal_expr(self, expr: m.LiteralExpr):
        return str(expr.value)


# (the patch records no newline at the end of core/ast/printer.py)
# ---- patch metadata: diff --git a/examples/00_syntax_prototype/01_simple_types.py
#      b/examples/00_syntax_prototype/01_simple_types.py, new file mode 100644,
#      index ... (the header continues on the next source line) ----
0000000..725fdf4 --- /dev/null +++ b/examples/00_syntax_prototype/01_simple_types.py @@ -0,0 +1,16 @@ +# type: ignore +# ruff: disable[F821] +from __future__ import annotations + +# A simple data-frame with different column of various simple types +# Columns can be named and/or typed +df: Frame[ + verified: bool, + birth_year: int, + height: float + ( _ > 0 ) + ( _ < 250 ), + name: str, + date: datetime, + float, # unnamed + unknown: _, # untyped + _ # unnamed and untyped +] diff --git a/examples/00_syntax_prototype/02_custom_types.midas b/examples/00_syntax_prototype/02_custom_types.midas new file mode 100644 index 0000000..017e40c --- /dev/null +++ b/examples/00_syntax_prototype/02_custom_types.midas @@ -0,0 +1,24 @@ +// Simple custom type derived from floats +type Latitude +type Longitude + +// Complex custom type, containing two values accessible through properties +type GeoLocation { + lat: Latitude + lon: Longitude +} + +type LatitudeDiff +type LongitudeDiff + +// Simple operation defined on our custom types +op - = +op - = + +// Simple custom type with a constraint +type Age + +// Predefined custom constraints that can be referenced in other definitions +constraint Positive = _ >= 0 +constraint StrictlyPositive = _ > 0 +//constraint Even = _ % 2 == 0 \ No newline at end of file diff --git a/examples/00_syntax_prototype/02_custom_types.py b/examples/00_syntax_prototype/02_custom_types.py new file mode 100644 index 0000000..0297058 --- /dev/null +++ b/examples/00_syntax_prototype/02_custom_types.py @@ -0,0 +1,34 @@ +# type: ignore +# ruff: disable[F821] +from __future__ import annotations + +# Prototype of custom type import to use valid Python syntax +import midas +midas.using("02_custom_types.midas") + +# A data-frame using a custom type +df: Frame[ + location: GeoLocation +] + +# Properties of a type can be used on a column of that type +lat: Column[GeoLocation] = df["location"].lat +lon: Column[GeoLocation] = df["location"].lon + +# Unregistered operations 
between types are not permitted +lat + lon # Invalid operation + +# Registered operations are permitted +lat1: Latitude = lat[0] +lat2: Latitude = lat[1] +lat_diff: LatitudeDiff = lat2 - lat1 # Valid operation + +# In addition to the type, a column can have one or more constraints, either defined inline or in a separate file +df2: Frame[ + age: int + (_ >= 0), + height: float + (_ >= 0), +] +df2_bis: Frame[ + age: int + Positive, + height: float + Positive, +] diff --git a/lexer/__init__.py b/lexer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lexer/annotations.py b/lexer/annotations.py new file mode 100644 index 0000000..ae9faae --- /dev/null +++ b/lexer/annotations.py @@ -0,0 +1,102 @@ +from lexer.base import Lexer +from lexer.keyword import ANNOTATION_KEYWORDS +from lexer.token import TokenType + + +class AnnotationLexer(Lexer): + def scan_token(self) -> None: + char: str = self.advance() + match char: + case "(": + self.add_token(TokenType.LEFT_PAREN) + case ")": + self.add_token(TokenType.RIGHT_PAREN) + case "[": + self.add_token(TokenType.LEFT_BRACKET) + case "]": + self.add_token(TokenType.RIGHT_BRACKET) + case "<": + self.add_token( + TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS + ) + case ">": + self.add_token( + TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER + ) + case "=": + self.add_token( + TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL + ) + case "!": + if self.match("="): + self.add_token(TokenType.BANG_EQUAL) + else: + self.error("Unexpected single bang. 
Did you mean '!=' ?") + case ":": + self.add_token(TokenType.COLON) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "+": + self.add_token(TokenType.PLUS) + case "#": + self.scan_comment() + case "\n": + self.add_token(TokenType.NEWLINE) + case " " | "\r" | "\t": + # Consume all whitespace characters until EOL or EOF + while ( + self.peek().isspace() + and self.peek() != "\n" + and not self.is_at_end() + ): + self.advance() + self.add_token(TokenType.WHITESPACE) + case _: + if char.isdigit(): + self.scan_number() + elif char.isalpha(): + self.scan_identifier() + else: + self.error("Unexpected character") + return None + + def scan_number(self): + """Scan the rest of number and add it as a token + + This method handles both simple integers and floats. Scientific notation + and base prefixes (0x, 0b, 0o) are not supported + """ + while self.peek().isdigit(): + self.advance() + + if self.peek() == "." and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value: float = float(self.source[self.start : self.idx]) + self.add_token(TokenType.NUMBER, value) + + def scan_identifier(self): + """Scan the rest of an identifier and add it as a token + + An identifier starts with a letter, followed by any number of + alphanumerical characters or underscores + """ + while self.peek().isalnum() or self.peek() == "_": + self.advance() + + lexeme: str = self.source[self.start : self.idx] + token_type: TokenType = ANNOTATION_KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + self.add_token(token_type) + + def scan_comment(self): + """Scan the rest of a comment and add it as a token + + A comment starts with a `#` character and ends at the EOL/EOF + """ + while self.peek() != "\n" and not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) diff --git a/lexer/base.py b/lexer/base.py new file mode 100644 index 0000000..1104e7a --- /dev/null +++ b/lexer/base.py @@ -0,0 +1,166 
@@ +from abc import ABC, abstractmethod +from typing import Any, Callable, Optional + +from lexer.position import Position +from lexer.token import Token, TokenType + + +class Lexer(ABC): + """An abstract lexer which provides methods to easily extend it into a concrete one + + This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom, + more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble) + + [1]: https://craftinginterpreters.com/ + """ + + def __init__(self, source: str, file: Optional[str] = None) -> None: + """Create a new lexer to scan for tokens in the given source + + Args: + source (str): the source to scan + file (Optional[str], optional): the path of the given source. Can be a file path or any string identifier. Defaults to None. + """ + self.source: str = source + self.file: Optional[str] = file + self.tokens: list[Token] = [] + self.start: int = 0 + self.idx: int = 0 + self.length: int = len(self.source) + self.line: int = 1 + self.column: int = 1 + self.start_pos: Position = self.get_position() + + def error(self, msg: str): + """Raise a syntax error + + Args: + msg (str): the error message + + Raises: + SyntaxError + """ + raise SyntaxError(f"[ERROR] Error at {self.start_pos}: {msg}") + + def process(self) -> list[Token]: + """Scan tokens out of the source text + + Returns: + list[Token]: all the tokens that could be scanned + + Raises: + SyntaxError: if a syntax error is found + """ + self.scan_tokens() + self.tokens.append(Token(TokenType.EOF, "", None, self.get_position())) + return self.tokens + + def is_at_end(self) -> bool: + """Whether the lexer is at the end of the source + + Returns: + bool: True if the current index is at the end of the source + """ + return self.idx >= self.length + + def get_position(self) -> Position: + """Get the current position + + Returns: + Position: the current position + """ + return Position(file=self.file, line=self.line, column=self.column) + + def 
peek(self) -> str: + """Get the current character without advancing, if any + + Returns: + str: the current character, or an empty string if at EOF + """ + if self.idx < self.length: + return self.source[self.idx] + return "" + + def peek_next(self) -> str: + """Get the next character without advancing, if any + + Returns: + str: the next character, or an empty string if at EOF + """ + if self.idx + 1 < self.length: + return self.source[self.idx + 1] + return "" + + def advance(self) -> str: + """Get the new character and advance + + Returns: + str: the current character, before advancing + """ + char: str = self.peek() + self.idx += 1 + self.column += 1 + if char == "\n": + self.newline() + return char + + def newline(self): + """Update the current position after encountering a newline character""" + self.line += 1 + self.column = 1 + + def match(self, expected: str) -> bool: + """Consume the next character if it matches the given value + + Args: + expected (str): the expected character + + Returns: + bool: whether a character was matched and consumed + """ + if self.peek() == expected: + self.advance() + return True + return False + + def update_start(self): + """Update the starting position of the current lexeme + + The cursor marking the start of the lexeme currently being scanned is + moved to the current position + """ + self.start_pos = self.get_position() + self.start = self.idx + + def add_token(self, token_type: TokenType, value: Optional[Any] = None): + """Add the current lexeme to the list of scanned tokens + + Args: + token_type (TokenType): the type of token to add + value (Optional[Any], optional): the value of the token (useful for numbers or constants). Defaults to None. 
+ """ + lexeme: str = self.source[self.start : self.idx] + self.tokens.append( + Token(position=self.start_pos, type=token_type, lexeme=lexeme, value=value) + ) + + def scan_tokens(self, condition: Optional[Callable[[], bool]] = None): + """Scan tokens until EOF is reached or the given condition becomes False + + Args: + condition (Optional[Callable[[], bool]], optional): the condition to continue scanning tokens. + If None, defaults to always being True, effectively scanning tokens until EOF is reached. Defaults to None. + """ + if condition is None: + condition = lambda: True # noqa: E731 + while condition() and not self.is_at_end(): + self.update_start() + self.scan_token() + + @abstractmethod + def scan_token(self) -> None: + """Scan a token + + This function should (at least) consume the current character and produce the appropriate token(s), using `add_token` + """ + pass diff --git a/lexer/keyword.py b/lexer/keyword.py new file mode 100644 index 0000000..b66f21a --- /dev/null +++ b/lexer/keyword.py @@ -0,0 +1,16 @@ +from lexer.token import TokenType + +ANNOTATION_KEYWORDS: dict[str, TokenType] = { + "True": TokenType.TRUE, + "False": TokenType.FALSE, + "None": TokenType.NONE, +} + +MIDAS_KEYWORDS: dict[str, TokenType] = { + "type": TokenType.TYPE, + "op": TokenType.OP, + "constraint": TokenType.CONSTRAINT, + "true": TokenType.TRUE, + "false": TokenType.FALSE, + "none": TokenType.NONE, +} diff --git a/lexer/midas.py b/lexer/midas.py new file mode 100644 index 0000000..ad29a68 --- /dev/null +++ b/lexer/midas.py @@ -0,0 +1,131 @@ +from lexer.base import Lexer +from lexer.keyword import MIDAS_KEYWORDS +from lexer.token import TokenType + + +class MidasLexer(Lexer): + def scan_token(self) -> None: + char: str = self.advance() + match char: + case "(": + self.add_token(TokenType.LEFT_PAREN) + case ")": + self.add_token(TokenType.RIGHT_PAREN) + case "[": + self.add_token(TokenType.LEFT_BRACKET) + case "]": + self.add_token(TokenType.RIGHT_BRACKET) + case "{": + 
self.add_token(TokenType.LEFT_BRACE) + case "}": + self.add_token(TokenType.RIGHT_BRACE) + case "<": + self.add_token( + TokenType.LESS_EQUAL if self.match("=") else TokenType.LESS + ) + case ">": + self.add_token( + TokenType.GREATER_EQUAL if self.match("=") else TokenType.GREATER + ) + case "=": + self.add_token( + TokenType.EQUAL_EQUAL if self.match("=") else TokenType.EQUAL + ) + case "!": + if self.match("="): + self.add_token(TokenType.BANG_EQUAL) + else: + self.error("Unexpected single bang. Did you mean '!=' ?") + case ":": + self.add_token(TokenType.COLON) + case ",": + self.add_token(TokenType.COMMA) + case "_": + self.add_token(TokenType.UNDERSCORE) + case "+": + self.add_token(TokenType.PLUS) + case "-": + self.add_token(TokenType.MINUS) + case "*": + self.add_token(TokenType.STAR) + case "/": + if self.match("/"): + self.scan_comment() + elif self.match("*"): + self.scan_comment_multiline() + else: + self.add_token(TokenType.SLASH) + case "\n": + self.add_token(TokenType.NEWLINE) + case " " | "\r" | "\t": + # Consume all whitespace characters until EOL or EOF + while ( + self.peek().isspace() + and self.peek() != "\n" + and not self.is_at_end() + ): + self.advance() + self.add_token(TokenType.WHITESPACE) + case _: + if char.isdigit(): + self.scan_number() + elif char.isalpha(): + self.scan_identifier() + else: + self.error("Unexpected character") + return None + + def scan_number(self): + """Scan the rest of number and add it as a token + + This method handles both simple integers and floats. Scientific notation + and base prefixes (0x, 0b, 0o) are not supported + """ + while self.peek().isdigit(): + self.advance() + + if self.peek() == "." 
and self.peek_next().isdigit(): + self.advance() + while self.peek().isdigit(): + self.advance() + + value: float = float(self.source[self.start : self.idx]) + self.add_token(TokenType.NUMBER, value) + + def scan_identifier(self): + """Scan the rest of an identifier and add it as a token + + An identifier starts with a letter, followed by any number of + alphanumerical characters or underscores + """ + while self.peek().isalnum() or self.peek() == "_": + self.advance() + + lexeme: str = self.source[self.start : self.idx] + token_type: TokenType = MIDAS_KEYWORDS.get(lexeme, TokenType.IDENTIFIER) + self.add_token(token_type) + + def scan_comment(self): + """Scan the rest of a comment and add it as a token + + A comment starts with `//` and ends at the EOL/EOF + """ + while self.peek() != "\n" and not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) + + def scan_comment_multiline(self): + """Scan the rest of a multiline comment and add it as a token + + A multiline comment starts with `/*` and ends with `*/` or at the EOF + """ + while ( + not (self.peek() == "*" and self.peek_next() == "/") + and not self.is_at_end() + ): + self.advance() + if not self.is_at_end(): + self.advance() + if not self.is_at_end(): + self.advance() + self.add_token(TokenType.COMMENT) diff --git a/lexer/position.py b/lexer/position.py new file mode 100644 index 0000000..306e24d --- /dev/null +++ b/lexer/position.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass +from typing import Optional + + +@dataclass(frozen=True) +class Position: + """A simple structure to store the position of a token""" + file: Optional[str] + line: int + column: int + + def __repr__(self): + return f"{self.file or ''}L{self.line}:{self.column}" diff --git a/lexer/token.py b/lexer/token.py new file mode 100644 index 0000000..70a7a1b --- /dev/null +++ b/lexer/token.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any + +from 
lexer.position import Position + + +class TokenType(Enum): + # Punctuation + LEFT_PAREN = auto() + RIGHT_PAREN = auto() + LEFT_BRACKET = auto() + RIGHT_BRACKET = auto() + LEFT_BRACE = auto() + RIGHT_BRACE = auto() + COLON = auto() + COMMA = auto() + UNDERSCORE = auto() + + # Operators + PLUS = auto() + MINUS = auto() + STAR = auto() + SLASH = auto() + GREATER = auto() + GREATER_EQUAL = auto() + LESS = auto() + LESS_EQUAL = auto() + EQUAL = auto() + EQUAL_EQUAL = auto() + BANG_EQUAL = auto() + + # Literals + IDENTIFIER = auto() + NUMBER = auto() + TRUE = auto() + FALSE = auto() + NONE = auto() + + # Keywords + TYPE = auto() + OP = auto() + CONSTRAINT = auto() + + # Misc + COMMENT = auto() + WHITESPACE = auto() + EOF = auto() + NEWLINE = auto() + + +@dataclass(frozen=True) +class Token: + """A scanned token""" + + type: TokenType + lexeme: str + value: Any + position: Position diff --git a/parser/annotations.py b/parser/annotations.py new file mode 100644 index 0000000..0bf99d6 --- /dev/null +++ b/parser/annotations.py @@ -0,0 +1,152 @@ +from typing import Optional + +from core.ast.annotations import ( + AnnotationStmt, + ConstraintExpr, + Expr, + LiteralExpr, + SchemaElementExpr, + SchemaExpr, + Stmt, + TypeExpr, + WildcardExpr, +) +from lexer.token import Token, TokenType +from parser.base import Parser +from parser.errors import ParsingError + + +class AnnotationParser(Parser): + """A simple parser for custom type annotations""" + + SYNC_BOUNDARY: set[TokenType] = set() + + def parse(self) -> Optional[Stmt]: + stmt: Optional[Stmt] = None + try: + stmt = self.annotation() + except ParsingError: + self.synchronize() + if not self.is_at_end(): + self.error(self.peek(), "Extra tokens") + return stmt + + def synchronize(self): + """Skip tokens until a synchronization boundary is found + + This method allows gracefully recovering from a parse error + to a safe place and continue parsing + """ + self.advance() + while not self.is_at_end(): + if self.peek().type in 
self.SYNC_BOUNDARY: + return + self.advance() + + def annotation(self) -> AnnotationStmt: + """Parse an annotation + + An annotation is written as `Type` or `Type[Schema]` + + Returns: + AnnotationStmt: the parsed annotation statement + """ + + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type identifier") + schema: Optional[SchemaExpr] = None + if self.match(TokenType.LEFT_BRACKET): + schema = self.schema() + return AnnotationStmt(name=name, schema=schema) + + def type_expr(self) -> TypeExpr: + """Parse a type expression + + Returns: + TypeExpr: the parsed type expression + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + constraints: list[ConstraintExpr] = [] + + while not self.is_at_end() and self.match(TokenType.PLUS): + self.consume(TokenType.LEFT_PAREN, "Expected '(' before type constraint") + constraints.append(self.constraint_expr()) + self.consume(TokenType.RIGHT_PAREN, "Expected ')' after type constraint") + + return TypeExpr(name=name, constraints=constraints) + + def constraint_expr(self) -> ConstraintExpr: + """Parse a type constraint + + Returns: + ConstraintExpr: the parsed type constraint expression + """ + + left: Expr = self.constraint_value() + op: Token = self.constraint_operator() + right: Expr = self.constraint_value() + return ConstraintExpr(left=left, op=op, right=right) + + def constraint_value(self) -> Expr: + if self.match(TokenType.UNDERSCORE): + return WildcardExpr(self.previous()) + return self.literal() + + def literal(self) -> LiteralExpr: + if self.match(TokenType.FALSE): + return LiteralExpr(False) + if self.match(TokenType.TRUE): + return LiteralExpr(True) + if self.match(TokenType.NONE): + return LiteralExpr(None) + + if self.match(TokenType.NUMBER): + return LiteralExpr(self.previous().value) + + raise self.error(self.peek(), "Expected literal") + + def constraint_operator(self) -> Token: + if self.match(TokenType.LESS, TokenType.LESS_EQUAL, TokenType.GREATER, TokenType.GREATER_EQUAL, 
TokenType.EQUAL_EQUAL, TokenType.BANG_EQUAL): + return self.previous() + raise self.error(self.peek(), "Expected constraint operator") + + def schema(self) -> SchemaExpr: + """Parse a schema definition + + A comma separated list of schema elements + + Returns: + SchemaExpr: the parsed schema expression + """ + left: Token = self.previous() + elements: list[Expr] = [] + while not self.check(TokenType.RIGHT_BRACKET) and not self.is_at_end(): + elements.append(self.schema_element()) + if not self.check(TokenType.RIGHT_BRACKET): + self.consume(TokenType.COMMA, "Expected ',' between schema elements") + + right: Token = self.consume(TokenType.RIGHT_BRACKET, "Unclosed schema") + return SchemaExpr(left=left, elements=elements, right=right) + + def schema_element(self) -> SchemaElementExpr: + """Parse a schema element + + An anonymous element (`_`), a type, an untyped named column (`name: _`), + or a named column (`name: Type`) + + Returns: + SchemaElementExpr: the parsed schema element expression + """ + if self.match(TokenType.UNDERSCORE): + return SchemaElementExpr(name=None, type=None) + + if not self.check(TokenType.IDENTIFIER): + raise self.error(self.peek(), "Expected schema element") + + name: Optional[Token] = None + type: Optional[TypeExpr] = None + if self.check_next(TokenType.COLON): + name = self.advance() + self.advance() + if not self.match(TokenType.UNDERSCORE): + type = self.type_expr() + return SchemaElementExpr(name=name, type=type) diff --git a/parser/base.py b/parser/base.py new file mode 100644 index 0000000..74962db --- /dev/null +++ b/parser/base.py @@ -0,0 +1,183 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Generic, TypeVar + +from lexer.token import Token, TokenType +from parser.errors import ParsingError + + +@dataclass(frozen=True) +class TokenError: + """A parsing error linked to a particular token""" + + token: Token + message: str + + def get_report(self) -> str: + """Get a detailed error 
message + + Returns: + str: the complete error message + """ + where: str = f"'{self.token.lexeme}'" + if self.token.type == TokenType.EOF: + where = "end" + return f"({self.token.position}) Error at {where}: {self.message}" + + +T = TypeVar("T") + + +class Parser(ABC, Generic[T]): + """An abstract parser which provides methods to easily extend it into a concrete one + + This implementation is based on the [_Crafting Interpreters_][1] book by Robert Nystrom, + more specifically on my [previous Python implementation](https://git.kb28.ch/HEL/pebble) + + [1]: https://craftinginterpreters.com/ + """ + + IGNORE: set[TokenType] = { + TokenType.WHITESPACE, + TokenType.COMMENT, + TokenType.NEWLINE, + } + + def __init__(self, tokens: list[Token]) -> None: + """Create a new parser to parse the given tokens + + Args: + tokens (list[Token]): the tokens to parse + """ + self.tokens: list[Token] = list( + filter(lambda t: t.type not in self.IGNORE, tokens) + ) + self.current: int = 0 + self.length: int = len(self.tokens) + self.errors: list[TokenError] = [] + + def error(self, token: Token, message: str): + """Record an error + + Args: + token (Token): the token at which the error was detected + message (str): a message explaining the error + + Returns: + ParsingError: the parsing error to raise + """ + self.errors.append(TokenError(token=token, message=message)) + return ParsingError() + + @abstractmethod + def parse(self) -> T: + """Parse the tokens + + Returns: + T: the parsed element(s) + """ + pass + + def is_at_end(self) -> bool: + """Whether the parser is at the end of the token list + + Returns: + bool: True if the current index is at the end of the token list + """ + return self.peek().type == TokenType.EOF + + def peek(self) -> Token: + """Get the current token without advancing + + Returns: + Token: the current token + """ + return self.tokens[self.current] + + def previous(self) -> Token: + """Get the previous token + + This function is unsafe and will raise an 
IndexError if called when + the parser is at the begin of the token list + + Returns: + Token: the previous token + """ + return self.tokens[self.current - 1] + + def check(self, token_type: TokenType) -> bool: + """Check whether the current token is of the given type + + This function always returns False if the parser is at the EOF token + + Args: + token_type (TokenType): the type of token to check + + Returns: + bool: True if the current token is of the given type and not EOF + """ + if self.is_at_end(): + return False + return self.peek().type == token_type + + def check_next(self, token_type: TokenType) -> bool: + """Check whether the next token is of the given type + + This function always returns False if the parser is at the EOF token + + Args: + token_type (TokenType): the type of token to check + + Returns: + bool: True if the next token is of the given type and not EOF + """ + if self.is_at_end(): + return False + if self.current + 1 >= self.length: + return False + token: Token = self.tokens[self.current + 1] + if token.type == TokenType.EOF: + return False + return token.type == token_type + + def advance(self) -> Token: + """Consume and return the current token, if not at the EOF + + Returns: + Token: the current token, before advancing + """ + if not self.is_at_end(): + self.current += 1 + return self.previous() + + def match(self, *types: TokenType) -> bool: + """Consume the next token if it matches one of the given types + + Returns: + bool: whether a token was matched and consumed + """ + for token_type in types: + if self.check(token_type): + self.advance() + return True + return False + + def consume(self, token_type: TokenType, error_msg: str) -> Token: + """Consume the current token if it matches the given type or raise an error + + If the current token doesn't match the given type, an error is raised + with the provided message + + Args: + token_type (TokenType): the expected token type + error_msg (str): the error message if the token
doesn't match + + Raises: + ParsingError: if the current token doesn't match the given type + + Returns: + Token: the current token which matched the given type + """ + if self.check(token_type): + return self.advance() + raise self.error(self.peek(), error_msg) diff --git a/parser/errors.py b/parser/errors.py new file mode 100644 index 0000000..e8e65fb --- /dev/null +++ b/parser/errors.py @@ -0,0 +1,2 @@ +class ParsingError(RuntimeError): + pass diff --git a/parser/midas.py b/parser/midas.py new file mode 100644 index 0000000..a49bc5e --- /dev/null +++ b/parser/midas.py @@ -0,0 +1,217 @@ +from typing import Optional + +from core.ast.midas import ( + ConstraintExpr, + ConstraintStmt, + Expr, + LiteralExpr, + OpStmt, + PropertyStmt, + Stmt, + TypeBodyExpr, + TypeExpr, + TypeStmt, + WildcardExpr, +) +from lexer.token import Token, TokenType +from parser.base import Parser +from parser.errors import ParsingError + + +class MidasParser(Parser): + """A simple parser for midas type definitions""" + + SYNC_BOUNDARY: set[TokenType] = {TokenType.TYPE, TokenType.OP, TokenType.CONSTRAINT} + + def parse(self) -> list[Stmt]: + statements: list[Stmt] = [] + while not self.is_at_end(): + stmt: Optional[Stmt] = self.declaration() + if stmt is None: + print("Early stop") + break + statements.append(stmt) + return statements + + def synchronize(self): + """Skip tokens until a synchronization boundary is found + + This method allows gracefully recovering from a parse error + to a safe place and continue parsing + """ + self.advance() + while not self.is_at_end(): + if self.previous().type == TokenType.NEWLINE: + return + if self.peek().type in self.SYNC_BOUNDARY: + return + self.advance() + + def declaration(self) -> Optional[Stmt]: + """Try and parse a declaration + + Any parsing error is caught and None is returned + + Returns: + Optional[Stmt]: the parsed Midas statement, or None if a ParsingError was raised + """ + try: + if self.match(TokenType.TYPE): + return
self.type_declaration() + if self.match(TokenType.OP): + return self.op_declaration() + if self.match(TokenType.CONSTRAINT): + return self.constraint_declaration() + raise self.error(self.peek(), "Unexpected token") + except ParsingError: + self.synchronize() + return None + + def type_declaration(self) -> TypeStmt: + """Parse a type declaration + + A type declaration is written `type Name` optionally followed by a brace-wrapped body + + Returns: + TypeStmt: the parsed type declaration statement + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + self.consume(TokenType.LESS, "Expected '<' after type name") + bases: list[TypeExpr] = [] + while not self.check(TokenType.GREATER) and not self.is_at_end(): + bases.append(self.type_expr()) + if not self.check(TokenType.GREATER): + self.consume(TokenType.COMMA, "Expected ',' between type bases") + self.consume(TokenType.GREATER, "Expected '>' after base type") + + body: Optional[TypeBodyExpr] = None + + if self.check(TokenType.LEFT_BRACE): + body = self.type_body_expr() + return TypeStmt(name=name, bases=bases, body=body) + + def type_expr(self) -> TypeExpr: + """Parse a type expression + + Returns: + TypeExpr: the parsed type expression + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected type name") + constraints: list[ConstraintExpr] = [] + + while not self.is_at_end() and self.match(TokenType.PLUS): + self.consume(TokenType.LEFT_PAREN, "Expected '(' before type constraint") + constraints.append(self.constraint_expr()) + self.consume(TokenType.RIGHT_PAREN, "Expected ')' after type constraint") + + return TypeExpr(name=name, constraints=constraints) + + def constraint_expr(self) -> ConstraintExpr: + """Parse a type constraint + + Returns: + ConstraintExpr: the parsed type constraint expression + """ + + left: Expr = self.constraint_value() + op: Token = self.constraint_operator() + right: Expr = self.constraint_value() + return ConstraintExpr(left=left, op=op, right=right) + + 
def constraint_value(self) -> Expr: + if self.match(TokenType.UNDERSCORE): + return WildcardExpr(self.previous()) + return self.literal() + + def literal(self) -> LiteralExpr: + if self.match(TokenType.FALSE): + return LiteralExpr(False) + if self.match(TokenType.TRUE): + return LiteralExpr(True) + if self.match(TokenType.NONE): + return LiteralExpr(None) + + if self.match(TokenType.NUMBER): + return LiteralExpr(self.previous().value) + + raise self.error(self.peek(), "Expected literal") + + def constraint_operator(self) -> Token: + if self.match( + TokenType.LESS, + TokenType.LESS_EQUAL, + TokenType.GREATER, + TokenType.GREATER_EQUAL, + TokenType.EQUAL_EQUAL, + TokenType.BANG_EQUAL, + ): + return self.previous() + raise self.error(self.peek(), "Expected constraint operator") + + def type_body_expr(self) -> TypeBodyExpr: + """Parse a type definition body + + A type definition body is a set of whitespace-separated + property statements enclosed in curly braces + + Returns: + TypeBodyExpr: the parsed type body expression + """ + self.consume(TokenType.LEFT_BRACE, "Expected '{' to start type body") + properties: list[PropertyStmt] = [] + while not self.check(TokenType.RIGHT_BRACE) and not self.is_at_end(): + properties.append(self.property_stmt()) + self.consume(TokenType.RIGHT_BRACE, "Unclosed type body") + return TypeBodyExpr(properties=properties) + + def property_stmt(self) -> PropertyStmt: + """Parse a property statement + + A type property statement is written `name: Type` + + Returns: + PropertyStmt: the parsed property statement + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected property name") + self.consume(TokenType.COLON, "Expected ':' after property name") + type: TypeExpr = self.type_expr() + return PropertyStmt(name=name, type=type) + + def op_declaration(self) -> OpStmt: + """Parse an operation definition + + An operation is written `op operator = ` where `operator` can be any single token + + Returns: + OpStmt: the parsed operation 
statement + """ + self.consume(TokenType.LESS, "Expected '<' before first type") + left: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after first type") + + op: Token = self.advance() + + self.consume(TokenType.LESS, "Expected '<' before second type") + right: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after second type") + + self.consume(TokenType.EQUAL, "Expected '=' after second type") + + self.consume(TokenType.LESS, "Expected '<' before result type") + result: TypeExpr = self.type_expr() + self.consume(TokenType.GREATER, "Expected '>' after result type") + + return OpStmt(left=left, op=op, right=right, result=result) + + def constraint_declaration(self) -> ConstraintStmt: + """Parse a type constraint declaration + + A constraint is written `constraint Name = constraint_expression` + + Returns: + ConstraintStmt: the parsed constraint declaration statement + """ + name: Token = self.consume(TokenType.IDENTIFIER, "Expected constraint name") + self.consume(TokenType.EQUAL, "Expected '=' after constraint name") + constraint: ConstraintExpr = self.constraint_expr() + return ConstraintStmt(name=name, constraint=constraint) diff --git a/syntax/annotations.ebnf b/syntax/annotations.ebnf new file mode 100644 index 0000000..73caf4f --- /dev/null +++ b/syntax/annotations.ebnf @@ -0,0 +1,20 @@ +identifier ::= '[a-zA-Z][a-zA-Z0-9_]*' + +integer ::= '\d+' +number ::= integer ["."
integer] +boolean ::= "False" | "True" +none ::= "None" + +value ::= number | boolean | none +lambda-value ::= "_" | value +lambda-operator ::= ">" | "<" | ">=" | "<=" | "==" | "!=" +lambda ::= lambda-value lambda-operator lambda-value + +constraint ::= identifier | "(" lambda ")" +base-type ::= identifier +type ::= base-type { "+" constraint } + +column-type ::= type | "_" +column-def ::= [ identifier ":" ] column-type + +frame-def ::= column-def { "," column-def } diff --git a/syntax/annotations.typ b/syntax/annotations.typ new file mode 100644 index 0000000..8c66031 --- /dev/null +++ b/syntax/annotations.typ @@ -0,0 +1,74 @@ +#import "@preview/fervojo:0.1.1": render + +#let value = ``` +{[`value` < + [`number` 'digit' * ! ], + [`boolean` <"False", "True">], + [`none` "None"] +>]} +``` + +#let constraint = ``` +{[`constraint` <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'>]} +``` + +#let type-with-constraints = ``` +{[`type-with-constraints` 'identifier' ]} +``` + +#let column-def = ``` +{[`column-def` <"_", 'type-with-constraints'>]} +``` + +#let frame-def = ``` +{[`frame-def` 'column-def' * ","]} +``` + +#let annotation = ``` +{[`annotation` 'identifier' ]} +``` + +#let rules = ( + value, + constraint, + type-with-constraints, + column-def, + frame-def, + annotation, +) + +#set text(font: "Source Sans 3") + += Type annotation syntax + +#for rule in rules { + render(rule) +} + +/* +#let by-name = ( + annotation: annotation, + frame-def: frame-def, + column-def: column-def, + type-with-constraints: type-with-constraints, + constraint: constraint, + value: value, +) + +#let substitute(base-rule) = { + let new-rule = base-rule + for (key, rule) in by-name.pairs() { + new-rule = new-rule.replace("'" + key + "'", rule.text.slice(1, -1)) + } + if new-rule != base-rule { + new-rule = substitute(new-rule) + } + return new-rule +} + +#let combined = raw(substitute(annotation.text)) + + +#set page(flipped: true) +#render(combined) +*/ \ No newline at end 
of file diff --git a/syntax/midas.ebnf b/syntax/midas.ebnf new file mode 100644 index 0000000..71b4740 --- /dev/null +++ b/syntax/midas.ebnf @@ -0,0 +1,26 @@ +identifier ::= '[a-zA-Z][a-zA-Z0-9_]*' + +integer ::= '\d+' +number ::= integer ["." integer] +boolean ::= "false" | "true" +none ::= "none" + +value ::= number | boolean | none +lambda-value ::= "_" | value +lambda-operator ::= ">" | "<" | ">=" | "<=" | "==" | "!=" +lambda ::= lambda-value lambda-operator lambda-value + +constraint ::= identifier | "(" lambda ")" +base-type ::= identifier +type ::= base-type { "+" constraint } + +type-property ::= 'identifier' ":" 'type' +type-body ::= "{" { 'type-property' } "}" + +operation-type ::= "<" 'type' ">" + +type-statement ::= "type" 'identifier' "<" 'type' {"," 'type'} ">" ['type-body'] +operation-statement ::= "op" 'operation-type' 'operator' 'operation-type' "=" 'operation-type' +constraint-statement ::= "constraint" 'identifier' "=" 'lambda' + +statement ::= type-statement | operation-statement | constraint-statement \ No newline at end of file diff --git a/syntax/midas.typ b/syntax/midas.typ new file mode 100644 index 0000000..17db6d6 --- /dev/null +++ b/syntax/midas.typ @@ -0,0 +1,97 @@ +#import "@preview/fervojo:0.1.1": render + +#let value = ``` +{[`value` < + [`number` 'digit' * !
], + [`boolean` <"False", "True">], + [`none` "None"] +>]} +``` + +#let constraint = ``` +{[`constraint` <"_", 'value'> <">", "<", ">=", "<=", "==", "!="> <"_", 'value'>]} +``` + +#let type-with-constraints = ``` +{[`type-with-constraints` 'identifier' ]} +``` + +#let type-property = ``` +{[`type-property` 'identifier' ":" 'type-with-constraints']} +``` + +#let type-body = ``` +{[`type-body` "{" "}"]} +``` + +#let operation-type = ``` +{[`operation-type` "<" 'type-with-constraints' ">"]} +``` + +#let type-statement = ``` +{[`type-statement` "type" 'identifier' "<" 'type-with-constraints'*"," ">" ]} +``` + +#let operation-statement = ``` +{[`operation-statement` "op" 'operation-type' "operator" 'operation-type' "=" 'operation-type']} +``` + +#let constraint-statement = ``` +{[`constraint-statement` "constraint" 'identifier' "=" 'constraint']} +``` + +#let statement = ``` +{[`statement` <'type-statement', 'operation-statement', 'constraint-statement'>]} +``` + +#let rules = ( + value, + constraint, + type-with-constraints, + type-property, + type-body, + operation-type, + type-statement, + operation-statement, + constraint-statement, + statement, +) + +#set text(font: "Source Sans 3") + += Midas type definition syntax + +#for rule in rules { + render(rule) +} + +/* +#let by-name = ( + value: value, + constraint: constraint, + type-with-constraints: type-with-constraints, + type-property: type-property, + type-body: type-body, + operation-type: operation-type, + type-statement: type-statement, + operation-statement: operation-statement, + constraint-statement: constraint-statement, +) + +#let substitute(base-rule) = { + let new-rule = base-rule + for (key, rule) in by-name.pairs() { + new-rule = new-rule.replace("'" + key + "'", rule.text.slice(1, -1)) + } + if new-rule != base-rule { + new-rule = substitute(new-rule) + } + return new-rule.replace(regex("`.*?`"), "") +} + +#let combined = raw(substitute(statement.text)) + + +#set page(flipped: true) +#render(combined) 
+*/ \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..0476438 --- /dev/null +++ b/test.py @@ -0,0 +1,52 @@ +import importlib +from pathlib import Path + +from core.ast.printer import AnnotationAstPrinter, MidasAstPrinter +from lexer.annotations import AnnotationLexer +from lexer.midas import MidasLexer +from lexer.token import Token +from parser.annotations import AnnotationParser +from parser.midas import MidasParser + + +def test_annotation(): + # Frame annotation + mod = importlib.import_module("examples.00_syntax_prototype.01_simple_types") + + annotation: str = mod.__annotations__["df"] + lexer: AnnotationLexer = AnnotationLexer(annotation, "01_simple_types.py") + tokens: list[Token] = lexer.process() + # print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) + + parser = AnnotationParser(tokens) + parsed = parser.parse() + print(parsed) + for err in parser.errors: + print(err.get_report()) + printer = AnnotationAstPrinter() + if parsed is not None: + print(printer.print(parsed)) + + +def test_midas(): + # Midas type definitions + path: Path = Path("examples") / "00_syntax_prototype" / "02_custom_types.midas" + definitions: str = path.read_text() + midas_lexer: MidasLexer = MidasLexer(definitions, path.name) + tokens: list[Token] = midas_lexer.process() + # print([f"{t.type.name}('{t.lexeme}')" for t in tokens]) + + parser = MidasParser(tokens) + parsed = parser.parse() + print(parsed) + for err in parser.errors: + print(err.get_report()) + printer = MidasAstPrinter() + for stmt in parsed: + if stmt is None: + print("None") + continue + print(printer.print(stmt)) + + +test_midas() diff --git a/tests/lexer/test_annotation_lexer.py b/tests/lexer/test_annotation_lexer.py new file mode 100644 index 0000000..33a83a1 --- /dev/null +++ b/tests/lexer/test_annotation_lexer.py @@ -0,0 +1,129 @@ +from typing import Any + +import pytest + +from lexer.annotations import AnnotationLexer +from lexer.token import Token, TokenType 
+ + +def scan(source: str) -> list[Token]: + return AnnotationLexer(source).process() + + +def assert_n_tokens(tokens: list[Token], n: int): + assert len(tokens) == n + 1 + assert tokens[-1].type == TokenType.EOF + + +@pytest.mark.parametrize( + "src,expected", + [ + ("(", TokenType.LEFT_PAREN), + (")", TokenType.RIGHT_PAREN), + ("[", TokenType.LEFT_BRACKET), + ("]", TokenType.RIGHT_BRACKET), + (":", TokenType.COLON), + (",", TokenType.COMMA), + ("_", TokenType.UNDERSCORE), + ], +) +def test_punctuation(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("+", TokenType.PLUS), + (">", TokenType.GREATER), + (">=", TokenType.GREATER_EQUAL), + ("<", TokenType.LESS), + ("<=", TokenType.LESS_EQUAL), + ("=", TokenType.EQUAL), + ("==", TokenType.EQUAL_EQUAL), + ("!=", TokenType.BANG_EQUAL), + ], +) +def test_operators(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("a", TokenType.IDENTIFIER), + ("foo", TokenType.IDENTIFIER), + ("foo1", TokenType.IDENTIFIER), + ("foo_", TokenType.IDENTIFIER), + ("foo_bar1_baz2", TokenType.IDENTIFIER), + ("FOO_BAR1_BAZ2", TokenType.IDENTIFIER), + ("True", TokenType.TRUE), + ("False", TokenType.FALSE), + ("None", TokenType.NONE), + ], +) +def test_identifiers_keywords(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("#", TokenType.COMMENT), + ("# This is a comment", TokenType.COMMENT), + (" ", TokenType.WHITESPACE), + ("\t", TokenType.WHITESPACE), + ("\r", TokenType.WHITESPACE), + (" \t \t", TokenType.WHITESPACE), + ("\n", TokenType.NEWLINE), + ], +) +def test_misc(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + 
assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected_type,expected_value", + [ + ("0", TokenType.NUMBER, 0), + ("0.0", TokenType.NUMBER, 0), + ("1234.56", TokenType.NUMBER, 1234.56), + ], +) +def test_literals(src: str, expected_type: TokenType, expected_value: Any): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected_type + assert tokens[0].value == expected_value + + +def test_single_bang_error(): + with pytest.raises(SyntaxError): + scan("!") + + +@pytest.mark.parametrize( + "src", + [ + "-", + "*", + "/", + "{", + "}", + "@", + '"', + "'", + ".", + ], +) +def test_unexpected_character(src: str): + with pytest.raises(SyntaxError): + scan(src) diff --git a/tests/lexer/test_midas_lexer.py b/tests/lexer/test_midas_lexer.py new file mode 100644 index 0000000..9cffff2 --- /dev/null +++ b/tests/lexer/test_midas_lexer.py @@ -0,0 +1,129 @@ +from typing import Any + +import pytest + +from lexer.midas import MidasLexer +from lexer.token import Token, TokenType + + +def scan(source: str) -> list[Token]: + return MidasLexer(source).process() + + +def assert_n_tokens(tokens: list[Token], n: int): + assert len(tokens) == n + 1 + assert tokens[-1].type == TokenType.EOF + + +@pytest.mark.parametrize( + "src,expected", + [ + ("(", TokenType.LEFT_PAREN), + (")", TokenType.RIGHT_PAREN), + ("[", TokenType.LEFT_BRACKET), + ("]", TokenType.RIGHT_BRACKET), + ("{", TokenType.LEFT_BRACE), + ("}", TokenType.RIGHT_BRACE), + (":", TokenType.COLON), + (",", TokenType.COMMA), + ("_", TokenType.UNDERSCORE), + ], +) +def test_punctuation(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("+", TokenType.PLUS), + ("-", TokenType.MINUS), + ("*", TokenType.STAR), + ("/", TokenType.SLASH), + (">", TokenType.GREATER), + (">=", TokenType.GREATER_EQUAL), + ("<", 
TokenType.LESS), + ("<=", TokenType.LESS_EQUAL), + ("=", TokenType.EQUAL), + ("==", TokenType.EQUAL_EQUAL), + ("!=", TokenType.BANG_EQUAL), + ], +) +def test_operators(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("a", TokenType.IDENTIFIER), + ("foo", TokenType.IDENTIFIER), + ("foo1", TokenType.IDENTIFIER), + ("foo_", TokenType.IDENTIFIER), + ("foo_bar1_baz2", TokenType.IDENTIFIER), + ("FOO_BAR1_BAZ2", TokenType.IDENTIFIER), + ("true", TokenType.TRUE), + ("false", TokenType.FALSE), + ("none", TokenType.NONE), + ], +) +def test_identifiers_keywords(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected", + [ + ("// This is a comment", TokenType.COMMENT), + ("/* This is a comment */", TokenType.COMMENT), + (" ", TokenType.WHITESPACE), + ("\t", TokenType.WHITESPACE), + ("\r", TokenType.WHITESPACE), + (" \t \t", TokenType.WHITESPACE), + ("\n", TokenType.NEWLINE), + ], +) +def test_misc(src: str, expected: TokenType): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected + + +@pytest.mark.parametrize( + "src,expected_type,expected_value", + [ + ("0", TokenType.NUMBER, 0), + ("0.0", TokenType.NUMBER, 0), + ("1234.56", TokenType.NUMBER, 1234.56), + ], +) +def test_literals(src: str, expected_type: TokenType, expected_value: Any): + tokens: list[Token] = scan(src) + assert_n_tokens(tokens, 1) + assert tokens[0].type == expected_type + assert tokens[0].value == expected_value + + +def test_single_bang_error(): + with pytest.raises(SyntaxError): + scan("!") + + +@pytest.mark.parametrize( + "src", + [ + "@", + '"', + "'", + ".", + ], +) +def test_unexpected_character(src: str): + with pytest.raises(SyntaxError): + scan(src) diff --git 
a/tests/parser/test_annotation_parser.py b/tests/parser/test_annotation_parser.py new file mode 100644 index 0000000..9c034dd --- /dev/null +++ b/tests/parser/test_annotation_parser.py @@ -0,0 +1,130 @@ +from typing import Optional + +import pytest + +from core.ast.annotations import ( + AnnotationStmt, + ConstraintExpr, + Expr, + LiteralExpr, + SchemaElementExpr, + SchemaExpr, + Stmt, + TypeExpr, + WildcardExpr, +) +from lexer.annotations import AnnotationLexer +from lexer.position import Position +from lexer.token import Token +from parser.annotations import AnnotationParser + + +class AstSerializer(Stmt.Visitor[str], Expr.Visitor[str]): + def serialize(self, stmt: Stmt): + return stmt.accept(self) + + def visit_annotation_stmt(self, stmt: AnnotationStmt) -> str: + schema: str = "" + if stmt.schema is not None: + schema = " " + stmt.schema.accept(self) + return f"(annotation {stmt.name.lexeme}{schema})" + + def visit_schema_expr(self, expr: SchemaExpr) -> str: + elements: list[str] = [elmt.accept(self) for elmt in expr.elements] + return f"(schema {' '.join(elements)})" + + def visit_schema_element_expr(self, expr: SchemaElementExpr) -> str: + name: str = expr.name.lexeme if expr.name is not None else "_" + type: str = expr.type.accept(self) if expr.type is not None else "_" + return f"({name} {type})" + + def visit_type_expr(self, expr: TypeExpr) -> str: + res: str = f"({expr.name.lexeme}" + for constraint in expr.constraints: + res += " " + constraint.accept(self) + res += ")" + return res + + def visit_constraint_expr(self, expr: ConstraintExpr) -> str: + return f"(constraint {expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)})" + + def visit_wildcard_expr(self, expr: WildcardExpr) -> str: + return "(_)" + + def visit_literal_expr(self, expr: LiteralExpr) -> str: + return f"({expr.value})" + + +def parse(source: str) -> Optional[Stmt]: + tokens: list[Token] = AnnotationLexer(source).process() + return AnnotationParser(tokens).parse() + + +def 
must_parse(source: str) -> Stmt: + stmt: Optional[Stmt] = parse(source) + assert stmt is not None + return stmt + + +def ast_str(source: str) -> str: + stmt: Stmt = must_parse(source) + return AstSerializer().serialize(stmt) + + +@pytest.mark.parametrize( + "src,expected", + [ + ("Type", "(annotation Type)"), + ("Type[]", "(annotation Type (schema ))"), + ( + """ + Frame[ + verified: bool, + birth_year: int, + height: float + ( _ > 0 ) + ( _ < 250 ), + name: str, + date: datetime, + float, # unnamed + unknown: _, # untyped + _ # unnamed and untyped + ] + """, + "(annotation Frame (schema (verified (bool)) (birth_year (int)) (height (float (constraint (_) > (0.0)) (constraint (_) < (250.0)))) (name (str)) (date (datetime)) (_ (float)) (unknown _) (_ _)))", + ), + ], +) +def test_expressions(src: str, expected: str): + assert ast_str(src) == expected + + +@pytest.mark.parametrize( + "src,pos,should_fail", + [ + ("", (1, 1), True), + ("42", (1, 1), True), + ("True", (1, 1), True), + ("Type[", (1, 6), True), + ("Type[] Type2", (1, 8), False), + ("Type[bool:]", (1, 11), True), + ("Type[3]", (1, 6), True), + ("Type[bool float]", (1, 11), True), + ("Type[bool (_ < 2)]", (1, 11), True), + ("Type[bool + _ < 2)]", (1, 13), True), + ("Type[bool + (_ < 2]", (1, 19), True), + ("Type[bool + (< 2)]", (1, 14), True), + ("Type[bool + (_ + 2)]", (1, 16), True), + ("Type[bool + (Foo + Bar)]", (1, 14), True), + # ("Type[bool,]", (1, 11), True), # trailing comma is accepted, TODO: update parser or EBNF + ("Type[bool, Type[]]", (1, 16), True), + ("Type[foo: 3]", (1, 11), True), + ], +) +def test_parsing_error(src: str, pos: tuple[int, int], should_fail: bool): + tokens: list[Token] = AnnotationLexer(src).process() + parser: AnnotationParser = AnnotationParser(tokens) + stmt: Optional[Stmt] = parser.parse() + if should_fail: + assert stmt is None + assert len(parser.errors) != 0 + error_pos: Position = parser.errors[0].token.position + assert (error_pos.line, error_pos.column) == pos 
diff --git a/tests/parser/test_midas_parser.py b/tests/parser/test_midas_parser.py new file mode 100644 index 0000000..28a6aa7 --- /dev/null +++ b/tests/parser/test_midas_parser.py @@ -0,0 +1,202 @@ +import textwrap + +import pytest + +from core.ast.midas import ( + ConstraintExpr, + ConstraintStmt, + Expr, + LiteralExpr, + OpStmt, + PropertyStmt, + Stmt, + TypeBodyExpr, + TypeExpr, + TypeStmt, + WildcardExpr, +) +from lexer.midas import MidasLexer +from lexer.position import Position +from lexer.token import Token +from parser.midas import MidasParser + + +class AstSerializer(Stmt.Visitor[str], Expr.Visitor[str]): + def serialize(self, stmt: Stmt): + return stmt.accept(self) + + def visit_type_stmt(self, stmt: TypeStmt) -> str: + res: str = f"(type_def {stmt.name.lexeme}" + for base in stmt.bases: + res += " " + base.accept(self) + if stmt.body is not None: + res += " " + stmt.body.accept(self) + res += ")" + return res + + def visit_type_expr(self, expr: TypeExpr) -> str: + res: str = f"({expr.name.lexeme}" + for constraint in expr.constraints: + res += " " + constraint.accept(self) + res += ")" + return res + + def visit_constraint_expr(self, expr: ConstraintExpr) -> str: + return f"(constraint {expr.left.accept(self)} {expr.op.lexeme} {expr.right.accept(self)})" + + def visit_wildcard_expr(self, expr: WildcardExpr) -> str: + return "(_)" + + def visit_literal_expr(self, expr: LiteralExpr) -> str: + return f"({expr.value})" + + def visit_type_body_expr(self, expr: TypeBodyExpr) -> str: + res: str = "(body" + for prop in expr.properties: + res += " " + prop.accept(self) + res += ")" + return res + + def visit_property_stmt(self, stmt: PropertyStmt) -> str: + return f"(property {stmt.name.lexeme} {stmt.type.accept(self)})" + + def visit_op_stmt(self, stmt: OpStmt) -> str: + left: str = stmt.left.accept(self) + right: str = stmt.right.accept(self) + result: str = stmt.result.accept(self) + return f"(op_def {left} {stmt.op.lexeme} {right} {result})" + + def 
visit_constraint_stmt(self, stmt: ConstraintStmt) -> str: + return f"(constraint_def {stmt.name.lexeme} {stmt.constraint.accept(self)})" + + +def parse(source: str) -> list[Stmt]: + tokens: list[Token] = MidasLexer(source).process() + return MidasParser(tokens).parse() + + +def ast_str(source: str) -> list[str]: + stmts: list[Stmt] = parse(source) + return [AstSerializer().serialize(stmt) for stmt in stmts] + + +@pytest.mark.parametrize( + "src,expected", + [ + ("type Foo<>", "(type_def Foo)"), + ("type Foo<Bar>", "(type_def Foo (Bar))"), + ("type Foo<Bar, Baz>", "(type_def Foo (Bar) (Baz))"), + ( + "type Foo<Bar + (_ < 2), Baz>", + "(type_def Foo (Bar (constraint (_) < (2.0))) (Baz))", + ), + ( + """ + type Foo<> { + foo: Bar + } + """, + "(type_def Foo (body (property foo (Bar))))", + ), + ( + """ + type Foo<> { + foo: Bar + (_ != none) + foo2: Bar2 + (0 <= _) + (_ <= 100) + } + """, + "(type_def Foo (body (property foo (Bar (constraint (_) != (None)))) (property foo2 (Bar2 (constraint (0.0) <= (_)) (constraint (_) <= (100.0))))))", + ), + ("op <A> + <B> = <C>", "(op_def (A) + (B) (C))"), + ( + "op <A + (_ < 100)> + <B + (_ < 100)> = <C + (_ < 200)>", + "(op_def (A (constraint (_) < (100.0))) + (B (constraint (_) < (100.0))) (C (constraint (_) < (200.0))))", + ), + ( + "constraint Positive = _ >= 0", + "(constraint_def Positive (constraint (_) >= (0.0)))", + ), + ], +) +def test_expressions(src: str, expected: str | list[str]): + if isinstance(expected, str): + expected = [expected] + assert ast_str(src) == expected + + +@pytest.mark.parametrize( + "src,pos", + [ + ### + # Misc + ### + ("42", (1, 1)), + ("true", (1, 1)), + ("foo", (1, 1)), + ### + # Type statements + ### + ("type", (1, 5)), + ("type true", (1, 6)), + ("type Foo", (1, 9)), + ("type Foo<1>", (1, 10)), + # ("type Foo", (1, 16)), # trailing comma is accepted, TODO: update parser or EBNF + ("type Foo", (1, 17)), + ("type Foo { 3 }", (1, 19)), + ( + """ + type Foo { + foo + } + """, + (4, 1), + ), + ( + """ + type Foo { + foo: 3 + } + """, + (3, 10), + ), + ### + # Operation statements + ### + 
("op", (1, 3)), + ("op float", (1, 4)), + ("op <", (1, 5)), + ("op ", (1, 11)), + ("op +", (1, 13)), + ("op + float", (1, 14)), + ("op + <", (1, 15)), + ("op + + ", (1, 21)), + ("op + =", (1, 23)), + ("op + = float", (1, 24)), + ("op + = <", (1, 25)), + ("op + = + = ", (1, 13)), + ("op + = ", (1, 23)), + ("op + = ", (1, 33)), + ### + # Constraint statements + ### + ("constraint", (1, 11)), + ("constraint 3", (1, 12)), + ("constraint Foo", (1, 15)), + ("constraint Foo =", (1, 17)), + ("constraint Foo = 3", (1, 19)), + ("constraint Foo = 3 <", (1, 21)), + ], +) +def test_parsing_error(src: str, pos: tuple[int, int]): + src = textwrap.dedent(src) + tokens: list[Token] = MidasLexer(src).process() + parser: MidasParser = MidasParser(tokens) + stmt: list[Stmt] = parser.parse() + assert len(stmt) == 0 + assert len(parser.errors) != 0 + error_pos: Position = parser.errors[0].token.position + assert (error_pos.line, error_pos.column) == pos diff --git a/vscode-ext/language-configurations.json b/vscode-ext/language-configurations.json new file mode 100644 index 0000000..ffd5219 --- /dev/null +++ b/vscode-ext/language-configurations.json @@ -0,0 +1,19 @@ +{ + "brackets": [ + ["{", "}"], + ["[", "]"], + ["<", ">"] + ], + "autoClosingPairs": [ + { "open": "{", "close": "}" }, + { "open": "[", "close": "]" }, + { "open": "(", "close": ")" }, + { "open": "<", "close": ">" } + ], + "surroundingPairs": [ + ["{", "}"], + ["[", "]"], + ["(", ")"], + ["<", ">"] + ] +} \ No newline at end of file diff --git a/vscode-ext/package.json b/vscode-ext/package.json new file mode 100644 index 0000000..bd2c40b --- /dev/null +++ b/vscode-ext/package.json @@ -0,0 +1,33 @@ +{ + "name": "midas", + "version": "0.1.0", + "engines": { + "vscode": "*" + }, + "categories": ["Programming Languages"], + "contributes": { + "languages": [ + { + "id": "midas", + "extensions": [ + ".mpy", + ".midas" + ], + "aliases": [ + "Midas" + ], + "configuration": "./language-configurations.json" + } + ], + "grammars": [ 
+ { + "language": "midas", + "scopeName": "source.midas", + "path": "./syntaxes/midas.tmLanguage.json", + "embeddedLanguages": { + "meta.embedded.block.python": "python" + } + } + ] + } +} \ No newline at end of file diff --git a/vscode-ext/syntaxes/midas.tmLanguage.json b/vscode-ext/syntaxes/midas.tmLanguage.json new file mode 100644 index 0000000..44745b0 --- /dev/null +++ b/vscode-ext/syntaxes/midas.tmLanguage.json @@ -0,0 +1,135 @@ +{ + "$schema": "https://raw.githubusercontent.com/martinring/tmlanguage/master/tmlanguage.json", + "name": "Midas", + "scopeName": "source.midas", + "patterns": [{ "include": "#statement" }], + "repository": { + "comment": { + "begin": "(//)", + "end": "($)", + "name": "comment.line", + "beginCaptures": { + "1": { + "name": "comment.line.double-slash" + } + } + }, + "type-def": { + "begin": "\\b(type)\\s+([a-zA-Z_][a-zA-Z_\\d]*)", + "end": "$", + "beginCaptures": { + "1": { + "name": "keyword.control.type.midas" + }, + "2": { + "name" : "variable.name" + } + }, + "patterns": [ + { "include": "#type-base" }, + { "include": "#type-body" } + ] + }, + "type-base": { + "begin": "<", + "end": ">", + "beginCaptures": { + "0": { + "name": "punctuation.definition.base.begin.midas" + } + }, + "endCaptures": { + "0": { + "name": "punctuation.definition.base.end.midas" + } + }, + "patterns": [ + {"include": "source.python"} + ] + }, + "type-body": { + "begin": "\\{", + "end": "\\}", + "beginCaptures": { + "0": { + "name": "punctuation.definition.type-body.begin.midas" + } + }, + "endCaptures": { + "0": { + "name": "punctuation.definition.type-body.end.midas" + } + }, + "patterns": [ + {"include": "#type-prop"} + ] + }, + "type-prop": { + "match": "([a-zA-Z_][a-zA-Z_\\d]*)(:)\\s*([a-zA-Z_][a-zA-Z_\\d]*)", + "captures": { + "1": { + "name": "variable.name" + }, + "2": { + "name": "punctuation.separator.annotation.midas" + }, + "3": { + "name": "meta.type.name" + } + } + }, + "op-def": { + "match": 
"\\b(op)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>\\s+(\\S+)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>\\s+(=)\\s+<([a-zA-Z_][a-zA-Z_\\d]*)>", + "captures": { + "1": { + "name": "keyword.control.op.midas" + }, + "2": { + "name" : "variable.name" + }, + "3": { + "name" : "keyword.operator" + }, + "4": { + "name" : "variable.name" + }, + "5": { + "name" : "keyword.operator.assignment" + }, + "6": { + "name" : "variable.name" + } + }, + "patterns": [ + { "include": "#type-base" }, + { "include": "#type-body" } + ] + }, + "constr-def": { + "begin": "(constraint)\\s+([a-zA-Z_][a-zA-Z_\\d]*)\\s*(=)", + "end": "$", + "beginCaptures": { + "1": { + "name": "keyword.control.constr.midas" + }, + "2": { + "name": "variable.name" + }, + "3": { + "name": "keyword.operator.assignment" + } + }, + "patterns": [ + { "include": "source.python" } + ] + }, + "statement": { + "patterns": [ + { "include": "#comment" }, + { "include": "#type-def" }, + { "include": "#op-def" }, + { "include": "#constr-def" } + ] + } + } +} \ No newline at end of file