from dataclasses import dataclass

from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span
from .lexer import Lexer
from .helpers import ParserIterator
from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning

from typing import Tuple, Optional, List, Dict, Set, Iterable


@dataclass(frozen=True)
class Type:
    name: IdentifierToken | str
    wraps: Tuple['Type', ...]


@dataclass(frozen=True)
class Value:
    type: Optional[Type]
    value: 'ASTNode'


@dataclass(frozen=True)
class FunctionArgument:
    name: IdentifierToken
    type: Type
    # default_value: Value | None


@dataclass(frozen=True)
class ASTNode:
    pass


@dataclass(frozen=True)
class FunctionNode(ASTNode):
    name: IdentifierToken
    args: Tuple[FunctionArgument, ...]
    return_type: Type
    contents: Tuple[ASTNode, ...]


@dataclass(frozen=True)
class FunctionCallNode(ASTNode):
    function: Value
    arguments: List[Value]


@dataclass(frozen=True)
class VariableDeclarationNode(ASTNode):
    name: IdentifierToken
    modifiers: List[str]
    type: Type
    value: Value


@dataclass(frozen=True)
class ForLoopNode(ASTNode):
    variable_name: IdentifierToken | str
    iterator: Value
    body: List[ASTNode]


@dataclass(frozen=True)
class SpreadOperatorNode(ASTNode):
    left_side: Value
    right_side: Value
    type: Type | None


@dataclass(frozen=True)
class IntegerImmediateNode(ASTNode):
    value: int
    type: Type
    span: Span


@dataclass(frozen=True)
class StringImmediateNode(ASTNode):
    value: str
    type: Type
    span: Span


@dataclass(frozen=True)
class VariableNameNode(ASTNode):
    name: IdentifierToken


@dataclass(frozen=True)
class BracketetExpressionNode(ASTNode):
    content: ASTNode


@dataclass(frozen=True)
class UseStatement(ASTNode):
    path: List[IdentifierToken]


class Parser:
    """
    This class takes a lexed input and produces a syntax tree.

    This only validates syntax; it does no type checking.
    """

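    # Minimal usage sketch (an assumption of how this class is driven; the
    # actual entry point may differ):
    #
    #   parser = Parser(lexer)   # lexer: a Lexer over the source text
    #   tree = parser.parse()    # -> list of file-level ASTNodes
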
    types: Dict[str, Type]

    lexer: Lexer
    tokens: ParserIterator[Token]

    def __init__(self, lexer: Lexer):
        self.variables = dict()
        self.lexer = lexer
        # strip comments from tokens
        self.tokens = ParserIterator(lexer.do_parse())

    def parse(self):
        body = []

        while True:
            thing = self.parse_file_level_block()
            if thing is None:
                return body
            body.append(thing)
            print(thing)

    def consume_next_token(self, types: Set[TokenType] | TokenType = None, content: str = None, msg: str = None):
        """
        Consume and return the next token, optionally checking that its kind is
        one of `types` and that its content equals `content`.

        Raises EndOfInputError if no input is left, and InvalidTokenError with
        `msg` if the checks fail.
        """
        if not isinstance(types, set) and types is not None:
            types = {types}

        peeked = self.tokens.peek()
        if peeked is None:
            raise EndOfInputError(self.tokens.last_item.span, content)
        if types is not None and peeked.kind not in types:
            raise InvalidTokenError(peeked, (*types, content), msg)
        if content is not None and peeked.content != content:
            raise InvalidTokenError(peeked, {content}, msg)

        return self.tokens.next()

    def consume_optional_eol(self):
        """
        Consume EOL tokens, if any are available.
        """
        while self.tokens.peek().kind == TokenType.EOL:
            self.tokens.next()

    def consume_expected_eol(self, msg):
        """
        Consume at least one EOL token, or fail with `msg`.
        """
        if self.tokens.peek().kind != TokenType.EOL:
            raise InvalidTokenError(self.tokens.peek(), expected_type=["\\n"], message=msg)
        while self.tokens.peek().kind == TokenType.EOL:
            self.tokens.next()

    def consume_optional(self, types: Set[TokenType] | TokenType = None, content: str = None, msg: str = None):
        """
        Like consume_next_token, but returns False instead of raising
        InvalidTokenError when the next token does not match.
        """
        try:
            return self.consume_next_token(types, content, msg)
        except InvalidTokenError:
            return False

    def parse_file_level_block(self) -> ASTNode | None:
        """
        File-level blocks are statements written at the file level:
        function, struct, const and use declarations.
        """
        # this part ignores newlines!
        prev_ignore_lvl = self.tokens.ignore_newline
        self.tokens.ignore_newline = True

        try:
            match self.tokens.peek():
                case KeywordToken(content="function"):
                    return self.parse_function_definition()
                case KeywordToken(content="struct"):
                    return self.parse_struct()
                case KeywordToken(content="const"):
                    return self.parse_const_declaration()
                case KeywordToken(content="use"):
                    return self.parse_import_statement()
                case Token(kind=TokenType.EOI):
                    return None
                case None:
                    raise Exception("Unexpected None token!")
                case unknown_token:
                    raise InvalidTokenError(unknown_token, ("function", "struct", "const", "use"), "Only function, struct, const and use declarations are allowed at file-level!")
        finally:
            self.tokens.ignore_newline = prev_ignore_lvl

    def parse_import_statement(self):
        """
        Parse an import-equivalent statement:

            use std.String
        """
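        # For example (illustrative): "use std.String" yields
        # UseStatement(path=[<std>, <String>]), where <...> are the consumed
        # IdentifierTokens.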
        self.consume_next_token(types=TokenType.Keyword, content="use")

        path = []

        if self.tokens.peek().kind == TokenType.String:
            raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!")

        prev = self.tokens.ignore_newline
        self.tokens.ignore_newline = False

        while self.tokens.peek().kind != TokenType.EOL:
            path.append(self.consume_next_token(TokenType.Identifier))
            if self.tokens.peek().content == '.':
                self.consume_next_token(types=TokenType.Operator, content='.')

        self.consume_expected_eol("'use' statement must be terminated by EOL!")

        self.tokens.ignore_newline = prev

        return UseStatement(path)

    def parse_basic_block(self) -> Iterable[ASTNode]:
        """
        A "Basic Block" is a block inside a function, for loop, etc.
        """
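        # Note: this is a generator; it yields statements until it peeks the
        # closing '}', which is left for the caller to consume. Callers wrap
        # it in list(), e.g. list(self.parse_basic_block()).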
        # when parsing blocks, newlines are important!
        prev_ignore_lvl = self.tokens.ignore_newline

        self.tokens.ignore_newline = False

        if prev_ignore_lvl:
            # consume all remaining EOLs
            self.consume_optional_eol()

        try:
            while True:
                match self.tokens.peek():
                    case KeywordToken(content="function"):
                        yield self.parse_function_definition()
                    case KeywordToken(content="const"):
                        yield self.parse_const_declaration()
                    case KeywordToken(content="let"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet")
                    case KeywordToken(content="for"):
                        yield self.parse_for_statement()
                    case KeywordToken(content="return"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet")
                    case KeywordToken(content="if"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet")
                    case KeywordToken(content="struct"):
                        # TODO: support
                        raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet")
                    case Token(kind=TokenType.RBracket, content="}"):
                        break
                    case other:
                        yield self.parse_value()
                        self.consume_expected_eol(msg="Only one statement per line permitted!")
        finally:
            self.tokens.ignore_newline = prev_ignore_lvl

    def parse_function_definition(self):
        """
        Parses a function definition including the body.
        """
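        # Shape of the accepted input (as enforced below; '->' return type
        # annotations and '=>' short bodies are rejected for now):
        #
        #   function <name>(<arg>: <type>, ...) {
        #       <body>
        #   }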
        # consume the 'function' keyword
        self.tokens.next()

        function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by an identifier!")

        # consume parenthesis
        self.consume_next_token(types=TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parentheses!")

        args = []

        # TODO: we actually want to match against Token(kind=TokenType.RParen, content=")")
        while self.tokens.peek().content != ')':
            args.append(self.parse_function_def_arg())
            self.consume_optional_eol()
            if not self.consume_optional(content=','):
                break
            self.consume_optional_eol()

        self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!")

        if self.tokens.peek().content == '->':
            raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!")

        if self.tokens.peek().content == '=>':
            raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!")

        self.consume_next_token(types=TokenType.LBracket, content="{")

        content = list(self.parse_basic_block())

        self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!")

        return FunctionNode(function_name, args, None, content)

    def parse_function_def_arg(self) -> FunctionArgument:
        """
        Parse a single argument of a function.

        Currently this allows `name: type`.

        In the future we want to also support `name: type = value`.
        """
        identifier = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")

        self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")

        type = self.parse_type()

        if self.tokens.peek().content == '=':
            raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")

        return FunctionArgument(identifier, type)

    def parse_type(self) -> Type:
        """
        Parse a type declaration, such as String, i64, or Vector<i64>.
        """
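        # For example (illustrative): "Vector<i64>" is parsed into roughly
        # Type(name=<Vector>, wraps=[Type(name=<i64>, wraps=[])]),
        # where <...> are the consumed IdentifierTokens.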
        main_type = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")

        # if this type does not wrap any other types, we are done!
        if self.tokens.peek().content != '<':
            return Type(main_type, [])

        wrapped_types = []
        start_token = self.consume_next_token(content="<")

        while self.tokens.peek().content != '>':
            wrapped_types.append(self.parse_type())
            if not self.consume_optional(content=","):
                break
            self.consume_optional_eol()

        self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")

        if len(wrapped_types) == 0:
            print_warning(self.tokens.last_item.span.union(start_token.span), "Empty set of type arguments!")

        return Type(main_type, wrapped_types)

    def parse_const_declaration(self):
        """
        Parse a const declaration:

            const name: type = value
        """
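        # For example (illustrative): "const x: i64 = 42" becomes
        # VariableDeclarationNode(name=<x>, modifiers=['const'],
        # type=Type(<i64>, []), value=Value(None, IntegerImmediateNode(...))).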
        self.consume_next_token(types=TokenType.Keyword, content="const")
        identifier = self.consume_next_token(types=TokenType.Identifier, msg="The 'const' keyword must be immediately followed by a variable name!")

        type = None
        if self.tokens.peek().content == ':':
            self.consume_next_token(content=':')
            type = self.parse_type()

        self.consume_next_token(content='=', msg="Expected '=' in const declaration!")

        value = self.parse_value()

        self.consume_expected_eol("Const declaration statement must be terminated by a newline!")

        return VariableDeclarationNode(identifier, ['const'], type, Value(None, value))

    def parse_value(self) -> ASTNode:
        """
        This function parses a "value": any expression that evaluates to a value.

        This can be a literal, a function call, an array/struct constructor, etc.
        """
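        # For example (illustrative): "print(1, x)" becomes
        # FunctionCallNode(function=VariableNameNode(<print>),
        #                  arguments=[IntegerImmediateNode(1, ...), VariableNameNode(<x>)]).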
        # handle bracketed expressions
        if self.tokens.peek().content == '(':
            self.consume_next_token(content='(')
            self.consume_optional_eol()
            value = BracketetExpressionNode(self.parse_value())
            self.consume_optional_eol()
            self.consume_next_token(content=')', msg="Expected closing bracket")
        else:
            value = self._inner_parse_value()

        match self.tokens.peek():
            case OperatorToken(content='..'):
                self.consume_next_token(types=TokenType.Operator, content="..")
                right_hand_type = self.parse_type()
                return SpreadOperatorNode(value, right_hand_type, None)
            # case OperatorToken(content):
            #     raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
            case Token(kind=TokenType.LBracket, content="("):
                self.consume_next_token(content="(")
                self.consume_optional_eol()
                args = list(self.parse_inner_function_call_args())
                self.consume_next_token(content=')', msg="Expected ')' at the end of the function call arguments!")
                return FunctionCallNode(value, args)
            case _:
                # plain value with no trailing operator or call
                return value

    def _inner_parse_value(self) -> ASTNode:
        match self.tokens.peek():
            case IntegerLiteralToken(value, suffix, span):
                if suffix:
                    type = Type(suffix, [])
                else:
                    # assume widest signed integer type available
                    type = Type('i64', [])
                self.consume_next_token()
                return IntegerImmediateNode(value, type, span)
            case Token(span, content, kind=TokenType.String):
                self.consume_next_token()
                return StringImmediateNode(content, Type("String", []), span)
            case Token(content="{", kind=TokenType.LBracket):
                return self.parse_structured_value()
            case IdentifierToken():
                return VariableNameNode(self.consume_next_token(TokenType.Identifier))
            case other:
                raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")

    def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
        """
        Yield the comma-separated argument values of a function call.
        The closing ')' is not consumed here; the caller consumes it.
        """
        if self.tokens.peek().content == ')':
            return

        while True:
            self.consume_optional_eol()
            yield self.parse_value()
            self.consume_optional_eol()

            if self.tokens.peek().content == ',':
                self.consume_next_token(content=",")
                continue

            if self.tokens.peek().content == ')':
                break

    def parse_structured_value(self) -> ASTNode:
        """
        Parse either a list or struct initializer.

        List initializer:

            const data: Vector<i64> = {1,2,3,4,5}

        Struct initializer:

            const data: MyStruct = {
                field1: 1
                field2: "Hello World"
                arrayField: {"test", "123", "these are strings"}
            }
        """
        raise UnsupportedSyntaxError(self.tokens.peek(), "Structured values such as lists, dictionaries and structs!")

    def parse_for_statement(self) -> ForLoopNode:
        """
        Parse a for loop of the form: for <name> in <value> { <body> }
        """
        self.consume_next_token(content='for', types=TokenType.Keyword)

        loop_variable_name = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")

        self.consume_next_token(types=TokenType.Keyword, content="in", msg="for <name> in <value> format required!")

        iterator = self.parse_value()

        self.consume_next_token(content='{')
        self.consume_optional_eol()

        body = list(self.parse_basic_block())

        self.consume_optional_eol()
        self.consume_next_token(content='}')
        self.consume_optional_eol()

        return ForLoopNode(loop_variable_name, iterator, body)