You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

467 lines
16 KiB
Python

from dataclasses import dataclass
import imp
from webbrowser import Opera
from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span
from .lexer import Lexer
from .helpers import ParserIterator
from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning
from typing import Tuple, Optional, List, Dict, Set, Iterable
@dataclass(frozen=True)
class Type:
    """A reference to a (possibly generic) type, e.g. i64 or Vector<i64>."""
    # Base type name; a plain str is used for compiler-chosen defaults like 'i64'.
    name: IdentifierToken | str
    # Type arguments of a generic type; empty when the type is not generic.
    # NOTE(review): parse_type currently passes a list here, not a tuple — confirm.
    wraps: Tuple['Type', ...]
@dataclass(frozen=True)
class Value:
    """An expression paired with its (optionally known) type."""
    # None while the type has not been inferred/checked yet.
    type: Optional[Type]
    # The AST node that evaluates to this value.
    value: 'ASTNode'
@dataclass(frozen=True)
class FunctionArgument:
    """A single ``name: type`` entry in a function definition's argument list."""
    name: IdentifierToken
    type: Type
    # default argument values are not supported yet (see parse_function_def_arg):
    #default_value: Value | None
@dataclass(frozen=True)
class ASTNode:
    """Base class for all syntax-tree nodes produced by the Parser."""
    pass
@dataclass(frozen=True)
class FunctionNode(ASTNode):
    """A function definition: name, arguments, return type and body."""
    name: IdentifierToken
    args: Tuple[FunctionArgument, ...]
    # None for now: parse_function_definition always passes None because
    # return type annotations are not supported yet.
    return_type: Optional[Type]
    contents: Tuple[ASTNode, ...]
@dataclass(frozen=True)
class FunctionCallNode(ASTNode):
    """A call expression: the callee and its argument list."""
    # NOTE(review): parse_value passes raw ASTNodes for both fields rather
    # than Value wrappers — confirm which is intended.
    function: Value
    arguments: List[Value]
@dataclass(frozen=True)
class VariableDeclarationNode(ASTNode):
    """A variable/constant declaration, e.g. ``const x: i64 = 1``."""
    name: IdentifierToken
    # declaration modifiers, e.g. ['const']
    modifiers: List[str]
    # None when the declaration has no explicit ':' type annotation
    # (parse_const_declaration passes None in that case).
    type: Optional[Type]
    value: Value
@dataclass(frozen=True)
class ForLoopNode(ASTNode):
    """A ``for <name> in <value> { ... }`` loop."""
    variable_name: IdentifierToken | str
    # the value iterated over
    iterator: Value
    # statements of the loop body, in source order
    body: List[ASTNode]
@dataclass(frozen=True)
class SpreadOperatorNode(ASTNode):
    """A spread/range expression ``left .. right``."""
    left_side: Value
    # NOTE(review): parse_value currently stores the result of parse_type()
    # here (a Type), not a Value — confirm intended semantics.
    right_side: Value
    # None until type checking fills it in
    type: Type | None
@dataclass(frozen=True)
class IntegerImmediateNode(ASTNode):
    """An integer literal with its deduced type and source location."""
    value: int
    # from the literal's suffix, or the default 'i64'
    type: Type
    span: Span
@dataclass(frozen=True)
class StringImmediateNode(ASTNode):
    """A string literal with its type (always String) and source location."""
    value: str
    type: Type
    span: Span
@dataclass(frozen=True)
class VariableNameNode(ASTNode):
    """A reference to a variable by name."""
    name: IdentifierToken
@dataclass(frozen=True)
class BracketetExpressionNode(ASTNode):
    """A parenthesised expression ``( ... )``.

    NOTE(review): class name is misspelled ("Bracketet" vs "Bracketed") but
    kept, since renaming would break any external references.
    """
    content: ASTNode
@dataclass(frozen=True)
class UseStatement(ASTNode):
    """An import statement, e.g. ``use std.String``."""
    # dot-separated path segments, in source order
    path: List[IdentifierToken]
class Parser:
    """
    This class takes a lexed input and produces a syntax tree.
    This only validates syntax, but does no type checking etc...
    """
    # registry of known types by name
    # NOTE(review): never assigned in the visible code (__init__ creates
    # `variables` instead) — confirm whether this annotation is stale.
    types: Dict[str, Type]
    # the lexer this parser consumes
    lexer: Lexer
    # token stream with peek/next and a toggleable newline-skipping mode
    tokens: ParserIterator[Token]
    def __init__(self, lexer: Lexer):
        # NOTE(review): creates `variables` although the class annotates
        # `types` — confirm which attribute name is intended.
        self.variables = dict()
        self.lexer = lexer
        # strip comments from tokens
        self.tokens = ParserIterator(lexer.do_parse())
def parse(self):
body = []
while True:
thing = self.parse_file_level_block()
if thing is None:
return body
body.append(thing)
print(thing)
def consume_next_token(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
if not isinstance(types, set) and types is not None:
types = {types}
peeked = self.tokens.peek()
if peeked is None:
raise EndOfInputError(self.tokens.last_item.span, content)
if types is not None and peeked.kind not in types:
raise InvalidTokenError(peeked, (*types, content), msg)
if content is not None and peeked.content != content:
raise InvalidTokenError(peeked, {content}, msg)
return self.tokens.next()
def consume_optional_eol(self):
"""
This function tries to consume EOL tokens, if they are available
"""
while self.tokens.peek().kind == TokenType.EOL:
self.tokens.next()
def consume_expected_eol(self, msg):
"""
This function consumes at least one EOL token, or fails
"""
if self.tokens.peek().kind != TokenType.EOL:
raise InvalidTokenError(self.tokens.peek(), expected_type=["\\n"], message=msg)
while self.tokens.peek().kind == TokenType.EOL:
self.tokens.next()
def consume_optional(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
try:
return self.consume_next_token(types, content, msg)
except InvalidTokenError:
return False
    def parse_file_level_block(self) -> ASTNode | None:
        """
        File-level blocks are statements written at the file level.

        Returns the parsed node, or None once end of input is reached.
        Only function/struct/const/use statements are allowed here.
        """
        # this part ignores newlines!
        prev_ignore_lvl = self.tokens.ignore_newline
        self.tokens.ignore_newline = True
        try:
            match self.tokens.peek():
                case KeywordToken(content="function"):
                    return self.parse_function_definition()
                case KeywordToken(content="struct"):
                    # NOTE(review): parse_struct is not defined in the visible
                    # portion of this file — confirm it exists further down.
                    return self.parse_struct()
                case KeywordToken(content="const"):
                    return self.parse_const_declaration()
                case KeywordToken(content="use"):
                    return self.parse_import_statement()
                case Token(kind=TokenType.EOI):
                    # end of input: tell the caller to stop
                    return None
                case None:
                    raise Exception("Unexpected None token!")
                case unknown_token:
                    raise InvalidTokenError(unknown_token, ("function", "struct"), "Only function and struct declarations are allowed at file-level!")
        finally:
            # always restore the previous newline-handling mode
            self.tokens.ignore_newline = prev_ignore_lvl
def parse_import_statement(self):
"""
parse an import-equivalent statement:
use std.String
"""
self.consume_next_token(types=TokenType.Keyword, content="use")
path = []
if self.tokens.peek().kind == TokenType.String:
raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!")
prev = self.tokens.ignore_newline
self.tokens.ignore_newline = False
while self.tokens.peek().kind != TokenType.EOL:
path.append(self.consume_next_token(TokenType.Identifier))
if self.tokens.peek().content == '.':
self.consume_next_token(types=TokenType.Operator, content='.')
self.consume_expected_eol("'use' statement must be terminated by EOL!")
self.tokens.ignore_newline = prev
return UseStatement(path)
    def parse_basic_block(self) -> Iterable[ASTNode]:
        """
        A "Basic Block" is a block inside a function, for loop, etc.

        Generator: yields one ASTNode per statement until a closing '}' is
        seen. The '}' itself is left in the stream for the caller to consume.
        """
        # when parsing blocks, newlines are important!
        prev_ignore_lvl = self.tokens.ignore_newline
        self.tokens.ignore_newline = False
        if prev_ignore_lvl:
            # consume all remaining EOLs
            self.consume_optional_eol()
        try:
            while True:
                match self.tokens.peek():
                    case KeywordToken(content="function"):
                        yield self.parse_function_definition()
                    case KeywordToken(content="const"):
                        yield self.parse_const_declaration()
                    case KeywordToken(content="let"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet")
                    case KeywordToken(content="for"):
                        yield self.parse_for_statement()
                    case KeywordToken(content="return"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet")
                    case KeywordToken(content="if"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet")
                    case KeywordToken(content="struct"):
                        # TODO: support
                        raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet")
                    case Token(kind=TokenType.RBracket, content="}"):
                        # end of the enclosing block; do not consume the '}'
                        break
                    case other:
                        # anything else is treated as an expression statement
                        yield self.parse_value()
                # every statement must be terminated by at least one newline
                self.consume_expected_eol(msg="Only one statement per line permitted!")
        finally:
            # restore the caller's newline-handling mode
            self.tokens.ignore_newline = prev_ignore_lvl
def parse_function_definition(self):
"""
Parses a function definition including the body
"""
self.tokens.next()
function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by identifier!")
# consume parenthesis
self.consume_next_token(types = TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parenthesis!")
args = []
# TODO: we actually want to match against Token(kind=TokenType.RParen, content=")")
while self.tokens.peek().content != ')':
args.append(self.parse_function_def_arg())
self.consume_optional_eol()
if not self.consume_optional(content=','):
break
self.consume_optional_eol()
self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!")
if self.tokens.peek().content == '->':
raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!")
if self.tokens.peek().content == '=>':
raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!")
self.consume_next_token(types=TokenType.LBracket, content="{")
content = list(self.parse_basic_block())
self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!")
return FunctionNode(function_name, args, None, content)
def parse_function_def_arg(self) -> FunctionArgument:
"""
Parse a single argument of a function.
Currently this allows name: type
In the future we want to also support name: type = value
"""
identifier = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")
self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")
type = self.parse_type()
if self.tokens.peek().content == '=':
raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")
return FunctionArgument(identifier, type)
def parse_type(self) -> Type:
"""
Parse a type declaration, such as String, i64, or Vector<i64>
"""
main_type = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")
# if this type does not wrap any other types, we are done!
if self.tokens.peek().content != '<':
return Type(main_type, [])
wrapped_types = []
start_token = self.consume_next_token(content="<")
while self.tokens.peek().content != '>':
wrapped_types.append(self.parse_type())
if not self.consume_optional(content=","):
break
self.consume_optional_eol()
self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")
if len(wrapped_types) == 0:
print_warning(self.tokens.last_item.span.union(start_token.span), "Empty set of type arguments!")
return Type(main_type, wrapped_types)
def parse_const_declaration(self):
"""
parse a const declaration, so basically
const name: type = value
"""
self.consume_next_token(types=TokenType.Keyword, content="const")
identifier = self.consume_next_token(types=TokenType.Identifier, msg="const keywords must be immediately followed by a variable name!")
type = None
if self.tokens.peek().content == ':':
self.consume_next_token(content=':')
type = self.parse_type()
self.consume_next_token(content='=', msg="Expected '=' in const declaration!")
value = self.parse_value()
self.consume_expected_eol("Const declaration statement must be terminated by a newline!")
return VariableDeclarationNode(identifier, ['const'], type, Value(None, value))
def parse_value(self) -> ASTNode:
"""
This function parses a "value", so basically any statement that evaluates to a value
This can be a literal, a function call, an array/struct constructor, etc.
"""
# handle bracketet expression
if self.tokens.peek().content == '(':
self.consume_next_token(content='(')
self.consume_optional_eol()
value = self.parse_value()
self.consume_optional_eol()
self.consume_next_token(content=')', msg="Expected closing bracket")
value = self._inner_parse_value()
match self.tokens.peek():
case OperatorToken(content='..'):
self.consume_next_token(types=TokenType.Operator, content="..")
right_hand_type = self.parse_type()
return SpreadOperatorNode(value, right_hand_type, None)
#case OperatorToken(content):
# raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
case Token(kind=TokenType.LBracket, content="("):
self.consume_next_token(content="(")
self.consume_optional_eol()
args = list(self.parse_inner_function_call_args())
self.consume_next_token(content=')', msg="")
return FunctionCallNode(value, args)
def _inner_parse_value(self) -> ASTNode:
match self.tokens.peek():
case IntegerLiteralToken(value, suffix, span):
if suffix:
type = Type(suffix, [])
else:
# assume widest signed integer type available
type = Type('i64', [])
self.consume_next_token()
return IntegerImmediateNode(value, type, span)
case Token(span, content, kind=TokenType.String):
self.consume_next_token()
return StringImmediateNode(content, Type("String", []), span)
case Token(content="{", kind=TokenType.LBracket):
return self.parse_structured_value()
case IdentifierToken():
return VariableNameNode(self.consume_next_token(TokenType.Identifier))
case other:
raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")
def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
if self.tokens.peek().content == ')':
return
while True:
self.consume_optional_eol()
yield self.parse_value()
self.consume_optional_eol()
if self.tokens.peek().content == ',':
self.consume_next_token(content=",")
continue
if self.tokens.peek().content == ')':
break
def parse_structured_value(self) -> ASTNode:
"""
parse either a list or struct initializer:
list initializer:
const data: Vector<i64> = {1,2,3,4,5}
struct initializer:
const data: MyStruct = {
field1: 1
field2: "Hello World"
arrayField: {"test", "123", "these are strings"}
}
"""
raise UnsupportedSyntaxError(self.tokens.peek(), "Structured values such as lists, dictionaries and structs!")
def parse_for_statement(self) -> ForLoopNode:
self.consume_next_token(content='for', types=TokenType.Keyword)
loop_variable_name = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")
self.consume_next_token(types=TokenType.Keyword, content="in", msg="for <name> in <value> format required!")
iterator = self.parse_value()
self.consume_next_token(content='{')
self.consume_optional_eol()
body = list(self.parse_basic_block())
self.consume_optional_eol()
self.consume_next_token(content='}')
self.consume_optional_eol()
return ForLoopNode(loop_variable_name, iterator, body)