from dataclasses import dataclass

from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span
from .lexer import Lexer
from .helpers import ParserIterator
from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning

from typing import Tuple, Optional, List, Dict, Set, Iterable


@dataclass(frozen=True)
class Type:
    name: IdentifierToken | str
    wraps: Tuple['Type', ...]


@dataclass(frozen=True)
class Value:
    type: Optional[Type]
    value: 'ASTNode'


@dataclass(frozen=True)
class FunctionArgument:
    name: IdentifierToken
    type: Type
    # default_value: Value | None


@dataclass(frozen=True)
class ASTNode:
    pass


@dataclass(frozen=True)
class FunctionNode(ASTNode):
    name: IdentifierToken
    args: Tuple[FunctionArgument, ...]
    return_type: Type
    contents: Tuple[ASTNode, ...]


@dataclass(frozen=True)
class FunctionCallNode(ASTNode):
    function: Value
    arguments: List[Value]


@dataclass(frozen=True)
class VariableDeclarationNode(ASTNode):
    name: IdentifierToken
    modifiers: List[str]
    type: Type
    value: Value


@dataclass(frozen=True)
class ForLoopNode(ASTNode):
    variable_name: IdentifierToken | str
    iterator: Value
    body: List[ASTNode]


@dataclass(frozen=True)
class SpreadOperatorNode(ASTNode):
    left_side: Value
    right_side: Value
    type: Type | None


@dataclass(frozen=True)
class IntegerImmediateNode(ASTNode):
    value: int
    type: Type
    span: Span


@dataclass(frozen=True)
class StringImmediateNode(ASTNode):
    value: str
    type: Type
    span: Span


@dataclass(frozen=True)
class VariableNameNode(ASTNode):
    name: IdentifierToken


@dataclass(frozen=True)
class BracketetExpressionNode(ASTNode):
    content: ASTNode


@dataclass(frozen=True)
class UseStatement(ASTNode):
    path: List[IdentifierToken]


class Parser:
    """
    This class takes a lexed input and produces a syntax tree.

    This only validates syntax; it does no type checking.
    """

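    # Minimal usage sketch (an assumption of how this class is driven; the
    # actual entry point may differ):
    #
    #   parser = Parser(lexer)   # lexer: a Lexer over the source text
    #   tree = parser.parse()    # -> list of file-level ASTNodes
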
    types: Dict[str, Type]

    lexer: Lexer
    tokens: ParserIterator[Token]

    def __init__(self, lexer: Lexer):
        self.variables = dict()
        self.lexer = lexer
        # strip comments from tokens
        self.tokens = ParserIterator(lexer.do_parse())

    def parse(self):
        body = []

        while True:
            thing = self.parse_file_level_block()
            if thing is None:
                return body
            body.append(thing)
            print(thing)

    def consume_next_token(self, types: Set[TokenType] | TokenType = None, content: str = None, msg: str = None):
        """
        Consume and return the next token, optionally checking that its kind is
        one of `types` and that its content equals `content`.

        Raises EndOfInputError if no input is left, and InvalidTokenError with
        `msg` if the checks fail.
        """
        if not isinstance(types, set) and types is not None:
            types = {types}

        peeked = self.tokens.peek()
        if peeked is None:
            raise EndOfInputError(self.tokens.last_item.span, content)
        if types is not None and peeked.kind not in types:
            raise InvalidTokenError(peeked, (*types, content), msg)
        if content is not None and peeked.content != content:
            raise InvalidTokenError(peeked, {content}, msg)

        return self.tokens.next()

    def consume_optional_eol(self):
        """
        Consume EOL tokens, if any are available.
        """
        while self.tokens.peek().kind == TokenType.EOL:
            self.tokens.next()

    def consume_expected_eol(self, msg):
        """
        Consume at least one EOL token, or fail with `msg`.
        """
        if self.tokens.peek().kind != TokenType.EOL:
            raise InvalidTokenError(self.tokens.peek(), expected_type=["\\n"], message=msg)
        while self.tokens.peek().kind == TokenType.EOL:
            self.tokens.next()

    def consume_optional(self, types: Set[TokenType] | TokenType = None, content: str = None, msg: str = None):
        """
        Like consume_next_token, but returns False instead of raising
        InvalidTokenError when the next token does not match.
        """
        try:
            return self.consume_next_token(types, content, msg)
        except InvalidTokenError:
            return False

    def parse_file_level_block(self) -> ASTNode | None:
        """
        File-level blocks are statements written at the file level:
        function, struct, const and use declarations.
        """
        # this part ignores newlines!
        prev_ignore_lvl = self.tokens.ignore_newline
        self.tokens.ignore_newline = True

        try:
            match self.tokens.peek():
                case KeywordToken(content="function"):
                    return self.parse_function_definition()
                case KeywordToken(content="struct"):
                    return self.parse_struct()
                case KeywordToken(content="const"):
                    return self.parse_const_declaration()
                case KeywordToken(content="use"):
                    return self.parse_import_statement()
                case Token(kind=TokenType.EOI):
                    return None
                case None:
                    raise Exception("Unexpected None token!")
                case unknown_token:
                    raise InvalidTokenError(unknown_token, ("function", "struct", "const", "use"), "Only function, struct, const and use declarations are allowed at file-level!")
        finally:
            self.tokens.ignore_newline = prev_ignore_lvl

    def parse_import_statement(self):
        """
        Parse an import-equivalent statement:

            use std.String
        """
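        # For example (illustrative): "use std.String" yields
        # UseStatement(path=[<std>, <String>]), where <...> are the consumed
        # IdentifierTokens.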
        self.consume_next_token(types=TokenType.Keyword, content="use")

        path = []

        if self.tokens.peek().kind == TokenType.String:
            raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!")

        prev = self.tokens.ignore_newline
        self.tokens.ignore_newline = False

        while self.tokens.peek().kind != TokenType.EOL:
            path.append(self.consume_next_token(TokenType.Identifier))
            if self.tokens.peek().content == '.':
                self.consume_next_token(types=TokenType.Operator, content='.')

        self.consume_expected_eol("'use' statement must be terminated by EOL!")

        self.tokens.ignore_newline = prev

        return UseStatement(path)

    def parse_basic_block(self) -> Iterable[ASTNode]:
        """
        A "Basic Block" is a block inside a function, for loop, etc.
        """
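        # Note: this is a generator; it yields statements until it peeks the
        # closing '}', which is left for the caller to consume. Callers wrap
        # it in list(), e.g. list(self.parse_basic_block()).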
        # when parsing blocks, newlines are important!
        prev_ignore_lvl = self.tokens.ignore_newline

        self.tokens.ignore_newline = False

        if prev_ignore_lvl:
            # consume all remaining EOLs
            self.consume_optional_eol()

        try:
            while True:
                match self.tokens.peek():
                    case KeywordToken(content="function"):
                        yield self.parse_function_definition()
                    case KeywordToken(content="const"):
                        yield self.parse_const_declaration()
                    case KeywordToken(content="let"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet")
                    case KeywordToken(content="for"):
                        yield self.parse_for_statement()
                    case KeywordToken(content="return"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet")
                    case KeywordToken(content="if"):
                        raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet")
                    case KeywordToken(content="struct"):
                        # TODO: support
                        raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet")
                    case Token(kind=TokenType.RBracket, content="}"):
                        break
                    case other:
                        yield self.parse_value()
                        self.consume_expected_eol(msg="Only one statement per line permitted!")
        finally:
            self.tokens.ignore_newline = prev_ignore_lvl

    def parse_function_definition(self):
        """
        Parses a function definition including the body.
        """
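        # Shape of the accepted input (as enforced below; '->' return type
        # annotations and '=>' short bodies are rejected for now):
        #
        #   function <name>(<arg>: <type>, ...) {
        #       <body>
        #   }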
        # consume the 'function' keyword
        self.tokens.next()

        function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by an identifier!")

        # consume parenthesis
        self.consume_next_token(types=TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parentheses!")

        args = []

        # TODO: we actually want to match against Token(kind=TokenType.RParen, content=")")
        while self.tokens.peek().content != ')':
            args.append(self.parse_function_def_arg())
            self.consume_optional_eol()
            if not self.consume_optional(content=','):
                break
            self.consume_optional_eol()

        self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!")

        if self.tokens.peek().content == '->':
            raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!")

        if self.tokens.peek().content == '=>':
            raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!")

        self.consume_next_token(types=TokenType.LBracket, content="{")

        content = list(self.parse_basic_block())

        self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!")

        return FunctionNode(function_name, args, None, content)

    def parse_function_def_arg(self) -> FunctionArgument:
        """
        Parse a single argument of a function.

        Currently this allows `name: type`.

        In the future we want to also support `name: type = value`.
        """
        identifier = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")

        self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")

        type = self.parse_type()

        if self.tokens.peek().content == '=':
            raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")

        return FunctionArgument(identifier, type)

    def parse_type(self) -> Type:
        """
        Parse a type declaration, such as String, i64, or Vector<i64>.
        """
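        # For example (illustrative): "Vector<i64>" is parsed into roughly
        # Type(name=<Vector>, wraps=[Type(name=<i64>, wraps=[])]),
        # where <...> are the consumed IdentifierTokens.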
        main_type = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")

        # if this type does not wrap any other types, we are done!
        if self.tokens.peek().content != '<':
            return Type(main_type, [])

        wrapped_types = []
        start_token = self.consume_next_token(content="<")

        while self.tokens.peek().content != '>':
            wrapped_types.append(self.parse_type())
            if not self.consume_optional(content=","):
                break
            self.consume_optional_eol()

        self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")

        if len(wrapped_types) == 0:
            print_warning(self.tokens.last_item.span.union(start_token.span), "Empty set of type arguments!")

        return Type(main_type, wrapped_types)

    def parse_const_declaration(self):
        """
        Parse a const declaration:

            const name: type = value
        """
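        # For example (illustrative): "const x: i64 = 42" becomes
        # VariableDeclarationNode(name=<x>, modifiers=['const'],
        # type=Type(<i64>, []), value=Value(None, IntegerImmediateNode(...))).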
        self.consume_next_token(types=TokenType.Keyword, content="const")
        identifier = self.consume_next_token(types=TokenType.Identifier, msg="The 'const' keyword must be immediately followed by a variable name!")

        type = None
        if self.tokens.peek().content == ':':
            self.consume_next_token(content=':')
            type = self.parse_type()

        self.consume_next_token(content='=', msg="Expected '=' in const declaration!")

        value = self.parse_value()

        self.consume_expected_eol("Const declaration statement must be terminated by a newline!")

        return VariableDeclarationNode(identifier, ['const'], type, Value(None, value))

    def parse_value(self) -> ASTNode:
        """
        This function parses a "value": any expression that evaluates to a value.

        This can be a literal, a function call, an array/struct constructor, etc.
        """
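        # For example (illustrative): "print(1, x)" becomes
        # FunctionCallNode(function=VariableNameNode(<print>),
        #                  arguments=[IntegerImmediateNode(1, ...), VariableNameNode(<x>)]).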
        # handle bracketed expressions
        if self.tokens.peek().content == '(':
            self.consume_next_token(content='(')
            self.consume_optional_eol()
            value = BracketetExpressionNode(self.parse_value())
            self.consume_optional_eol()
            self.consume_next_token(content=')', msg="Expected closing bracket")
        else:
            value = self._inner_parse_value()

        match self.tokens.peek():
            case OperatorToken(content='..'):
                self.consume_next_token(types=TokenType.Operator, content="..")
                right_hand_type = self.parse_type()
                return SpreadOperatorNode(value, right_hand_type, None)
            # case OperatorToken(content):
            #     raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
            case Token(kind=TokenType.LBracket, content="("):
                self.consume_next_token(content="(")
                self.consume_optional_eol()
                args = list(self.parse_inner_function_call_args())
                self.consume_next_token(content=')', msg="Expected ')' at the end of the function call arguments!")
                return FunctionCallNode(value, args)
            case _:
                # plain value with no trailing operator or call
                return value

    def _inner_parse_value(self) -> ASTNode:
        match self.tokens.peek():
            case IntegerLiteralToken(value, suffix, span):
                if suffix:
                    type = Type(suffix, [])
                else:
                    # assume widest signed integer type available
                    type = Type('i64', [])
                self.consume_next_token()
                return IntegerImmediateNode(value, type, span)
            case Token(span, content, kind=TokenType.String):
                self.consume_next_token()
                return StringImmediateNode(content, Type("String", []), span)
            case Token(content="{", kind=TokenType.LBracket):
                return self.parse_structured_value()
            case IdentifierToken():
                return VariableNameNode(self.consume_next_token(TokenType.Identifier))
            case other:
                raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")

    def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
        """
        Yield the comma-separated argument values of a function call.
        The closing ')' is not consumed here; the caller consumes it.
        """
        if self.tokens.peek().content == ')':
            return

        while True:
            self.consume_optional_eol()
            yield self.parse_value()
            self.consume_optional_eol()

            if self.tokens.peek().content == ',':
                self.consume_next_token(content=",")
                continue

            if self.tokens.peek().content == ')':
                break

    def parse_structured_value(self) -> ASTNode:
        """
        Parse either a list or struct initializer.

        List initializer:

            const data: Vector<i64> = {1,2,3,4,5}

        Struct initializer:

            const data: MyStruct = {
                field1: 1
                field2: "Hello World"
                arrayField: {"test", "123", "these are strings"}
            }
        """
        raise UnsupportedSyntaxError(self.tokens.peek(), "Structured values such as lists, dictionaries and structs!")

    def parse_for_statement(self) -> ForLoopNode:
        """
        Parse a for loop of the form: for <name> in <value> { <body> }
        """
        self.consume_next_token(content='for', types=TokenType.Keyword)

        loop_variable_name = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")

        self.consume_next_token(types=TokenType.Keyword, content="in", msg="for <name> in <value> format required!")

        iterator = self.parse_value()

        self.consume_next_token(content='{')
        self.consume_optional_eol()

        body = list(self.parse_basic_block())

        self.consume_optional_eol()
        self.consume_next_token(content='}')
        self.consume_optional_eol()

        return ForLoopNode(loop_variable_name, iterator, body)