commit 525ef8f467cb2baab5e93e46a00a5a472c1728d6 Author: Anton Lydike Date: Thu Jun 16 16:17:49 2022 +0200 parser is working with very basic functionality diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7f93ebf --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +__pycache__ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e69de29 diff --git a/compiler/__init__.py b/compiler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/compiler/ast_printer.py b/compiler/ast_printer.py new file mode 100644 index 0000000..e69de29 diff --git a/compiler/defs.py b/compiler/defs.py new file mode 100644 index 0000000..835d8ae --- /dev/null +++ b/compiler/defs.py @@ -0,0 +1,213 @@ +from dataclasses import dataclass, fields +from mimetypes import suffix_map +from ntpath import join +from typing import Dict, List, Iterable, Generator, Any, Literal +from enum import Enum, auto + + +@dataclass +class LexingContext: + sources: Dict[str,str] + entrypoint: str + structs: Dict[int, Any] # TODO: struct def type + functions: Dict[str, Any] # TODO: function types + + def get_nth_line_bounds(self, source_name: str, n: int): + if source_name not in self.sources: + raise KeyError("Unknown source file \"{}\"!".format(source_name)) + start = 0 + source = self.sources[source_name] + for i in range(n): + next_start = source.find('\n', start) + if next_start == -1: + return None + start = next_start + 1 + return start, source.find('\n', start) + + def get_lines_containing(self, span: 'Span'): + if span.source_name not in self.sources: + raise KeyError("Unknown source file \"{}\"!".format(span.source_name)) + start = 0 + line_no = 0 + source = self.sources[span.source_name] + while True: + next_start = source.find('\n', start) + line_no += 1 + # handle eof + if next_start == -1: + return None + # as long as the next newline comes before the spans start we are good + if next_start < span.start: + start = next_start + 1 + continue + # if the whole span is on one line, we are good as well + if next_start >= span.end: + return [ source[start:next_start] ], start, line_no + while next_start < span.end: + next_start = source.find('\n', next_start+1) + + return source[start:next_start].split('\n'), start, line_no + + +@dataclass(frozen=True) +class Span: + start: int + """ + Start of tokens location in source file, global byte offset in file + """ + end: int + """ + End of tokens location in source file, global byte offset in file + """ + source_name: str + context: LexingContext + + def union(self, *spans: 'Span'): + for span in spans: + assert span.source_name == self.source_name + assert span.context == self.context + return Span( + start=min(self.start, *(span.start for span in spans)), + end=max(self.end, *(span.end for span in spans)), + source_name=self.source_name, + context=self.context + ) + + def transform(self, start:int=0, end:int=0): + return Span(self.start + start, self.end - end, self.source_name, self.context) + + def __repr__(self): + return "{}(start={},end={},source_name={})".format( + self.__class__.__name__, + self.start, self.end, self.source_name + ) + +class TokenType(Enum): + Keyword = auto() + Integer = auto() + Float = auto() + Identifier = auto() + String = auto() + LBracket = auto() + RBracket = auto() + Operator = auto() + LineComment = auto() + MultiComment = auto() + EOL = auto() # End of Line + EOI = auto() # End of Input + + +@dataclass(frozen=True) +class Token: + span: Span + content: str + kind: TokenType + + def __new__(cls, span: 
Span, content: str, kind: TokenType): + if kind in token_type_to_subclass_map and cls == Token: + return token_type_to_subclass_map[kind].__new__( + token_type_to_subclass_map[kind], span, content, kind + ) + return super().__new__(cls) + + def __repr__(self): + fields_to_print = [field for field in fields(self) if field.name not in ('span', 'context', 'kind')] + if self.__class__ == Token: + return "{}[{}]({})".format( + self.__class__.__name__, + self.kind.name, + ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print) + ) + else: + return "{}({})".format( + self.__class__.__name__, + ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print) + ) + + +@dataclass(frozen=True, init=False, repr=False) +class IntegerLiteralToken(Token): + value: int + format: Literal['hex', 'int', 'dec', 'oct'] + suffix: str | None + + def __init__(self, span: Span, content: str, kind: TokenType): + super().__init__(span, content, kind) + assert kind == TokenType.Integer + + suffix = None + for suffix_ in integer_type_suffixes: + if content.endswith(suffix_): + suffix = suffix_ + content.removesuffix(suffix_) + break + + format = 'dec' + if content.startswith('0x') or content.startswith('0X'): + value = int(content, 16) + format = 'hex' + elif content.startswith('0b'): + value = int(content, 2) + format = 'bin' + else: + value = int(content, 10) + object.__setattr__(self, "value", value) + object.__setattr__(self, "suffix", suffix) + object.__setattr__(self, "format", format) + + +@dataclass(frozen=True, repr=False) +class KeywordToken(Token): + pass + +@dataclass(frozen=True, repr=False) +class OperatorToken(Token): + pass + +@dataclass(frozen=True, repr=False) +class IdentifierToken(Token): + pass + + + +token_type_to_subclass_map = { + TokenType.Integer: IntegerLiteralToken, + TokenType.Keyword: KeywordToken, + TokenType.Operator: OperatorToken, + TokenType.Identifier: IdentifierToken +} + + +keywords = { + 'const', 'let', 'for', 'if', 'function', + 'true', 'false', 'in', 'not', 'or', 'and', + 'struct', 'private', 'public', 'return', + 'impure', 'while', 'use', 'do', 'continue', + 'break' +} + +digits = { + 'bin': '01_', + 'hex': '0123456789abcdefABCDEF_', + 'dec': '0123456789_', + 'oct': '01234567_' # TODO: implement octal literals? 
+} + +operators = { + '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||', + '??', '%','==', '!=', '<=', '>=', '..', '=>', '++', '--', + '=', '*=', '+=', '/=', '-=' +} + +integer_type_suffixes = { + 'i8', 'i16', 'i32', 'i64', + 'u8', 'u16', 'u32', 'u64', +} + +reserved_special_chars = { + '#', '~', '`', '"', '\'', '@', '|', ';' +} + +parens = '[]{}()<>' + +identifier_terminating_chars = set((*operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars)) \ No newline at end of file diff --git a/compiler/errors.py b/compiler/errors.py new file mode 100644 index 0000000..5f64d3b --- /dev/null +++ b/compiler/errors.py @@ -0,0 +1,90 @@ +from .defs import Span, LexingContext, Token, TokenType +from math import exp, log10, ceil +from typing import Iterable + + + + +def create_span_context_str(span: Span, message: str, color: str = '\033[31m'): + lines, offset_into_file, line_no = span.context.get_lines_containing(span) + relative_offset = span.start - offset_into_file + annotation_len = span.end - span.start + + digit_len = ceil(log10(line_no + len(lines))) + if digit_len == 0: + digit_len = 1 + + output_str = ">>> In file {}:{}\n".format(span.source_name, line_no) + + for i, source_line in enumerate(lines): + source_line = source_line[:relative_offset] + color + source_line[relative_offset:relative_offset+annotation_len] + '\033[0m' + source_line[relative_offset+annotation_len:] + output_str += '{:>{}d}: {}\n'.format(line_no + i, digit_len, source_line) + + if relative_offset > len(source_line): + continue + # TODO: handle multi-line underlines + output_str += "{}{}{}{}\n".format( + color, + ' ' * (relative_offset + digit_len + 2), + '^' * min(annotation_len, len(source_line) - relative_offset), + '\033[0m' + ) + if annotation_len > len(source_line) - relative_offset: + relative_offset = 0 + annotation_len -= len(source_line) - relative_offset + + if message: + output_str += color + output_str += ' ' * (relative_offset + digit_len + 2) + '|\n' + for message_line in message.split("\n"): + output_str += ' ' * (relative_offset + digit_len + 2) + message_line + '\n' + + return output_str + '\033[0m' + +def print_warning(span: Span, message: str, color="\033[33m"): + print(create_span_context_str(span, "Warning: " + message, color)) + + +class CompilerError(Exception): + span: Span + message: str + + def __init__(self, msg: str, span: Span=None) -> None: + super().__init__((msg, span)) + self.span = span + self.message = msg + + + def print_context_message(self): + if not self.span: + print("\n".join(">>> {}".format(line) for line in self.message.split('\n'))) + else: + print(create_span_context_str(self.span, self.message)) + + +class EndOfInputError(CompilerError): + def __init__(self,span: Span, search_str:str = None) -> None: + + if search_str: + super().__init__(f"Unexpected end-of-input in {span.source_name} while scanning for {search_str}!", span) + else: + super().__init__(f"Unexpected end-of-input in {span.source_name}!", span) + +class ParseError(CompilerError): + def __init__(self, msg: str, span: Span = None) -> None: + super().__init__(msg, span) + +class InvalidTokenError(CompilerError): + def __init__(self, token: Token, expected_type: Iterable[str | TokenType] = None, message: str = None) -> None: + + expected = ", expected {}".format(", ".join(f"{x}" for x in expected_type)) if expected_type else "" + + super().__init__("Unexpected token {}{} {}".format( + token, expected, '\n' + message if message else "" + ), token.span if token is not None else None) + 
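+# Rough usage sketch (an assumption based on test.py in this commit, not a fixed
+# API; `fname` and `ctx` are placeholder names): driver code is expected to catch
+# CompilerError and render the offending source region itself, e.g.
+#
+#     try:
+#         Parser(Lexer(fname, ctx)).parse()
+#     except CompilerError as err:
+#         err.print_context_message()
+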
+ +class UnsupportedSyntaxError(CompilerError): + def __init__(self, token: Token, feature: str) -> None: + super().__init__("Unsupported syntax: {}".format(feature), token.span) + diff --git a/compiler/helpers.py b/compiler/helpers.py new file mode 100644 index 0000000..d267729 --- /dev/null +++ b/compiler/helpers.py @@ -0,0 +1,74 @@ +from typing import TypeVar, Generic, Iterable, Iterator, List +from .defs import Token, TokenType + + +T = TypeVar("T") +class PeekableIterator(Iterator[T]): + _peeked: List[T] + last_item: T | None + + def __init__(self, iterable: Iterable[T]) -> None: + self.iter = iterable + self._peeked = list() + self.last_item = None + + def peek(self, offset: int = 0): + while len(self._peeked) <= offset: + try: + self._peeked.append(next(self.iter)) + except StopIteration: + return None + + return self._peeked[offset] + + def __next__(self) -> T: + if len(self._peeked) > 0: + item = self._peeked.pop(0) + else: + item = next(self.iter) + self.last_item = item + return item + + def __iter__(self) -> Iterator[T]: + return self + + def next(self) -> T: + try: + return next(self) + except StopIteration: + return None + + def has_next(self): + return self.peek() is not None + + +class ParserIterator(PeekableIterator[Token]): + def __init__(self, iterable: Iterable[Token]) -> None: + super().__init__(t for t in iterable if t.kind not in (TokenType.LineComment, TokenType.MultiComment)) + self.ignore_newline = False + + def peek(self, offset: int = 0): + while len(self._peeked) <= offset: + try: + self._peeked.append(next(self.iter)) + except StopIteration: + return None + + token = self._peeked[offset] + + if self.ignore_newline and token.kind == TokenType.EOL: + return self.peek(offset=offset+1) + + return token + + def __next__(self) -> T: + if len(self._peeked) > 0: + item = self._peeked.pop(0) + else: + item = next(self.iter) + self.last_item = item + + if self.ignore_newline and item.kind == TokenType.EOL: + return next(self) + + return item \ No newline at end of file diff --git a/compiler/lexer.py b/compiler/lexer.py new file mode 100644 index 0000000..e6e05fe --- /dev/null +++ b/compiler/lexer.py @@ -0,0 +1,201 @@ +from ast import operator +from dataclasses import dataclass +from sre_parse import parse_template +from typing import Dict, List, Iterable, Generator, Tuple +from enum import Enum, auto +from unicodedata import digit +from .errors import EndOfInputError, ParseError +from .defs import Span, LexingContext, Token, TokenType, digits, keywords, operators, integer_type_suffixes, parens, identifier_terminating_chars + + +class Lexer: + separators = ':-+*/#!"\'=?%&<>[]{}()\n \t' + parens = '[]{}()<>' + + context: LexingContext + content: str + pos: int + line: int + fname: str + size: int + + def __init__(self, fname: str, context: LexingContext): + self.content = context.sources[fname] + self.fname = fname + self.pos = self.line = 0 + self.word = "" + self.size = len(self.content) + self.context = context + + def peek(self, offset: int = 0): + if self.pos + offset >= self.size: + return None + return self.content[self.pos + offset] + + def startswith(self, *patterns: str, offset: int = 0): + # match longest first + for pattern in sorted(patterns, key=len, reverse=True): + if self.content.startswith(pattern, self.pos + offset): + return pattern + return False + + + def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]: + start = self.pos + pos = self.pos + while not self.content[pos:].startswith(pattern) and pos < self.size: + pos += 1 + if pos == 
self.size: + raise EndOfInputError(Span(start, pos, self.fname, self.context), pattern) + + if inclusive: + pos += len(pattern) + self.pos = pos + + return self.content[start:pos], Span(start, pos, self.fname, self.context) + + def do_parse(self) -> Iterable[Token]: + while True: + c = self.peek() + # reached end of input + if c == None: + yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI) + break + + if c in '\n\r': + start = self.pos + if self.startswith('\r\n'): + self.pos += 1 + self.pos += 1 + yield self.Token(start, TokenType.EOL) + continue + + # check for integer literals + if self.startswith('0x', '0X', '0b', *'0123456789'): + yield self.parse_integer() + continue + + # check for parenthesis + if c in parens: + # left parens at position 0, 2, 4, 6 + left_paren = parens.index(c) % 2 == 0 + self.pos += 1 + yield self.Token(self.pos -1, TokenType.LBracket if left_paren else TokenType.RBracket) + continue + + if self.startswith('//'): + start = self.pos + self.read_until('\n', inclusive=False) # read until newline, but don't consume newline + yield self.Token(start, TokenType.LineComment) + continue + + if self.startswith('/*'): + start = self.pos + self.read_until('*/') + yield self.Token(start, TokenType.MultiComment) + continue + + starts_with_keyword = self.startswith(*keywords) + if starts_with_keyword: + start = self.pos + self.pos += len(starts_with_keyword) + yield self.Token(start, TokenType.Keyword) + self.consume_expected_whitespace() + continue + + starts_with_operator = self.startswith(*operators) + if starts_with_operator: + start = self.pos + self.pos += len(starts_with_operator) + yield self.Token(start, TokenType.Operator) + continue + + if self.peek() in '"\'': + yield self.parse_string() + continue + + if self.peek() in ' \t': + self.pos += 1 + continue + + # must be an identifier then + start = self.pos + while self.peek() not in identifier_terminating_chars: + self.pos += 1 + if start == self.pos: + raise ParseError("Expected identifier!", Span(start, start+1, self.fname, self.context)) + yield self.Token(start, TokenType.Identifier) + continue + + def consume_expected_whitespace(self): + if self.peek() in '\r\n': + return + if self.peek() not in '\t ': + raise ParseError("Expected whitespace here", Span(self.pos, self.pos+1, self.fname, self.context)) + while self.peek() in '\t ': + self.pos += 1 + + def parse_integer(self): + start = self.pos + + if self.startswith('-'): + self.pos += 1 + + parse_type = 'dec' + if self.startswith('0x', '0X'): + parse_type = 'hex' + self.pos += 2 + elif self.startswith('0b'): + parse_type = 'bin' + self.pos += 2 + + while self.peek() in digits[parse_type]: + self.pos += 1 + + suffix = self.startswith(*integer_type_suffixes) + if suffix: + self.pos += len(suffix) + + return self.Token(start, TokenType.Integer) + + + def parse_string(self): + start = self.pos + terminator = self.peek() + escaped = False + self.pos += 1 + string = "" + while not escaped and self.peek() != terminator: + char = self.peek() + if escaped: + match char: + case 'r': + string += '\r' + case 'b': + string += '\b' + case 'n': + string += '\n' + case 't': + string += '\t' + case 'e': # support terminal escape codes + string += '\033' + case other: + string += '\\' + other + escaped = False + elif self.peek() == '\\': + escaped = True + else: + string += char + self.pos += 1 + # consume trailing terminator + self.pos += 1 + + return self.Token(start, TokenType.String, content=string) + + def Token(self, start: int, type: 
TokenType, end=None, content=None) -> Token: + if end is None: + end = self.pos + if content is None: + content = self.content[start:end] + return Token(Span(start, end, self.fname, self.context), content, type) + diff --git a/compiler/parser.py b/compiler/parser.py new file mode 100644 index 0000000..b6bda87 --- /dev/null +++ b/compiler/parser.py @@ -0,0 +1,466 @@ +from dataclasses import dataclass +import imp +from webbrowser import Opera +from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span +from .lexer import Lexer +from .helpers import ParserIterator +from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning + +from typing import Tuple, Optional, List, Dict, Set, Iterable + + + + +@dataclass(frozen=True) +class Type: + name: IdentifierToken | str + wraps: Tuple['Type', ...] + +@dataclass(frozen=True) +class Value: + type: Optional[Type] + value: 'ASTNode' + +@dataclass(frozen=True) +class FunctionArgument: + name: IdentifierToken + type: Type + #default_value: Value | None + +@dataclass(frozen=True) +class ASTNode: + pass + +@dataclass(frozen=True) +class FunctionNode(ASTNode): + name: IdentifierToken + args: Tuple[FunctionArgument, ...] + return_type: Type + contents: Tuple[ASTNode, ...] + + +@dataclass(frozen=True) +class FunctionCallNode(ASTNode): + function: Value + arguments: List[Value] + +@dataclass(frozen=True) +class VariableDeclarationNode(ASTNode): + name: IdentifierToken + modifiers: List[str] + type: Type + value: Value + +@dataclass(frozen=True) +class ForLoopNode(ASTNode): + variable_name: IdentifierToken | str + iterator: Value + body: List[ASTNode] + +@dataclass(frozen=True) +class SpreadOperatorNode(ASTNode): + left_side: Value + right_side: Value + type: Type | None + +@dataclass(frozen=True) +class IntegerImmediateNode(ASTNode): + value: int + type: Type + span: Span + +@dataclass(frozen=True) +class StringImmediateNode(ASTNode): + value: str + type: Type + span: Span + +@dataclass(frozen=True) +class VariableNameNode(ASTNode): + name: IdentifierToken + +@dataclass(frozen=True) +class BracketetExpressionNode(ASTNode): + content: ASTNode + +@dataclass(frozen=True) +class UseStatement(ASTNode): + path: List[IdentifierToken] + + +class Parser: + """ + This class takes a lexed input and produces a syntax tree. + + This only validates syntax, but does no type checking etc... 
+ """ + types: Dict[str, Type] + + lexer: Lexer + tokens: ParserIterator[Token] + + def __init__(self, lexer: Lexer): + self.variables = dict() + self.lexer = lexer + # strip comments from tokens + self.tokens = ParserIterator(lexer.do_parse()) + + def parse(self): + body = [] + + while True: + thing = self.parse_file_level_block() + if thing is None: + return body + body.append(thing) + print(thing) + + + def consume_next_token(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None): + if not isinstance(types, set) and types is not None: + types = {types} + + peeked = self.tokens.peek() + if peeked is None: + raise EndOfInputError(self.tokens.last_item.span, content) + if types is not None and peeked.kind not in types: + raise InvalidTokenError(peeked, (*types, content), msg) + if content is not None and peeked.content != content: + raise InvalidTokenError(peeked, {content}, msg) + + return self.tokens.next() + + def consume_optional_eol(self): + """ + This function tries to consume EOL tokens, if they are available + """ + while self.tokens.peek().kind == TokenType.EOL: + self.tokens.next() + + def consume_expected_eol(self, msg): + """ + This function consumes at least one EOL token, or fails + """ + if self.tokens.peek().kind != TokenType.EOL: + raise InvalidTokenError(self.tokens.peek(), expected_type=["\\n"], message=msg) + while self.tokens.peek().kind == TokenType.EOL: + self.tokens.next() + + + + def consume_optional(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None): + try: + return self.consume_next_token(types, content, msg) + except InvalidTokenError: + return False + + + def parse_file_level_block(self) -> ASTNode | None: + """ + File-level blocks are statements written at the file level. + + + """ + # this part ignores newlines! + prev_ignore_lvl = self.tokens.ignore_newline + self.tokens.ignore_newline = True + + try: + match self.tokens.peek(): + case KeywordToken(content="function"): + return self.parse_function_definition() + case KeywordToken(content="struct"): + return self.parse_struct() + case KeywordToken(content="const"): + return self.parse_const_declaration() + case KeywordToken(content="use"): + return self.parse_import_statement() + case Token(kind=TokenType.EOI): + return None + case None: + raise Exception("Unexpected None token!") + case unknown_token: + raise InvalidTokenError(unknown_token, ("function", "struct"), "Only function and struct declarations are allowed at file-level!") + finally: + self.tokens.ignore_newline = prev_ignore_lvl + + + def parse_import_statement(self): + """ + parse an import-equivalent statement: + + use std.String + + """ + self.consume_next_token(types=TokenType.Keyword, content="use") + + path = [] + + if self.tokens.peek().kind == TokenType.String: + raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!") + + prev = self.tokens.ignore_newline + self.tokens.ignore_newline = False + + while self.tokens.peek().kind != TokenType.EOL: + path.append(self.consume_next_token(TokenType.Identifier)) + if self.tokens.peek().content == '.': + self.consume_next_token(types=TokenType.Operator, content='.') + + self.consume_expected_eol("'use' statement must be terminated by EOL!") + + self.tokens.ignore_newline = prev + + return UseStatement(path) + + + + def parse_basic_block(self) -> Iterable[ASTNode]: + """ + A "Basic Block" is a block inside a function, for loop, etc. + """ + # when parsing blocks, newlines are important! 
+ prev_ignore_lvl = self.tokens.ignore_newline + + self.tokens.ignore_newline = False + + if prev_ignore_lvl: + # consume all remaining EOLs + self.consume_optional_eol() + + try: + while True: + match self.tokens.peek(): + case KeywordToken(content="function"): + yield self.parse_function_definition() + case KeywordToken(content="const"): + yield self.parse_const_declaration() + case KeywordToken(content="let"): + raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet") + case KeywordToken(content="for"): + yield self.parse_for_statement() + case KeywordToken(content="return"): + raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet") + case KeywordToken(content="if"): + raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet") + case KeywordToken(content="struct"): + # TODO: support + raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet") + case Token(kind=TokenType.RBracket, content="}"): + break + case other: + yield self.parse_value() + self.consume_expected_eol(msg="Only one statement per line permitted!") + finally: + self.tokens.ignore_newline = prev_ignore_lvl + + + + def parse_function_definition(self): + """ + Parses a function definition including the body + """ + self.tokens.next() + + function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by identifier!") + + # consume parenthesis + self.consume_next_token(types = TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parenthesis!") + + args = [] + + # TODO: we actually want to match against Token(kind=TokenType.RParen, content=")") + while self.tokens.peek().content != ')': + args.append(self.parse_function_def_arg()) + self.consume_optional_eol() + if not self.consume_optional(content=','): + break + self.consume_optional_eol() + + self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!") + + if self.tokens.peek().content == '->': + raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!") + + if self.tokens.peek().content == '=>': + raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!") + + self.consume_next_token(types=TokenType.LBracket, content="{") + + content = list(self.parse_basic_block()) + + self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!") + + return FunctionNode(function_name, args, None, content) + + + def parse_function_def_arg(self) -> FunctionArgument: + """ + Parse a single argument of a function. 
+
+        Currently this allows: name: type
+
+        In the future we want to also support: name: type = value
+        """
+        identifier = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")
+
+        self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")
+
+        type = self.parse_type()
+
+        if self.tokens.peek().content == '=':
+            raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")
+
+        return FunctionArgument(identifier, type)
+
+
+    def parse_type(self) -> Type:
+        """
+        Parse a type declaration, such as String, i64, or Vector
+        """
+        main_type = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")
+
+        # if this type does not wrap any other types, we are done!
+        if self.tokens.peek().content != '<':
+            return Type(main_type, [])
+
+        wrapped_types = []
+        start_token = self.consume_next_token(content="<")
+
+        while self.tokens.peek().content != '>':
+            wrapped_types.append(self.parse_type())
+            if not self.consume_optional(content=","):
+                break
+            self.consume_optional_eol()
+
+        self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")
+
+        if len(wrapped_types) == 0:
+            print_warning(self.tokens.last_item.span.union(start_token.span), "Empty set of type arguments!")
+
+        return Type(main_type, wrapped_types)
+
+    def parse_const_declaration(self):
+        """
+        Parse a const declaration, i.e.
+
+        const name: type = value
+        """
+        self.consume_next_token(types=TokenType.Keyword, content="const")
+        identifier = self.consume_next_token(types=TokenType.Identifier, msg="const keyword must be immediately followed by a variable name!")
+
+        type = None
+        if self.tokens.peek().content == ':':
+            self.consume_next_token(content=':')
+            type = self.parse_type()
+
+        self.consume_next_token(content='=', msg="Expected '=' in const declaration!")
+
+        value = self.parse_value()
+
+        self.consume_expected_eol("Const declaration statement must be terminated by a newline!")
+
+        return VariableDeclarationNode(identifier, ['const'], type, Value(None, value))
+
+    def parse_value(self) -> ASTNode:
+        """
+        This function parses a "value", i.e. any expression that evaluates to a value.
+
+        This can be a literal, a function call, an array/struct constructor, etc.
+        """
+        # handle bracketed expression
+        if self.tokens.peek().content == '(':
+            self.consume_next_token(content='(')
+            self.consume_optional_eol()
+            value = self.parse_value()
+            self.consume_optional_eol()
+            self.consume_next_token(content=')', msg="Expected closing bracket")
+        else:
+            value = self._inner_parse_value()
+
+        match self.tokens.peek():
+            case OperatorToken(content='..'):
+                self.consume_next_token(types=TokenType.Operator, content="..")
+                # note: the right-hand side of '..' is currently parsed as a type name
+                right_hand_type = self.parse_type()
+                return SpreadOperatorNode(value, right_hand_type, None)
+            #case OperatorToken(content):
+            #    raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
+            case Token(kind=TokenType.LBracket, content="("):
+                self.consume_next_token(content="(")
+                self.consume_optional_eol()
+                args = list(self.parse_inner_function_call_args())
+                self.consume_next_token(content=')', msg="Expected ')' at the end of a function call argument list!")
+                return FunctionCallNode(value, args)
+            case _:
+                # plain value with no trailing operator or call
+                return value
+
+    def _inner_parse_value(self) -> ASTNode:
+        match self.tokens.peek():
+            case IntegerLiteralToken(value=value, suffix=suffix, span=span):
+                if suffix:
+                    type = Type(suffix, [])
+                else:
+                    # assume widest signed integer type available
+                    type = Type('i64', [])
+                self.consume_next_token()
+                return IntegerImmediateNode(value, type, span)
+            case Token(span, content, kind=TokenType.String):
+                self.consume_next_token()
+                return StringImmediateNode(content, Type("String", []), span)
+            case Token(content="{", kind=TokenType.LBracket):
+                return self.parse_structured_value()
+            case IdentifierToken():
+                return VariableNameNode(self.consume_next_token(TokenType.Identifier))
+            case other:
+                raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")
+
+    def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
+        if self.tokens.peek().content == ')':
+            return
+
+        while True:
+            self.consume_optional_eol()
+            yield self.parse_value()
+            self.consume_optional_eol()
+
+            if self.tokens.peek().content == ',':
+                self.consume_next_token(content=",")
+                continue
+
+            if self.tokens.peek().content == ')':
+                break
+
+    def parse_structured_value(self) -> ASTNode:
+        """
+        Parse either a list or struct initializer:
+
+        list initializer:
+            const data: Vector = {1,2,3,4,5}
+
+        struct initializer:
+            const data: MyStruct = {
+                field1: 1
+                field2: "Hello World"
+                arrayField: {"test", "123", "these are strings"}
+            }
+        """
+        raise UnsupportedSyntaxError(self.tokens.peek(), "Structured values such as lists, dictionaries and structs!")
+
+    def parse_for_statement(self) -> ForLoopNode:
+        self.consume_next_token(content='for', types=TokenType.Keyword)
+
+        loop_variable_name = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")
+
+        self.consume_next_token(types=TokenType.Keyword, content="in", msg="'for <name> in <value>' format required!")
+
+        iterator = self.parse_value()
+
+        self.consume_next_token(content='{')
+        self.consume_optional_eol()
+
+        body = list(self.parse_basic_block())
+
+        self.consume_optional_eol()
+        self.consume_next_token(content='}')
+        self.consume_optional_eol()
+
+        return ForLoopNode(loop_variable_name, iterator, body)
diff --git a/example.pmp b/example.pmp
new file mode 100644
index 0000000..1a6c3d8
--- /dev/null
+++ b/example.pmp
@@ -0,0 +1,21 @@
+/*struct something {
+    i64 i
+    Array data
+
+    private i64 total
+}*/
+
+use std.string
+
+function main(args: Array) {
+    const cars = 100
+
+    for x in 0x01..cars {
+        print("{} cars are driving around the block", cars,)
+    }
+}
+
+
+
+
+
diff --git a/runtime_lib/ProgramQueue.cpp b/runtime_lib/ProgramQueue.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/runtime_lib/ProgramQueue.h b/runtime_lib/ProgramQueue.h
new file mode 100644
index 0000000..3f59c93
--- /dev/null
+++ b/runtime_lib/ProgramQueue.h
@@ -0,0 +1,2 @@
+#pragma once
+
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..e8e3415
--- /dev/null
+++ b/test.py
@@ -0,0 +1,25 @@
+from compiler.errors import *
+from compiler.lexer import Lexer, LexingContext
+from compiler.parser import Parser
+import os
+
+fname = os.path.abspath('./example.pmp')
+
+c = LexingContext(dict(), fname, dict(), dict())
+
+try:
+    with open(fname, 'r') as f:
+        c.sources[fname] = f.read()
+
+    lex = Lexer(fname, c)
+
+    #for token in lex.do_parse():
+    #    print(token)
+
+    parser = Parser(lex)
+
+    elems = parser.parse()
+
+
+except CompilerError as err:
+    err.print_context_message()
diff --git a/zipfs.pmp b/zipfs.pmp
new file mode 100644
index 0000000..a506c30
--- /dev/null
+++ b/zipfs.pmp
@@ -0,0 +1,25 @@
+use std.random randint
+
+const ALPHABET = 26
+
+function main() {
+
+    // generate a lot of words
+    map const x in 0..10000 into words {
+        let word = 0
+        let len = 0
+        let char = randint(ALPHABET)
+
+        while char != 0 {
+            word *= ALPHABET
+            word += char
+            len++
+
+            char = randint(ALPHABET)
+        }
+
+        yield word
+    }
+
+
+}
\ No newline at end of file
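Addendum (not part of the commit): a minimal sketch of how the Token factory in compiler/defs.py can be exercised directly; the "demo" source name and the span bounds are illustrative values, not anything defined in this repository.

    from compiler.defs import LexingContext, Span, Token, TokenType, KeywordToken

    # "demo" and the span bounds are made-up values for illustration only.
    ctx = LexingContext(sources={"demo": "const x = 1"}, entrypoint="demo", structs={}, functions={})
    tok = Token(Span(0, 5, "demo", ctx), "const", TokenType.Keyword)

    # Token.__new__ dispatches through token_type_to_subclass_map, so constructing
    # a plain Token with kind=TokenType.Keyword yields a KeywordToken instance.
    assert isinstance(tok, KeywordToken)
    print(tok)  # KeywordToken(content='const')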