parser is working with very basic functionality

This commit is contained in:
Anton Lydike 2022-06-16 16:17:49 +02:00
commit 525ef8f467
14 changed files with 1119 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
venv
__pycache__

0
Makefile Normal file
View File

0
compiler/__init__.py Normal file
View File

0
compiler/ast_printer.py Normal file
View File

213
compiler/defs.py Normal file
View File

@ -0,0 +1,213 @@
from dataclasses import dataclass, fields
from mimetypes import suffix_map
from ntpath import join
from typing import Dict, List, Iterable, Generator, Any, Literal
from enum import Enum, auto
@dataclass
class LexingContext:
    """
    Shared state for lexing a set of source files.

    Attributes:
        sources: maps a source name to the complete text of that source.
        entrypoint: name of the entry source (only stored by this class).
        structs: struct definitions (value type still to be decided).
        functions: function definitions (value type still to be decided).
    """
    sources: Dict[str,str]
    entrypoint: str
    structs: Dict[int, Any] # TODO: struct def type
    functions: Dict[str, Any] # TODO: function types

    def get_nth_line_bounds(self, source_name: str, n: int):
        """
        Return ``(start, end)`` offsets of the n-th (0-based) line of a source.

        ``end`` is the offset of the terminating newline, or the length of the
        source when the line is the last one and has no trailing newline.
        Returns None if the source has fewer than ``n + 1`` lines.

        Raises:
            KeyError: if ``source_name`` is not a known source.
        """
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        source = self.sources[source_name]
        start = 0
        for _ in range(n):
            newline = source.find('\n', start)
            if newline == -1:
                return None
            start = newline + 1
        end = source.find('\n', start)
        # an unterminated last line ends at end-of-source, not at index -1
        return start, (len(source) if end == -1 else end)

    def get_lines_containing(self, span: 'Span'):
        """
        Return the source lines touched by ``span``.

        The result is ``(lines, start, line_no)`` where ``lines`` is the list
        of complete lines (without newlines), ``start`` is the offset of the
        first of those lines in the source and ``line_no`` is its 1-based line
        number. The last line of a source is handled even when it has no
        trailing newline (this includes spans located at end-of-input).

        Raises:
            KeyError: if ``span.source_name`` is not a known source.
        """
        if span.source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(span.source_name))
        source = self.sources[span.source_name]
        span_start = max(0, span.start)
        # the first line starts right after the last newline before the span
        first_line_start = source.rfind('\n', 0, span_start) + 1
        line_no = source.count('\n', 0, first_line_start) + 1
        # the last line ends at the first newline at or after the span's end
        last_line_end = source.find('\n', max(span_start, span.end))
        if last_line_end == -1:
            last_line_end = len(source)
        lines = source[first_line_start:last_line_end].split('\n')
        return lines, first_line_start, line_no
@dataclass(frozen=True)
class Span:
    """
    A half-open region ``[start, end)`` of one source file.

    Attributes:
        start: start of the token's location, global offset in the source.
        end: end of the token's location, global offset in the source.
        source_name: key of the source inside ``context.sources``.
        context: the lexing context that owns the source text.
    """
    start: int
    end: int
    source_name: str
    context: 'LexingContext'

    def union(self, *spans: 'Span'):
        """
        Return the smallest span covering this span and all ``spans``.

        Calling it without arguments returns a span equal to this one.
        All spans must belong to the same source and context.
        """
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        # min()/max() get a single tuple so that zero extra spans also works
        return Span(
            start=min((self.start, *(span.start for span in spans))),
            end=max((self.end, *(span.end for span in spans))),
            source_name=self.source_name,
            context=self.context
        )

    def transform(self, start:int=0, end:int=0):
        """Return a copy shrunk by ``start`` at the front and ``end`` at the back."""
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        # the context is left out on purpose: it holds whole source texts
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )
class TokenType(Enum):
    """Kinds of tokens; the values are the consecutive integers starting at 1."""
    # words and literals
    Keyword = 1
    Integer = 2
    Float = 3
    Identifier = 4
    String = 5
    # punctuation
    LBracket = 6
    RBracket = 7
    Operator = 8
    # comments
    LineComment = 9
    MultiComment = 10
    # structural markers
    EOL = 11  # End of Line
    EOI = 12  # End of Input
@dataclass(frozen=True)
class Token:
    """
    A lexed token: its location, its text content and its kind.

    Constructing ``Token(...)`` directly returns an instance of the subclass
    registered for ``kind`` in ``token_type_to_subclass_map`` when there is one.
    """
    span: Span
    content: str
    kind: TokenType

    def __new__(cls, span: Span, content: str, kind: TokenType):
        specialised = token_type_to_subclass_map.get(kind)
        # only redirect when the base class itself is being instantiated
        if specialised is not None and cls is Token:
            return specialised.__new__(specialised, span, content, kind)
        return super().__new__(cls)

    def __repr__(self):
        hidden = ('span', 'context', 'kind')
        shown = ", ".join(
            "{}={}".format(field.name, repr(getattr(self, field.name)))
            for field in fields(self)
            if field.name not in hidden
        )
        class_name = self.__class__.__name__
        if self.__class__ == Token:
            # plain tokens show their kind, subclasses are identified by name
            return "{}[{}]({})".format(class_name, self.kind.name, shown)
        return "{}({})".format(class_name, shown)
@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    """
    Token for an integer literal, with the literal already evaluated.

    Attributes:
        value: the numeric value of the literal.
        format: notation the literal was written in ('dec', 'hex' or 'bin'
            are the values assigned by ``__init__``).
        suffix: the integer type suffix (one of ``integer_type_suffixes``)
            that ended the literal, or None.
    """
    value: int
    format: Literal['hex', 'int', 'dec', 'oct', 'bin']
    suffix: str | None

    def __init__(self, span: Span, content: str, kind: TokenType):
        """
        Evaluate ``content`` (e.g. ``42``, ``0x2Au8``, ``0b101``).

        Raises:
            ValueError: if the digits are not valid for the detected base.
        """
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer
        suffix = None
        digits_text = content
        for candidate in integer_type_suffixes:
            if content.endswith(candidate):
                suffix = candidate
                # str.removesuffix returns a new string; keep the result so
                # the type suffix is not passed on to int()
                digits_text = content.removesuffix(candidate)
                break
        number_format = 'dec'
        if digits_text.startswith(('0x', '0X')):
            value = int(digits_text, 16)
            number_format = 'hex'
        elif digits_text.startswith('0b'):
            value = int(digits_text, 2)
            number_format = 'bin'
        else:
            value = int(digits_text, 10)
        # the dataclass is frozen, so fields are set through object.__setattr__
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", number_format)
@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    """Token subclass without extra fields, used for keyword tokens."""
    pass
@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    """Token subclass without extra fields, used for operator tokens."""
    pass
@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    """Token subclass without extra fields, used for identifier tokens."""
    pass
# Token kinds that have a dedicated Token subclass. Token.__new__ looks the
# kind up here; kinds that are not listed stay plain Token instances.
token_type_to_subclass_map = {
    TokenType.Integer: IntegerLiteralToken,
    TokenType.Keyword: KeywordToken,
    TokenType.Operator: OperatorToken,
    TokenType.Identifier: IdentifierToken
}
# Reserved words of the language.
keywords = {
    'and', 'break', 'const', 'continue', 'do', 'false', 'for',
    'function', 'if', 'impure', 'in', 'let', 'not', 'or', 'private',
    'public', 'return', 'struct', 'true', 'use', 'while',
}

# Characters allowed inside an integer literal, per notation.
digits = {
    'bin': '01_',
    'hex': '0123456789abcdefABCDEF_',
    'dec': '0123456789_',
    'oct': '01234567_',  # TODO: implement octal literals?
}

# Operator spellings, grouped by length.
operators = {
    # one character
    '+', '-', '*', '/', '%', '!', '.', ',', '<', '>', ':', '=',
    # two characters
    '<<', '>>', '&&', '||', '??', '..', '=>', '++', '--',
    '==', '!=', '<=', '>=',
    '*=', '+=', '/=', '-=',
}

# Type suffixes that may end an integer literal.
integer_type_suffixes = {
    'i8', 'u8',
    'i16', 'u16',
    'i32', 'u32',
    'i64', 'u64',
}

# Characters reserved for special use.
reserved_special_chars = {
    '#', '~', '`', '"', '\'', '@', '|', ';',
}

# Opening brackets sit at even indices, closing brackets at odd indices.
parens = '[]{}()<>'

# Everything that ends an identifier: operators, brackets, whitespace and
# the reserved special characters.
identifier_terminating_chars = {
    *operators,
    *parens,
    ' ', '\n', '\t', '\r',
    *reserved_special_chars,
}

90
compiler/errors.py Normal file
View File

@ -0,0 +1,90 @@
from .defs import Span, LexingContext, Token, TokenType
from math import exp, log10, ceil
from typing import Iterable
def create_span_context_str(span: "Span", message: str, color: str = '\033[31m'):
    """
    Render the source lines touched by ``span`` with the span highlighted.

    The output consists of a ``>>> In file <name>:<line>`` header, every
    affected source line prefixed by its line number with the covered part
    wrapped in ``color``, a row of ``^`` under the covered part of each line
    and finally the (possibly multi-line) ``message``.

    Args:
        span: location to show; its ``context`` provides the source lines.
        message: text printed below the source excerpt; may be empty.
        color: terminal escape sequence used for the highlighting.

    Returns:
        The rendered text, ending with the terminal reset sequence.
    """
    reset = '\033[0m'
    located = span.context.get_lines_containing(span)
    if located is None:
        # the context could not find the lines: show header and message only
        lines, offset_into_file, line_no = [], span.start, 0
        parts = [">>> In file {}\n".format(span.source_name)]
    else:
        lines, offset_into_file, line_no = located
        parts = [">>> In file {}:{}\n".format(span.source_name, line_no)]

    first_offset = span.start - offset_into_file
    # width of the largest line number that gets printed
    digit_len = len(str(line_no + max(len(lines) - 1, 0)))

    relative_offset = first_offset
    remaining = span.end - span.start
    for i, source_line in enumerate(lines):
        # all measurements use the plain line, before colour codes are added
        available = len(source_line) - relative_offset
        marked = max(0, min(remaining, available))
        if marked > 0:
            marked_end = relative_offset + marked
            shown_line = (
                source_line[:relative_offset] + color
                + source_line[relative_offset:marked_end] + reset
                + source_line[marked_end:]
            )
        else:
            shown_line = source_line
        parts.append('{:>{}d}: {}\n'.format(line_no + i, digit_len, shown_line))
        if marked > 0:
            parts.append("{}{}{}{}\n".format(
                color,
                ' ' * (relative_offset + digit_len + 2),
                '^' * marked,
                reset
            ))
        # the rest of this line plus its newline are now accounted for;
        # on the following lines the span continues at column 0
        remaining = max(0, remaining - max(available, 0) - 1)
        relative_offset = 0

    if message:
        # the message lines up with the span start for single-line spans
        message_offset = first_offset if len(lines) <= 1 else 0
        indent = ' ' * (message_offset + digit_len + 2)
        parts.append(color)
        parts.append(indent + '|\n')
        for message_line in message.split("\n"):
            parts.append(indent + message_line + '\n')
    return "".join(parts) + reset
def print_warning(span: Span, message: str, color="\033[33m"):
    """Print ``message``, prefixed with "Warning: ", below the source excerpt for ``span``."""
    warning_text = "Warning: " + message
    rendered = create_span_context_str(span, warning_text, color)
    print(rendered)
class CompilerError(Exception):
    """
    Base class of the compiler's errors.

    Carries a message and, optionally, the span of source it refers to.
    """
    span: Span
    message: str

    def __init__(self, msg: str, span: Span=None) -> None:
        super().__init__((msg, span))
        self.message = msg
        self.span = span

    def print_context_message(self):
        """Print the message, together with the source excerpt when a span is known."""
        if self.span:
            print(create_span_context_str(self.span, self.message))
            return
        # without a span every message line is simply prefixed
        prefixed_lines = [">>> {}".format(line) for line in self.message.split('\n')]
        print("\n".join(prefixed_lines))
class EndOfInputError(CompilerError):
    """Raised when the input ends while more content was expected."""

    def __init__(self,span: Span, search_str:str = None) -> None:
        # mention what was being searched for only when it is known
        detail = f" while scanning for {search_str}" if search_str else ""
        super().__init__(f"Unexpected end-of-input in {span.source_name}{detail}!", span)
class ParseError(CompilerError):
    """Error with the same constructor as CompilerError: ``(msg, span=None)``."""
    # no __init__ override: the inherited constructor takes the same arguments
class InvalidTokenError(CompilerError):
    """Raised for a token that is not valid at its position."""

    def __init__(self, token: Token, expected_type: Iterable[str | TokenType] = None, message: str = None) -> None:
        if expected_type:
            expected = ", expected {}".format(", ".join(f"{x}" for x in expected_type))
        else:
            expected = ""
        # the optional message goes on its own line
        details = '\n' + message if message else ""
        text = "Unexpected token {}{} {}".format(token, expected, details)
        location = None if token is None else token.span
        super().__init__(text, location)
class UnsupportedSyntaxError(CompilerError):
    """Raised for syntax that is not supported; points at the given token."""

    def __init__(self, token: Token, feature: str) -> None:
        description = "Unsupported syntax: {}".format(feature)
        super().__init__(description, token.span)

74
compiler/helpers.py Normal file
View File

@ -0,0 +1,74 @@
from typing import TypeVar, Generic, Iterable, Iterator, List
from .defs import Token, TokenType
T = TypeVar("T")
class PeekableIterator(Iterator[T]):
_peeked: List[T]
last_item: T | None
def __init__(self, iterable: Iterable[T]) -> None:
self.iter = iterable
self._peeked = list()
self.last_item = None
def peek(self, offset: int = 0):
while len(self._peeked) <= offset:
try:
self._peeked.append(next(self.iter))
except StopIteration:
return None
return self._peeked[offset]
def __next__(self) -> T:
if len(self._peeked) > 0:
item = self._peeked.pop(0)
else:
item = next(self.iter)
self.last_item = item
return item
def __iter__(self) -> Iterator[T]:
return self
def next(self) -> T:
try:
return next(self)
except StopIteration:
return None
def has_next(self):
return self.peek() is not None
class ParserIterator(PeekableIterator[Token]):
    """
    Token iterator for the parser.

    Comment tokens are dropped up front. While ``ignore_newline`` is True,
    EOL tokens are skipped by both ``peek`` and ``__next__``.
    """

    def __init__(self, iterable: Iterable[Token]) -> None:
        super().__init__(t for t in iterable if t.kind not in (TokenType.LineComment, TokenType.MultiComment))
        self.ignore_newline = False

    def peek(self, offset: int = 0):
        """
        Return the token ``offset`` positions ahead without consuming it.

        With ``ignore_newline`` set, EOL tokens are not counted, so
        ``peek(n)`` is the token that the (n+1)-th call to ``next`` would
        return. Returns None when the input ends before that position.
        """
        if not self.ignore_newline:
            return super().peek(offset)
        index = 0
        to_skip = offset
        while True:
            token = super().peek(index)
            if token is None:
                return None
            if token.kind != TokenType.EOL:
                if to_skip == 0:
                    return token
                to_skip -= 1
            index += 1

    def __next__(self) -> Token:
        while True:
            # the base class also records the item as last_item
            item = super().__next__()
            if self.ignore_newline and item.kind == TokenType.EOL:
                continue
            return item

201
compiler/lexer.py Normal file
View File

@ -0,0 +1,201 @@
from ast import operator
from dataclasses import dataclass
from sre_parse import parse_template
from typing import Dict, List, Iterable, Generator, Tuple
from enum import Enum, auto
from unicodedata import digit
from .errors import EndOfInputError, ParseError
from .defs import Span, LexingContext, Token, TokenType, digits, keywords, operators, integer_type_suffixes, parens, identifier_terminating_chars
class Lexer:
    """
    Turns the text of one source file into a stream of tokens.

    The lexer keeps a cursor (``pos``) into ``content``; ``do_parse`` is a
    generator that yields tokens until the input is exhausted and then yields
    a final EOI token.
    """
    separators = ':-+*/#!"\'=?%&<>[]{}()\n \t'
    parens = '[]{}()<>'
    # characters that may follow a backslash inside a string, and their meaning
    _escape_sequences = {
        'r': '\r',
        'b': '\b',
        'n': '\n',
        't': '\t',
        'e': '\033',  # support terminal escape codes
        '\\': '\\',
        '"': '"',
        '\'': '\'',
    }
    context: LexingContext
    content: str
    pos: int
    line: int
    fname: str
    size: int

    def __init__(self, fname: str, context: LexingContext):
        """
        Args:
            fname: key of the source to lex inside ``context.sources``.
            context: lexing context holding the source text.
        """
        self.content = context.sources[fname]
        self.fname = fname
        self.pos = self.line = 0
        self.word = ""
        self.size = len(self.content)
        self.context = context

    def peek(self, offset: int = 0):
        """Return the character ``offset`` positions after the cursor, or None past the end."""
        if self.pos + offset >= self.size:
            return None
        return self.content[self.pos + offset]

    def startswith(self, *patterns: str, offset: int = 0):
        """
        Return the longest of ``patterns`` found at the cursor (plus
        ``offset``), or False if none of them is found there.
        """
        # match longest first
        for pattern in sorted(patterns, key=len, reverse=True):
            if self.content.startswith(pattern, self.pos + offset):
                return pattern
        return False

    def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]:
        """
        Advance the cursor to the next occurrence of ``pattern``.

        Args:
            pattern: text to search for, starting at the cursor.
            inclusive: if True the pattern itself is consumed as well.

        Returns:
            The consumed text and its span.

        Raises:
            EndOfInputError: if ``pattern`` does not occur; the cursor is
                left unchanged in that case.
        """
        start = self.pos
        found = self.content.find(pattern, start)
        if found == -1:
            raise EndOfInputError(Span(start, self.size, self.fname, self.context), pattern)
        end = found + len(pattern) if inclusive else found
        self.pos = end
        return self.content[start:end], Span(start, end, self.fname, self.context)

    def _at_word_boundary(self, offset: int) -> bool:
        """Tell whether the character at ``offset`` ends a word (or is end of input)."""
        following = self.peek(offset)
        return following is None or following in identifier_terminating_chars

    def do_parse(self) -> Iterable[Token]:
        """
        Yield the tokens of the source, ending with a single EOI token.

        Raises:
            ParseError: for characters that cannot start any token, or when
                a keyword is not followed by whitespace.
            EndOfInputError: for unterminated strings or multi-line comments.
        """
        while True:
            c = self.peek()
            # reached end of input
            if c is None:
                yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI)
                break
            if c in '\n\r':
                start = self.pos
                # a "\r\n" pair is a single end-of-line token
                self.pos += 2 if self.startswith('\r\n') else 1
                yield self.Token(start, TokenType.EOL)
                continue
            # check for integer literals (the 0x/0b prefixes start with a digit, too)
            if c in '0123456789':
                yield self.parse_integer()
                continue
            # check for parenthesis
            if c in parens:
                # left parens at position 0, 2, 4, 6
                left_paren = parens.index(c) % 2 == 0
                self.pos += 1
                yield self.Token(self.pos - 1, TokenType.LBracket if left_paren else TokenType.RBracket)
                continue
            if self.startswith('//'):
                start = self.pos
                # read until newline, but don't consume the newline; a comment
                # on the last line may also be ended by the end of input
                newline = self.content.find('\n', start)
                self.pos = self.size if newline == -1 else newline
                yield self.Token(start, TokenType.LineComment)
                continue
            if self.startswith('/*'):
                start = self.pos
                self.read_until('*/')
                yield self.Token(start, TokenType.MultiComment)
                continue
            starts_with_keyword = self.startswith(*keywords)
            # only a whole word is a keyword: "into" is an identifier, not "in" + "to"
            if starts_with_keyword and self._at_word_boundary(len(starts_with_keyword)):
                start = self.pos
                self.pos += len(starts_with_keyword)
                yield self.Token(start, TokenType.Keyword)
                self.consume_expected_whitespace()
                continue
            starts_with_operator = self.startswith(*operators)
            if starts_with_operator:
                start = self.pos
                self.pos += len(starts_with_operator)
                yield self.Token(start, TokenType.Operator)
                continue
            if c in '"\'':
                yield self.parse_string()
                continue
            if c in ' \t':
                self.pos += 1
                continue
            # must be an identifier then
            start = self.pos
            while not self._at_word_boundary(0):
                self.pos += 1
            if start == self.pos:
                raise ParseError("Expected identifier!", Span(start, start+1, self.fname, self.context))
            yield self.Token(start, TokenType.Identifier)

    def consume_expected_whitespace(self):
        """
        Skip spaces and tabs at the cursor.

        A line break or the end of input is accepted as well (and left in
        place); anything else raises ParseError.
        """
        c = self.peek()
        if c is None or c in '\r\n':
            return
        if c not in '\t ':
            raise ParseError("Expected whitespace here", Span(self.pos, self.pos+1, self.fname, self.context))
        while (c := self.peek()) is not None and c in '\t ':
            self.pos += 1

    def parse_integer(self):
        """
        Consume an integer literal (optional ``0x``/``0X``/``0b`` prefix,
        digits and ``_``, optional type suffix) and return its token.
        """
        start = self.pos
        if self.startswith('-'):
            self.pos += 1
        parse_type = 'dec'
        if self.startswith('0x', '0X'):
            parse_type = 'hex'
            self.pos += 2
        elif self.startswith('0b'):
            parse_type = 'bin'
            self.pos += 2
        allowed = digits[parse_type]
        # peek() is None at the end of input, which ends the literal
        while (c := self.peek()) is not None and c in allowed:
            self.pos += 1
        suffix = self.startswith(*integer_type_suffixes)
        if suffix:
            self.pos += len(suffix)
        return self.Token(start, TokenType.Integer)

    def parse_string(self):
        """
        Consume a string literal delimited by the quote character at the
        cursor and return its token.

        The token's content is the string with escape sequences resolved;
        an unknown escape keeps its backslash. The span covers the quotes.

        Raises:
            EndOfInputError: if the closing quote is missing.
        """
        start = self.pos
        terminator = self.peek()
        self.pos += 1
        pieces = []
        escaped = False
        while True:
            char = self.peek()
            if char is None:
                raise EndOfInputError(Span(start, self.pos, self.fname, self.context), terminator)
            if escaped:
                pieces.append(self._escape_sequences.get(char, '\\' + char))
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == terminator:
                break
            else:
                pieces.append(char)
            self.pos += 1
        # consume trailing terminator
        self.pos += 1
        return self.Token(start, TokenType.String, content="".join(pieces))

    def Token(self, start: int, type: TokenType, end=None, content=None) -> Token:
        """
        Build a token for the text between ``start`` and ``end``.

        ``end`` defaults to the cursor and ``content`` to the source text of
        that range.
        """
        if end is None:
            end = self.pos
        if content is None:
            content = self.content[start:end]
        return Token(Span(start, end, self.fname, self.context), content, type)

466
compiler/parser.py Normal file
View File

@ -0,0 +1,466 @@
from dataclasses import dataclass
import imp
from webbrowser import Opera
from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span
from .lexer import Lexer
from .helpers import ParserIterator
from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning
from typing import Tuple, Optional, List, Dict, Set, Iterable
@dataclass(frozen=True)
class Type:
    """A type reference: a name plus the types it wraps (the part between '<' and '>')."""
    # name of the type, as an identifier token or a plain string
    name: IdentifierToken | str
    # wrapped types; empty when the type wraps nothing
    wraps: Tuple['Type', ...]
@dataclass(frozen=True)
class Value:
    """An AST node paired with an optional type."""
    # type of the value, or None
    type: Optional[Type]
    # the node that produces the value
    value: 'ASTNode'
@dataclass(frozen=True)
class FunctionArgument:
    """One argument of a function definition, written as ``name: type``."""
    name: IdentifierToken
    type: Type
    #default_value: Value | None
@dataclass(frozen=True)
class ASTNode:
    """Common base class of the syntax tree nodes; has no fields of its own."""
    pass
@dataclass(frozen=True)
class FunctionNode(ASTNode):
    """A function definition: name, arguments, return type and body nodes."""
    name: IdentifierToken
    args: Tuple[FunctionArgument, ...]
    return_type: Type
    # the nodes of the function body
    contents: Tuple[ASTNode, ...]
@dataclass(frozen=True)
class FunctionCallNode(ASTNode):
    """A call: the value being called and the argument values."""
    function: Value
    arguments: List[Value]
@dataclass(frozen=True)
class VariableDeclarationNode(ASTNode):
    """A variable declaration: name, modifiers, type and assigned value."""
    name: IdentifierToken
    # declaration modifiers as strings, e.g. 'const'
    modifiers: List[str]
    type: Type
    value: Value
@dataclass(frozen=True)
class ForLoopNode(ASTNode):
    """A ``for <name> in <value> { ... }`` loop."""
    variable_name: IdentifierToken | str
    # the value after 'in'
    iterator: Value
    # the nodes of the loop body
    body: List[ASTNode]
@dataclass(frozen=True)
class SpreadOperatorNode(ASTNode):
    """The two operands of a ``..`` expression."""
    left_side: Value
    right_side: Value
    type: Type | None
@dataclass(frozen=True)
class IntegerImmediateNode(ASTNode):
    """An integer literal: its value, its type and where it was written."""
    value: int
    type: Type
    span: Span
@dataclass(frozen=True)
class StringImmediateNode(ASTNode):
    """A string literal: its value, its type and where it was written."""
    value: str
    type: Type
    span: Span
@dataclass(frozen=True)
class VariableNameNode(ASTNode):
    """A use of a name inside an expression."""
    name: IdentifierToken
@dataclass(frozen=True)
class BracketetExpressionNode(ASTNode):
    """Wraps a single node. NOTE(review): presumably an expression written in parentheses — confirm."""
    content: ASTNode
@dataclass(frozen=True)
class UseStatement(ASTNode):
    """A ``use`` statement, e.g. ``use std.String``."""
    # the identifiers that follow the 'use' keyword, in order
    path: List[IdentifierToken]
class Parser:
"""
This class takes a lexed input and produces a syntax tree.
This only validates syntax, but does no type checking etc...
"""
types: Dict[str, Type]
lexer: Lexer
tokens: ParserIterator[Token]
def __init__(self, lexer: Lexer):
self.variables = dict()
self.lexer = lexer
# strip comments from tokens
self.tokens = ParserIterator(lexer.do_parse())
def parse(self):
body = []
while True:
thing = self.parse_file_level_block()
if thing is None:
return body
body.append(thing)
print(thing)
def consume_next_token(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
if not isinstance(types, set) and types is not None:
types = {types}
peeked = self.tokens.peek()
if peeked is None:
raise EndOfInputError(self.tokens.last_item.span, content)
if types is not None and peeked.kind not in types:
raise InvalidTokenError(peeked, (*types, content), msg)
if content is not None and peeked.content != content:
raise InvalidTokenError(peeked, {content}, msg)
return self.tokens.next()
def consume_optional_eol(self):
"""
This function tries to consume EOL tokens, if they are available
"""
while self.tokens.peek().kind == TokenType.EOL:
self.tokens.next()
def consume_expected_eol(self, msg):
"""
This function consumes at least one EOL token, or fails
"""
if self.tokens.peek().kind != TokenType.EOL:
raise InvalidTokenError(self.tokens.peek(), expected_type=["\\n"], message=msg)
while self.tokens.peek().kind == TokenType.EOL:
self.tokens.next()
def consume_optional(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
try:
return self.consume_next_token(types, content, msg)
except InvalidTokenError:
return False
def parse_file_level_block(self) -> ASTNode | None:
"""
File-level blocks are statements written at the file level.
"""
# this part ignores newlines!
prev_ignore_lvl = self.tokens.ignore_newline
self.tokens.ignore_newline = True
try:
match self.tokens.peek():
case KeywordToken(content="function"):
return self.parse_function_definition()
case KeywordToken(content="struct"):
return self.parse_struct()
case KeywordToken(content="const"):
return self.parse_const_declaration()
case KeywordToken(content="use"):
return self.parse_import_statement()
case Token(kind=TokenType.EOI):
return None
case None:
raise Exception("Unexpected None token!")
case unknown_token:
raise InvalidTokenError(unknown_token, ("function", "struct"), "Only function and struct declarations are allowed at file-level!")
finally:
self.tokens.ignore_newline = prev_ignore_lvl
def parse_import_statement(self):
"""
parse an import-equivalent statement:
use std.String
"""
self.consume_next_token(types=TokenType.Keyword, content="use")
path = []
if self.tokens.peek().kind == TokenType.String:
raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!")
prev = self.tokens.ignore_newline
self.tokens.ignore_newline = False
while self.tokens.peek().kind != TokenType.EOL:
path.append(self.consume_next_token(TokenType.Identifier))
if self.tokens.peek().content == '.':
self.consume_next_token(types=TokenType.Operator, content='.')
self.consume_expected_eol("'use' statement must be terminated by EOL!")
self.tokens.ignore_newline = prev
return UseStatement(path)
def parse_basic_block(self) -> Iterable[ASTNode]:
"""
A "Basic Block" is a block inside a function, for loop, etc.
"""
# when parsing blocks, newlines are important!
prev_ignore_lvl = self.tokens.ignore_newline
self.tokens.ignore_newline = False
if prev_ignore_lvl:
# consume all remaining EOLs
self.consume_optional_eol()
try:
while True:
match self.tokens.peek():
case KeywordToken(content="function"):
yield self.parse_function_definition()
case KeywordToken(content="const"):
yield self.parse_const_declaration()
case KeywordToken(content="let"):
raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet")
case KeywordToken(content="for"):
yield self.parse_for_statement()
case KeywordToken(content="return"):
raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet")
case KeywordToken(content="if"):
raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet")
case KeywordToken(content="struct"):
# TODO: support
raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet")
case Token(kind=TokenType.RBracket, content="}"):
break
case other:
yield self.parse_value()
self.consume_expected_eol(msg="Only one statement per line permitted!")
finally:
self.tokens.ignore_newline = prev_ignore_lvl
def parse_function_definition(self):
"""
Parses a function definition including the body
"""
self.tokens.next()
function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by identifier!")
# consume parenthesis
self.consume_next_token(types = TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parenthesis!")
args = []
# TODO: we actually want to match against Token(kind=TokenType.RParen, content=")")
while self.tokens.peek().content != ')':
args.append(self.parse_function_def_arg())
self.consume_optional_eol()
if not self.consume_optional(content=','):
break
self.consume_optional_eol()
self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!")
if self.tokens.peek().content == '->':
raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!")
if self.tokens.peek().content == '=>':
raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!")
self.consume_next_token(types=TokenType.LBracket, content="{")
content = list(self.parse_basic_block())
self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!")
return FunctionNode(function_name, args, None, content)
def parse_function_def_arg(self) -> FunctionArgument:
"""
Parse a single argument of a function.
Currently this allows name: type
In the future we want to also support name: type = value
"""
identifier = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")
self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")
type = self.parse_type()
if self.tokens.peek().content == '=':
raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")
return FunctionArgument(identifier, type)
def parse_type(self) -> Type:
"""
Parse a type declaration, such as String, i64, or Vector<i64>
"""
main_type = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")
# if this type does not wrap any other types, we are done!
if self.tokens.peek().content != '<':
return Type(main_type, [])
wrapped_types = []
start_token = self.consume_next_token(content="<")
while self.tokens.peek().content != '>':
wrapped_types.append(self.parse_type())
if not self.consume_optional(content=","):
break
self.consume_optional_eol()
self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")
if len(wrapped_types) == 0:
print_warning(self.tokens.last_item.span.union(start_token.span), "Empty set of type arguments!")
return Type(main_type, wrapped_types)
def parse_const_declaration(self):
"""
parse a const declaration, so basically
const name: type = value
"""
self.consume_next_token(types=TokenType.Keyword, content="const")
identifier = self.consume_next_token(types=TokenType.Identifier, msg="const keywords must be immediately followed by a variable name!")
type = None
if self.tokens.peek().content == ':':
self.consume_next_token(content=':')
type = self.parse_type()
self.consume_next_token(content='=', msg="Expected '=' in const declaration!")
value = self.parse_value()
self.consume_expected_eol("Const declaration statement must be terminated by a newline!")
return VariableDeclarationNode(identifier, ['const'], type, Value(None, value))
def parse_value(self) -> ASTNode:
"""
This function parses a "value", so basically any statement that evaluates to a value
This can be a literal, a function call, an array/struct constructor, etc.
"""
# handle bracketet expression
if self.tokens.peek().content == '(':
self.consume_next_token(content='(')
self.consume_optional_eol()
value = self.parse_value()
self.consume_optional_eol()
self.consume_next_token(content=')', msg="Expected closing bracket")
value = self._inner_parse_value()
match self.tokens.peek():
case OperatorToken(content='..'):
self.consume_next_token(types=TokenType.Operator, content="..")
right_hand_type = self.parse_type()
return SpreadOperatorNode(value, right_hand_type, None)
#case OperatorToken(content):
# raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
case Token(kind=TokenType.LBracket, content="("):
self.consume_next_token(content="(")
self.consume_optional_eol()
args = list(self.parse_inner_function_call_args())
self.consume_next_token(content=')', msg="")
return FunctionCallNode(value, args)
def _inner_parse_value(self) -> ASTNode:
match self.tokens.peek():
case IntegerLiteralToken(value, suffix, span):
if suffix:
type = Type(suffix, [])
else:
# assume widest signed integer type available
type = Type('i64', [])
self.consume_next_token()
return IntegerImmediateNode(value, type, span)
case Token(span, content, kind=TokenType.String):
self.consume_next_token()
return StringImmediateNode(content, Type("String", []), span)
case Token(content="{", kind=TokenType.LBracket):
return self.parse_structured_value()
case IdentifierToken():
return VariableNameNode(self.consume_next_token(TokenType.Identifier))
case other:
raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")
def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
if self.tokens.peek().content == ')':
return
while True:
self.consume_optional_eol()
yield self.parse_value()
self.consume_optional_eol()
if self.tokens.peek().content == ',':
self.consume_next_token(content=",")
continue
if self.tokens.peek().content == ')':
break
def parse_structured_value(self) -> ASTNode:
"""
parse either a list or struct initializer:
list initializer:
const data: Vector<i64> = {1,2,3,4,5}
struct initializer:
const data: MyStruct = {
field1: 1
field2: "Hello World"
arrayField: {"test", "123", "these are strings"}
}
"""
raise UnsupportedSyntaxError(self.tokens.peek(), "Structured values such as lists, dictionaries and structs!")
def parse_for_statement(self) -> ForLoopNode:
self.consume_next_token(content='for', types=TokenType.Keyword)
loop_variable_name = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")
self.consume_next_token(types=TokenType.Keyword, content="in", msg="for <name> in <value> format required!")
iterator = self.parse_value()
self.consume_next_token(content='{')
self.consume_optional_eol()
body = list(self.parse_basic_block())
self.consume_optional_eol()
self.consume_next_token(content='}')
self.consume_optional_eol()
return ForLoopNode(loop_variable_name, iterator, body)

21
example.pmp Normal file
View File

@ -0,0 +1,21 @@
/*struct something {
i64 i
Array<i64> data
private i64 total
}*/
use std.string
// Example entry function: declares the constant `cars`, then a `for ... in`
// loop over `0x01..cars` whose body calls print() with a format string.
// NOTE(review): the trailing comma in the print() call looks deliberate (parser test input) - confirm.
function main(args: Array<String>) {
    const cars = 100
    for x in 0x01..cars {
        print("{} cars are driving around the block", cars,)
    }
}

View File

View File

@ -0,0 +1,2 @@
#pragma once

25
test.py Normal file
View File

@ -0,0 +1,25 @@
from compiler.errors import *
from compiler.lexer import Lexer, LexingContext
from compiler.parser import Parser
import os
# Lex and parse ./example.pmp; compiler errors are printed with their source context.
fname = os.path.abspath('./example.pmp')
c = LexingContext(dict(), fname, dict(), dict())

# read the source with an explicit encoding so that the result does not
# depend on the platform's default
with open(fname, 'r', encoding='utf-8') as f:
    c.sources[fname] = f.read()

try:
    lex = Lexer(fname, c)
    a = Parser(lex)
    elems = a.parse()
except CompilerError as err:
    err.print_context_message()

25
zipfs.pmp Normal file
View File

@ -0,0 +1,25 @@
use std.random randint
const ALPHABET = 26
// NOTE(review): uses `map ... into`, `let`, `while` and `yield`; presumably a
// sketch of planned syntax - confirm which of these constructs are supported.
function main() {
    // generate a lot of words
    map const x in 0..10000 into words {
        let word = 0
        let len = 0
        let char = randint(ALPHABET)
        while char != 0 {
            word *= ALPHABET
            word += char
            len++
            char = randint(ALPHABET)
        }
        yield word
    }
}