parser is working with very basic functionality
This commit is contained in:
commit
525ef8f467
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
venv
|
||||
__pycache__
|
0
compiler/__init__.py
Normal file
0
compiler/__init__.py
Normal file
0
compiler/ast_printer.py
Normal file
0
compiler/ast_printer.py
Normal file
213
compiler/defs.py
Normal file
213
compiler/defs.py
Normal file
@ -0,0 +1,213 @@
|
||||
from dataclasses import dataclass, fields
|
||||
from mimetypes import suffix_map
|
||||
from ntpath import join
|
||||
from typing import Dict, List, Iterable, Generator, Any, Literal
|
||||
from enum import Enum, auto
|
||||
|
||||
|
||||
@dataclass
class LexingContext:
    """
    State shared by everything that works on a set of source texts.

    ``sources`` maps a source name to the complete text of that source; the
    helper methods below locate lines inside those texts.
    """
    sources: Dict[str, str]
    entrypoint: str  # presumably the name of the main source in `sources` - TODO confirm
    structs: Dict[int, Any]  # TODO: struct def type
    functions: Dict[str, Any]  # TODO: function types

    def get_nth_line_bounds(self, source_name: str, n: int):
        """
        Return ``(start, end)`` offsets of the n-th line (0-based) of a source.

        ``end`` is the offset of the terminating newline, or the length of the
        source when the line is the last one and has no trailing newline.
        Returns None when the source has fewer than ``n + 1`` lines.

        Raises KeyError if ``source_name`` is not a known source.
        """
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        source = self.sources[source_name]
        start = 0
        for _ in range(n):
            newline = source.find('\n', start)
            if newline == -1:
                return None
            start = newline + 1
        end = source.find('\n', start)
        if end == -1:
            # last line without trailing newline: it ends where the text ends
            end = len(source)
        return start, end

    def get_lines_containing(self, span: 'Span'):
        """
        Find the source lines that a span touches.

        Returns a tuple ``(lines, start, line_no)`` where ``lines`` is the list
        of line texts (without newlines), ``start`` is the offset of the first
        of those lines inside the source and ``line_no`` is its 1-based line
        number. Returns None only if the span starts beyond the end of the
        source.

        Raises KeyError if ``span.source_name`` is not a known source.
        """
        if span.source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(span.source_name))
        source = self.sources[span.source_name]
        size = len(source)
        start = 0
        line_no = 0
        while True:
            line_end = source.find('\n', start)
            line_no += 1
            if line_end == -1:
                # reached the last line, which has no trailing newline
                if span.start > size:
                    return None
                return [source[start:]], start, line_no
            # as long as the next newline comes before the span's start, skip the line
            if line_end < span.start:
                start = line_end + 1
                continue
            # the whole span sits on this one line
            if line_end >= span.end:
                return [source[start:line_end]], start, line_no
            # the span covers several lines: extend to the newline after its end
            while line_end < span.end:
                line_end = source.find('\n', line_end + 1)
                if line_end == -1:
                    # span runs into a final line without trailing newline
                    line_end = size
                    break
            return source[start:line_end].split('\n'), start, line_no
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Span:
    """
    A region ``[start, end)`` inside one named source of a lexing context.
    """
    # Start of the token's location, as a global offset into the source text
    start: int
    # End of the token's location, as a global offset into the source text
    end: int
    source_name: str
    context: 'LexingContext'

    def union(self, *spans: 'Span'):
        """
        Return the smallest span covering this span and all given spans.

        All spans must belong to the same source and context. Calling it
        without arguments returns a span equal to this one.
        """
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        # wrap the candidates in a tuple so that min()/max() also work when
        # no other spans were passed
        return Span(
            start=min((self.start, *(span.start for span in spans))),
            end=max((self.end, *(span.end for span in spans))),
            source_name=self.source_name,
            context=self.context,
        )

    def transform(self, start: int = 0, end: int = 0):
        """
        Return a copy moved inwards: ``start`` is added to the start offset
        and ``end`` is subtracted from the end offset.
        """
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        # the context is left out on purpose, it would print every source text
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )
|
||||
|
||||
class TokenType(Enum):
    """All kinds of tokens a Token can be tagged with."""

    # words and literals
    Keyword = auto()
    Integer = auto()
    Float = auto()
    Identifier = auto()
    String = auto()

    # punctuation
    LBracket = auto()
    RBracket = auto()
    Operator = auto()

    # comments
    LineComment = auto()
    MultiComment = auto()

    # structural markers
    EOL = auto()  # End of Line
    EOI = auto()  # End of Input
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Token:
    """
    A single lexed token: where it is, what text it carries and its kind.

    Constructing a plain ``Token`` with a kind registered in
    ``token_type_to_subclass_map`` produces an instance of the registered
    subclass instead.
    """
    span: Span
    content: str
    kind: TokenType

    def __new__(cls, span: Span, content: str, kind: TokenType):
        # only redirect when Token itself is instantiated; subclasses that are
        # created directly (or through the redirect) are allocated normally
        if cls == Token and kind in token_type_to_subclass_map:
            target = token_type_to_subclass_map[kind]
            return target.__new__(target, span, content, kind)
        return super().__new__(cls)

    def __repr__(self):
        own_type = self.__class__
        shown = ", ".join(
            "{}={}".format(field.name, repr(getattr(self, field.name)))
            for field in fields(self)
            if field.name not in ('span', 'context', 'kind')
        )
        if own_type == Token:
            # plain tokens additionally print their kind in brackets
            return "{}[{}]({})".format(own_type.__name__, self.kind.name, shown)
        return "{}({})".format(own_type.__name__, shown)
|
||||
|
||||
|
||||
@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    """
    Token for an integer literal, with the literal already decoded.

    ``value`` is the numeric value, ``format`` names the notation the literal
    was written in and ``suffix`` is the integer type suffix (one of
    ``integer_type_suffixes``) or None when the literal has none.
    """
    value: int
    format: Literal['hex', 'bin', 'dec', 'oct']
    suffix: str | None

    def __init__(self, span: Span, content: str, kind: TokenType):
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer

        # split an optional type suffix off the digits; the digits must be
        # parsed WITHOUT the suffix, otherwise int() rejects e.g. "12i32"
        number_text = content
        suffix = None
        for candidate in integer_type_suffixes:
            if number_text.endswith(candidate):
                suffix = candidate
                number_text = number_text.removesuffix(candidate)
                break

        if number_text.startswith('0x') or number_text.startswith('0X'):
            notation = 'hex'
            value = int(number_text, 16)
        elif number_text.startswith('0b'):
            notation = 'bin'
            value = int(number_text, 2)
        else:
            notation = 'dec'
            value = int(number_text, 10)

        # the dataclass is frozen, so attributes are set via object.__setattr__
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", notation)
|
||||
|
||||
|
||||
@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    """Token subclass used for tokens of kind TokenType.Keyword."""


@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    """Token subclass used for tokens of kind TokenType.Operator."""


@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    """Token subclass used for tokens of kind TokenType.Identifier."""


# Token kinds that are represented by a dedicated Token subclass.
token_type_to_subclass_map = dict((
    (TokenType.Integer, IntegerLiteralToken),
    (TokenType.Keyword, KeywordToken),
    (TokenType.Operator, OperatorToken),
    (TokenType.Identifier, IdentifierToken),
))
|
||||
|
||||
|
||||
# Words that are lexed as keywords.
keywords = set(
    "const let for if function "
    "true false in not or and "
    "struct private public return "
    "impure while use do continue "
    "break".split()
)

# Characters allowed inside an integer literal, per notation.
digits = dict(
    bin='01_',
    hex='0123456789abcdefABCDEF_',
    dec='0123456789_',
    oct='01234567_',  # TODO: implement octal literals?
)

# All operator spellings.
operators = set(
    "+ - * / ! . , < > : << >> && || "
    "?? % == != <= >= .. => ++ -- "
    "= *= += /= -=".split()
)

# Type suffixes that may follow an integer literal.
integer_type_suffixes = {
    prefix + width
    for prefix in ('i', 'u')
    for width in ('8', '16', '32', '64')
}

# Special characters listed as reserved.
reserved_special_chars = set('#~`"\'@|;')

# Bracket characters, always as open/close pairs.
parens = '[]{}()<>'

# Everything that ends an identifier.
identifier_terminating_chars = (
    operators | set(parens) | set(' \n\t\r') | reserved_special_chars
)
|
90
compiler/errors.py
Normal file
90
compiler/errors.py
Normal file
@ -0,0 +1,90 @@
|
||||
from .defs import Span, LexingContext, Token, TokenType
|
||||
from math import exp, log10, ceil
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
|
||||
|
||||
def create_span_context_str(span: 'Span', message: str, color: str = '\033[31m'):
    """
    Render the source lines a span touches, with the span highlighted,
    underlined with carets and followed by a message.

    span:    the region to show; its context is asked for the lines.
    message: text printed below the excerpt (may contain newlines, may be empty).
    color:   ANSI escape sequence used for highlight, underline and message.

    Returns the rendered text, terminated by an ANSI reset sequence. If the
    context cannot locate the span, only the file name and message are rendered.
    """
    reset = '\033[0m'

    located = span.context.get_lines_containing(span)
    if located is None:
        # the span cannot be shown in context, still report where and what
        fallback = ">>> In file {}\n".format(span.source_name)
        if message:
            fallback += color + "".join(line + '\n' for line in message.split("\n"))
        return fallback + reset

    lines, offset_into_file, line_no = located
    relative_offset = span.start - offset_into_file
    annotation_len = span.end - span.start

    # width of the widest line number that gets printed
    digit_len = max(1, ceil(log10(line_no + len(lines))))
    gutter = digit_len + 2  # line number plus ": "

    output_str = ">>> In file {}:{}\n".format(span.source_name, line_no)

    for i, raw_line in enumerate(lines):
        # all measurements use the raw line; the colour codes inserted below
        # must not count towards its length
        available = len(raw_line) - relative_offset
        marked = max(0, min(annotation_len, available))
        mark_end = relative_offset + marked

        shown_line = (
            raw_line[:relative_offset] + color
            + raw_line[relative_offset:mark_end] + reset
            + raw_line[mark_end:]
        )
        output_str += '{:>{}d}: {}\n'.format(line_no + i, digit_len, shown_line)

        if available < 0:
            # the span starts beyond the end of this line, nothing to underline
            continue

        output_str += "{}{}{}{}\n".format(
            color,
            ' ' * (relative_offset + gutter),
            '^' * marked,
            reset
        )

        if annotation_len > available:
            # the span continues on the next line: drop what was marked here
            # plus the newline character, and restart at column 0
            annotation_len = max(0, annotation_len - (available + 1))
            relative_offset = 0

    if message:
        indent = ' ' * (relative_offset + gutter)
        output_str += color
        output_str += indent + '|\n'
        for message_line in message.split("\n"):
            output_str += indent + message_line + '\n'

    return output_str + reset
|
||||
|
||||
def print_warning(span: Span, message: str, color="\033[33m"):
    """Print the source excerpt for ``span`` with ``message`` prefixed by "Warning: "."""
    warning_text = "Warning: " + message
    rendered = create_span_context_str(span, warning_text, color)
    print(rendered)
|
||||
|
||||
|
||||
class CompilerError(Exception):
    """
    Base class of the errors raised in this module.

    Carries the message and, optionally, the span the error refers to.
    """
    span: Span
    message: str

    def __init__(self, msg: str, span: Span=None) -> None:
        super().__init__((msg, span))
        self.message = msg
        self.span = span

    def print_context_message(self):
        """
        Print the error: with a source excerpt when a span is known,
        otherwise each message line prefixed by ">>> ".
        """
        if self.span:
            print(create_span_context_str(self.span, self.message))
            return
        prefixed = [">>> {}".format(line) for line in self.message.split('\n')]
        print("\n".join(prefixed))
|
||||
|
||||
|
||||
class EndOfInputError(CompilerError):
    """Raised when the input ends while more was expected."""

    def __init__(self,span: Span, search_str:str = None) -> None:
        # mention what was being looked for only when the caller told us
        if search_str:
            text = f"Unexpected end-of-input in {span.source_name} while scanning for {search_str}!"
        else:
            text = f"Unexpected end-of-input in {span.source_name}!"
        super().__init__(text, span)
|
||||
|
||||
class ParseError(CompilerError):
    """Error carrying a free-form message and an optional span."""

    def __init__(self, msg: str, span: Span = None) -> None:
        CompilerError.__init__(self, msg, span)
|
||||
|
||||
class InvalidTokenError(CompilerError):
    """
    Raised when a token other than the expected one is encountered.

    token:         the offending token (may be None, then no span is attached).
    expected_type: token kinds and/or literal texts that would have been
                   accepted; None entries are ignored.
    message:       optional extra explanation, appended on its own line.
    """

    def __init__(self, token: Token, expected_type: Iterable[str | TokenType] = None, message: str = None) -> None:
        # callers sometimes pass "no specific content" as None inside the
        # collection; showing that as "None" in the message is misleading
        wanted = [f"{x}" for x in (expected_type or ()) if x is not None]
        expected = ", expected {}".format(", ".join(wanted)) if wanted else ""

        super().__init__("Unexpected token {}{} {}".format(
            token, expected, '\n' + message if message else ""
        ), token.span if token is not None else None)
|
||||
|
||||
|
||||
class UnsupportedSyntaxError(CompilerError):
    """Raised for syntax that is recognised but not implemented."""

    def __init__(self, token: Token, feature: str) -> None:
        description = "Unsupported syntax: {}".format(feature)
        super().__init__(description, token.span)
|
||||
|
74
compiler/helpers.py
Normal file
74
compiler/helpers.py
Normal file
@ -0,0 +1,74 @@
|
||||
from typing import TypeVar, Generic, Iterable, Iterator, List
|
||||
from .defs import Token, TokenType
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
class PeekableIterator(Iterator[T]):
|
||||
_peeked: List[T]
|
||||
last_item: T | None
|
||||
|
||||
def __init__(self, iterable: Iterable[T]) -> None:
|
||||
self.iter = iterable
|
||||
self._peeked = list()
|
||||
self.last_item = None
|
||||
|
||||
def peek(self, offset: int = 0):
|
||||
while len(self._peeked) <= offset:
|
||||
try:
|
||||
self._peeked.append(next(self.iter))
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
return self._peeked[offset]
|
||||
|
||||
def __next__(self) -> T:
|
||||
if len(self._peeked) > 0:
|
||||
item = self._peeked.pop(0)
|
||||
else:
|
||||
item = next(self.iter)
|
||||
self.last_item = item
|
||||
return item
|
||||
|
||||
def __iter__(self) -> Iterator[T]:
|
||||
return self
|
||||
|
||||
def next(self) -> T:
|
||||
try:
|
||||
return next(self)
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
def has_next(self):
|
||||
return self.peek() is not None
|
||||
|
||||
|
||||
class ParserIterator(PeekableIterator[Token]):
    """
    Token iterator for the parser.

    Comment tokens are dropped on construction. While ``ignore_newline`` is
    True, EOL tokens are skipped by both ``peek`` and ``__next__``.
    """

    def __init__(self, iterable: Iterable[Token]) -> None:
        super().__init__(t for t in iterable if t.kind not in (TokenType.LineComment, TokenType.MultiComment))
        self.ignore_newline = False

    def peek(self, offset: int = 0):
        """
        Return the token ``offset`` positions ahead without consuming it, or
        None if the input ends first.

        With ``ignore_newline`` set, EOL tokens do not count as positions, so
        ``peek(n)`` is always the token that the (n+1)-th call of ``next``
        would return.
        """
        index = 0
        remaining = offset
        while True:
            # make sure the buffer reaches up to `index`
            while len(self._peeked) <= index:
                try:
                    self._peeked.append(next(self.iter))
                except StopIteration:
                    return None

            token = self._peeked[index]
            index += 1

            if self.ignore_newline and token.kind == TokenType.EOL:
                # skipped tokens do not use up the requested offset
                continue
            if remaining == 0:
                return token
            remaining -= 1

    def __next__(self) -> T:
        # a loop instead of recursion, so that long runs of blank lines
        # cannot exhaust the recursion limit
        while True:
            if self._peeked:
                item = self._peeked.pop(0)
            else:
                item = next(self.iter)
            self.last_item = item

            if self.ignore_newline and item.kind == TokenType.EOL:
                continue
            return item
|
201
compiler/lexer.py
Normal file
201
compiler/lexer.py
Normal file
@ -0,0 +1,201 @@
|
||||
from ast import operator
|
||||
from dataclasses import dataclass
|
||||
from sre_parse import parse_template
|
||||
from typing import Dict, List, Iterable, Generator, Tuple
|
||||
from enum import Enum, auto
|
||||
from unicodedata import digit
|
||||
from .errors import EndOfInputError, ParseError
|
||||
from .defs import Span, LexingContext, Token, TokenType, digits, keywords, operators, integer_type_suffixes, parens, identifier_terminating_chars
|
||||
|
||||
|
||||
class Lexer:
    """
    Splits the text of one source into tokens.

    The text is taken from ``context.sources[fname]``. ``pos`` is the index
    of the next unread character and ``size`` the length of the text.
    """
    separators = ':-+*/#!"\'=?%&<>[]{}()\n \t'
    parens = '[]{}()<>'

    # Escape sequences understood inside string literals. Any other escaped
    # character is kept verbatim, including its backslash.
    string_escapes = {
        'r': '\r',
        'b': '\b',
        'n': '\n',
        't': '\t',
        'e': '\033',  # support terminal escape codes
        '\\': '\\',
        '"': '"',
        '\'': '\'',
    }

    context: LexingContext
    content: str
    pos: int
    line: int
    fname: str
    size: int

    def __init__(self, fname: str, context: LexingContext):
        self.content = context.sources[fname]
        self.fname = fname
        self.pos = self.line = 0
        self.word = ""
        self.size = len(self.content)
        self.context = context

    def peek(self, offset: int = 0):
        """Return the character ``offset`` positions ahead, or None at end of input."""
        if self.pos + offset >= self.size:
            return None
        return self.content[self.pos + offset]

    def startswith(self, *patterns: str, offset: int = 0):
        """
        Return the longest of ``patterns`` that the unread input starts with
        (looking ``offset`` characters ahead), or False if none matches.
        """
        # match longest first
        for pattern in sorted(patterns, key=len, reverse=True):
            if self.content.startswith(pattern, self.pos + offset):
                return pattern
        return False

    def _peek_in(self, chars: str, offset: int = 0) -> bool:
        """Return True if there is a next character and it is one of ``chars``."""
        char = self.peek(offset)
        return char is not None and char in chars

    def _at_word_end(self, offset: int = 0) -> bool:
        """Return True if the input ends at ``offset`` or an identifier cannot continue there."""
        char = self.peek(offset)
        return char is None or char in identifier_terminating_chars

    def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]:
        """
        Advance to the next occurrence of ``pattern``.

        Returns the text that was skipped and its span. With ``inclusive`` the
        pattern itself is consumed as well.

        Raises EndOfInputError if the pattern does not occur in the rest of the input.
        """
        start = self.pos
        pos = self.content.find(pattern, start)
        if pos == -1:
            raise EndOfInputError(Span(start, self.size, self.fname, self.context), pattern)

        if inclusive:
            pos += len(pattern)
        self.pos = pos

        return self.content[start:pos], Span(start, pos, self.fname, self.context)

    def do_parse(self) -> Iterable[Token]:
        """
        Generate the tokens of the source, ending with one EOI token.

        Raises ParseError / EndOfInputError on input that cannot be tokenized.
        """
        while True:
            c = self.peek()
            # reached end of input
            if c is None:
                yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI)
                break

            if c in '\n\r':
                start = self.pos
                # "\r\n" counts as a single line break
                self.pos += 2 if self.startswith('\r\n') else 1
                yield self.Token(start, TokenType.EOL)
                continue

            # check for integer literals
            if self.startswith('0x', '0X', '0b', *'0123456789'):
                yield self.parse_integer()
                continue

            # check for parenthesis
            if c in parens:
                # left parens at position 0, 2, 4, 6
                left_paren = parens.index(c) % 2 == 0
                self.pos += 1
                yield self.Token(self.pos - 1, TokenType.LBracket if left_paren else TokenType.RBracket)
                continue

            if self.startswith('//'):
                start = self.pos
                # the comment runs up to (not including) the newline; a comment
                # on the very last line simply ends with the input
                newline = self.content.find('\n', start)
                self.pos = self.size if newline == -1 else newline
                yield self.Token(start, TokenType.LineComment)
                continue

            if self.startswith('/*'):
                start = self.pos
                self.read_until('*/')
                yield self.Token(start, TokenType.MultiComment)
                continue

            # a keyword only counts if the word ends right after it, otherwise
            # identifiers such as "format" would be split into "for" + "mat"
            keyword = self.startswith(*keywords)
            if keyword and self._at_word_end(len(keyword)):
                start = self.pos
                self.pos += len(keyword)
                yield self.Token(start, TokenType.Keyword)
                self.consume_expected_whitespace()
                continue

            matched_operator = self.startswith(*operators)
            if matched_operator:
                start = self.pos
                self.pos += len(matched_operator)
                yield self.Token(start, TokenType.Operator)
                continue

            if c in '"\'':
                yield self.parse_string()
                continue

            if c in ' \t':
                self.pos += 1
                continue

            # must be an identifier then
            start = self.pos
            while not self._at_word_end():
                self.pos += 1
            if start == self.pos:
                raise ParseError("Expected identifier!", Span(start, start+1, self.fname, self.context))
            yield self.Token(start, TokenType.Identifier)

    def consume_expected_whitespace(self):
        """
        Skip spaces and tabs. A line break or the end of input is accepted
        as well (and left in place).

        Raises ParseError if the next character is anything else.
        """
        next_char = self.peek()
        if next_char is None or next_char in '\r\n':
            return
        if next_char not in '\t ':
            raise ParseError("Expected whitespace here", Span(self.pos, self.pos+1, self.fname, self.context))
        while self._peek_in('\t '):
            self.pos += 1

    def parse_integer(self):
        """Consume an integer literal (optional prefix, digits, optional type suffix)."""
        start = self.pos

        if self.startswith('-'):
            self.pos += 1

        parse_type = 'dec'
        if self.startswith('0x', '0X'):
            parse_type = 'hex'
            self.pos += 2
        elif self.startswith('0b'):
            parse_type = 'bin'
            self.pos += 2

        while self._peek_in(digits[parse_type]):
            self.pos += 1

        suffix = self.startswith(*integer_type_suffixes)
        if suffix:
            self.pos += len(suffix)

        return self.Token(start, TokenType.Integer)

    def parse_string(self):
        """
        Consume a string literal delimited by the quote character at the
        current position and return a String token whose content is the
        decoded text (escape sequences resolved, quotes removed).

        Raises EndOfInputError if the closing quote is missing.
        """
        start = self.pos
        terminator = self.peek()
        self.pos += 1

        pieces = []
        escaped = False
        while True:
            char = self.peek()
            if char is None:
                raise EndOfInputError(Span(start, self.pos, self.fname, self.context), terminator)

            if escaped:
                # the character after a backslash never terminates the string
                pieces.append(self.string_escapes.get(char, '\\' + char))
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == terminator:
                break
            else:
                pieces.append(char)
            self.pos += 1

        # consume trailing terminator
        self.pos += 1

        return self.Token(start, TokenType.String, content="".join(pieces))

    def Token(self, start: int, type: TokenType, end=None, content=None) -> Token:
        """
        Build a token spanning ``start`` to ``end`` (default: current position)
        whose content defaults to the source text of that range.
        """
        if end is None:
            end = self.pos
        if content is None:
            content = self.content[start:end]
        return Token(Span(start, end, self.fname, self.context), content, type)
|
||||
|
466
compiler/parser.py
Normal file
466
compiler/parser.py
Normal file
@ -0,0 +1,466 @@
|
||||
from dataclasses import dataclass
|
||||
import imp
|
||||
from webbrowser import Opera
|
||||
from .defs import Token, IntegerLiteralToken, TokenType, OperatorToken, KeywordToken, IdentifierToken, Span
|
||||
from .lexer import Lexer
|
||||
from .helpers import ParserIterator
|
||||
from .errors import CompilerError, EndOfInputError, InvalidTokenError, UnsupportedSyntaxError, print_warning
|
||||
|
||||
from typing import Tuple, Optional, List, Dict, Set, Iterable
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Type:
    """A type reference: its name plus the types it wraps (type arguments)."""
    name: IdentifierToken | str
    wraps: Tuple['Type', ...]


@dataclass(frozen=True)
class Value:
    """An AST node paired with an optional type."""
    type: Optional[Type]
    value: 'ASTNode'


@dataclass(frozen=True)
class FunctionArgument:
    """One declared argument of a function: name and type."""
    name: IdentifierToken
    type: Type
    #default_value: Value | None


@dataclass(frozen=True)
class ASTNode:
    """Common base class of all syntax tree nodes."""


@dataclass(frozen=True)
class FunctionNode(ASTNode):
    """A function definition including its body."""
    name: IdentifierToken
    args: Tuple[FunctionArgument, ...]
    return_type: Type
    contents: Tuple[ASTNode, ...]


@dataclass(frozen=True)
class FunctionCallNode(ASTNode):
    """A call of ``function`` with ``arguments``."""
    function: Value
    arguments: List[Value]


@dataclass(frozen=True)
class VariableDeclarationNode(ASTNode):
    """Declaration of a named variable with modifiers, type and value."""
    name: IdentifierToken
    modifiers: List[str]
    type: Type
    value: Value


@dataclass(frozen=True)
class ForLoopNode(ASTNode):
    """A for loop: loop variable, the iterated value and the body."""
    variable_name: IdentifierToken | str
    iterator: Value
    body: List[ASTNode]


@dataclass(frozen=True)
class SpreadOperatorNode(ASTNode):
    """The two sides of a '..' expression."""
    left_side: Value
    right_side: Value
    type: Type | None


@dataclass(frozen=True)
class IntegerImmediateNode(ASTNode):
    """An integer literal with its type and source location."""
    value: int
    type: Type
    span: Span


@dataclass(frozen=True)
class StringImmediateNode(ASTNode):
    """A string literal with its type and source location."""
    value: str
    type: Type
    span: Span


@dataclass(frozen=True)
class VariableNameNode(ASTNode):
    """A reference to a name."""
    name: IdentifierToken


@dataclass(frozen=True)
class BracketetExpressionNode(ASTNode):
    """An expression that was written inside brackets."""
    content: ASTNode


@dataclass(frozen=True)
class UseStatement(ASTNode):
    """A 'use' statement; ``path`` holds the identifiers of the used path."""
    path: List[IdentifierToken]
|
||||
|
||||
|
||||
class Parser:
|
||||
"""
|
||||
This class takes a lexed input and produces a syntax tree.
|
||||
|
||||
This only validates syntax, but does no type checking etc...
|
||||
"""
|
||||
types: Dict[str, Type]
|
||||
|
||||
lexer: Lexer
|
||||
tokens: ParserIterator[Token]
|
||||
|
||||
def __init__(self, lexer: Lexer):
    """Set up the parser on top of ``lexer``'s token stream."""
    self.lexer = lexer
    self.variables = {}
    # ParserIterator strips comments from the tokens
    token_stream = lexer.do_parse()
    self.tokens = ParserIterator(token_stream)
|
||||
|
||||
def parse(self):
|
||||
body = []
|
||||
|
||||
while True:
|
||||
thing = self.parse_file_level_block()
|
||||
if thing is None:
|
||||
return body
|
||||
body.append(thing)
|
||||
print(thing)
|
||||
|
||||
|
||||
def consume_next_token(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
    """
    Consume and return the next token, optionally validating it first.

    types:   a token kind or set of kinds the token must have (None: any).
    content: exact text the token must have (None: any).
    msg:     extra explanation attached to the error on mismatch.

    Raises EndOfInputError if no token is left and InvalidTokenError if the
    next token does not match; in both cases nothing is consumed.
    """
    if types is not None and not isinstance(types, set):
        types = {types}

    peeked = self.tokens.peek()
    if peeked is None:
        raise EndOfInputError(self.tokens.last_item.span, content)
    if types is not None and peeked.kind not in types:
        # only list the content as expected when one was actually requested
        expected = (*types, content) if content is not None else tuple(types)
        raise InvalidTokenError(peeked, expected, msg)
    if content is not None and peeked.content != content:
        raise InvalidTokenError(peeked, {content}, msg)

    return self.tokens.next()
|
||||
|
||||
def consume_optional_eol(self):
|
||||
"""
|
||||
This function tries to consume EOL tokens, if they are available
|
||||
"""
|
||||
while self.tokens.peek().kind == TokenType.EOL:
|
||||
self.tokens.next()
|
||||
|
||||
def consume_expected_eol(self, msg):
    """
    This function consumes at least one EOL token, or fails.

    Raises InvalidTokenError (with ``msg`` as explanation) if the next token
    is not an EOL or no token is left.
    """
    peeked = self.tokens.peek()
    # peek() returns None once the token stream is exhausted
    if peeked is None or peeked.kind != TokenType.EOL:
        raise InvalidTokenError(peeked, expected_type=["\\n"], message=msg)
    while (peeked := self.tokens.peek()) is not None and peeked.kind == TokenType.EOL:
        self.tokens.next()
|
||||
|
||||
|
||||
|
||||
def consume_optional(self, types:Set[TokenType]|TokenType = None, content: str = None, msg: str = None):
    """
    Like consume_next_token, but a non-matching token is not an error:
    the token is left in place and False is returned.
    """
    try:
        consumed = self.consume_next_token(types, content, msg)
    except InvalidTokenError:
        consumed = False
    return consumed
|
||||
|
||||
|
||||
def parse_file_level_block(self) -> ASTNode | None:
|
||||
"""
|
||||
File-level blocks are statements written at the file level.
|
||||
|
||||
|
||||
"""
|
||||
# this part ignores newlines!
|
||||
prev_ignore_lvl = self.tokens.ignore_newline
|
||||
self.tokens.ignore_newline = True
|
||||
|
||||
try:
|
||||
match self.tokens.peek():
|
||||
case KeywordToken(content="function"):
|
||||
return self.parse_function_definition()
|
||||
case KeywordToken(content="struct"):
|
||||
return self.parse_struct()
|
||||
case KeywordToken(content="const"):
|
||||
return self.parse_const_declaration()
|
||||
case KeywordToken(content="use"):
|
||||
return self.parse_import_statement()
|
||||
case Token(kind=TokenType.EOI):
|
||||
return None
|
||||
case None:
|
||||
raise Exception("Unexpected None token!")
|
||||
case unknown_token:
|
||||
raise InvalidTokenError(unknown_token, ("function", "struct"), "Only function and struct declarations are allowed at file-level!")
|
||||
finally:
|
||||
self.tokens.ignore_newline = prev_ignore_lvl
|
||||
|
||||
|
||||
def parse_import_statement(self):
    """
    parse an import-equivalent statement:

        use std.String

    Returns a UseStatement holding the identifiers of the path.
    Raises UnsupportedSyntaxError if a string follows the keyword.
    """
    self.consume_next_token(types=TokenType.Keyword, content="use")

    path = []

    if self.tokens.peek().kind == TokenType.String:
        raise UnsupportedSyntaxError(self.tokens.peek(), "file paths in use statements!")

    # the statement ends at the newline, so newlines must be visible here
    prev = self.tokens.ignore_newline
    self.tokens.ignore_newline = False

    try:
        while self.tokens.peek().kind != TokenType.EOL:
            path.append(self.consume_next_token(TokenType.Identifier))
            if self.tokens.peek().content == '.':
                self.consume_next_token(types=TokenType.Operator, content='.')

        self.consume_expected_eol("'use' statement must be terminated by EOL!")
    finally:
        # restore the caller's newline handling even if parsing failed
        self.tokens.ignore_newline = prev

    return UseStatement(path)
|
||||
|
||||
|
||||
|
||||
def parse_basic_block(self) -> Iterable[ASTNode]:
|
||||
"""
|
||||
A "Basic Block" is a block inside a function, for loop, etc.
|
||||
"""
|
||||
# when parsing blocks, newlines are important!
|
||||
prev_ignore_lvl = self.tokens.ignore_newline
|
||||
|
||||
self.tokens.ignore_newline = False
|
||||
|
||||
if prev_ignore_lvl:
|
||||
# consume all remaining EOLs
|
||||
self.consume_optional_eol()
|
||||
|
||||
try:
|
||||
while True:
|
||||
match self.tokens.peek():
|
||||
case KeywordToken(content="function"):
|
||||
yield self.parse_function_definition()
|
||||
case KeywordToken(content="const"):
|
||||
yield self.parse_const_declaration()
|
||||
case KeywordToken(content="let"):
|
||||
raise UnsupportedSyntaxError(self.tokens.peek(), "'let' not supported yet")
|
||||
case KeywordToken(content="for"):
|
||||
yield self.parse_for_statement()
|
||||
case KeywordToken(content="return"):
|
||||
raise UnsupportedSyntaxError(self.tokens.peek(), "'return' not supported yet")
|
||||
case KeywordToken(content="if"):
|
||||
raise UnsupportedSyntaxError(self.tokens.peek(), "'if' not supported yet")
|
||||
case KeywordToken(content="struct"):
|
||||
# TODO: support
|
||||
raise UnsupportedSyntaxError(self.tokens.peek(), "structs not supported yet")
|
||||
case Token(kind=TokenType.RBracket, content="}"):
|
||||
break
|
||||
case other:
|
||||
yield self.parse_value()
|
||||
self.consume_expected_eol(msg="Only one statement per line permitted!")
|
||||
finally:
|
||||
self.tokens.ignore_newline = prev_ignore_lvl
|
||||
|
||||
|
||||
|
||||
def parse_function_definition(self):
    """
    Parses a function definition including the body:

        function name(arg: type, ...) { body }

    Returns a FunctionNode (its return type is always None for now).
    Raises UnsupportedSyntaxError for '->' and '=>' after the argument list.
    """
    # validate the keyword like the other statement parsers do, instead of
    # dropping whatever token happens to come next
    self.consume_next_token(types=TokenType.Keyword, content="function")

    function_name = self.consume_next_token(types=TokenType.Identifier, msg="'function' keyword must be followed by identifier!")

    # consume parenthesis
    self.consume_next_token(types = TokenType.LBracket, content="(", msg="A function declaration must contain a list of arguments enclosed in parenthesis!")

    args = []

    # TODO: we actually want to match against Token(kind=TokenType.RParen, content=")")
    while self.tokens.peek().content != ')':
        args.append(self.parse_function_def_arg())
        self.consume_optional_eol()
        # no comma means the argument list is finished
        if not self.consume_optional(content=','):
            break
        self.consume_optional_eol()

    self.consume_next_token(types=TokenType.RBracket, content=")", msg="Expected ')' at the end of function argument list!")

    if self.tokens.peek().content == '->':
        raise UnsupportedSyntaxError(self.tokens.peek(), "Function return type annotations are not yet supported!")

    if self.tokens.peek().content == '=>':
        raise UnsupportedSyntaxError(self.tokens.peek(), "Short function body notation not yet supported!")

    self.consume_next_token(types=TokenType.LBracket, content="{")

    content = list(self.parse_basic_block())

    self.consume_next_token(types=TokenType.RBracket, content="}", msg="Expected '}' at the end of a function body!")

    return FunctionNode(function_name, args, None, content)
|
||||
|
||||
|
||||
def parse_function_def_arg(self) -> FunctionArgument:
    """
    Parse a single argument of a function.

    Currently this allows name: type

    In the future we want to also support name: type = value
    """
    arg_name = self.consume_next_token(types=TokenType.Identifier, msg="Function argument name expected!")
    self.consume_next_token(types=TokenType.Operator, content=":", msg="Function argument name must be followed by a colon ':' and a type definition!")
    arg_type = self.parse_type()

    # a default value would start with '=', which is not implemented
    upcoming = self.tokens.peek()
    if upcoming.content == '=':
        raise UnsupportedSyntaxError(self.tokens.peek(), "default values for function arguments")

    return FunctionArgument(arg_name, arg_type)
|
||||
|
||||
|
||||
def parse_type(self) -> Type:
    """
    Parse a type declaration, such as String, i64, or Vector<i64>
    """
    base = self.consume_next_token(types=TokenType.Identifier, msg="Expected type name!")

    # if this type does not wrap any other types, we are done!
    wraps_others = self.tokens.peek().content == '<'
    if not wraps_others:
        return Type(base, [])

    opening = self.consume_next_token(content="<")
    inner_types = []

    while self.tokens.peek().content != '>':
        inner_types.append(self.parse_type())
        comma = self.consume_optional(content=",")
        if not comma:
            break
        self.consume_optional_eol()

    self.consume_next_token(content=">", msg="Error while parsing list of wrapped types, expected '>' at the end of the type list!")

    if not inner_types:
        # "<>" is accepted, but most likely a mistake
        whole = self.tokens.last_item.span.union(opening.span)
        print_warning(whole, "Empty set of type arguments!")

    return Type(base, inner_types)
|
||||
|
||||
def parse_const_declaration(self):
    """
    parse a const declaration, so basically

        const name: type = value

    The ": type" part is optional.
    """
    self.consume_next_token(types=TokenType.Keyword, content="const")
    name = self.consume_next_token(types=TokenType.Identifier, msg="const keywords must be immediately followed by a variable name!")

    declared_type = None
    if self.tokens.peek().content == ':':
        self.consume_next_token(content=':')
        declared_type = self.parse_type()

    self.consume_next_token(content='=', msg="Expected '=' in const declaration!")
    initializer = self.parse_value()
    self.consume_expected_eol("Const declaration statement must be terminated by a newline!")

    wrapped = Value(None, initializer)
    return VariableDeclarationNode(name, ['const'], declared_type, wrapped)
|
||||
|
||||
def parse_value(self) -> ASTNode:
|
||||
"""
|
||||
This function parses a "value", so basically any statement that evaluates to a value
|
||||
|
||||
This can be a literal, a function call, an array/struct constructor, etc.
|
||||
"""
|
||||
# handle bracketet expression
|
||||
if self.tokens.peek().content == '(':
|
||||
self.consume_next_token(content='(')
|
||||
self.consume_optional_eol()
|
||||
value = self.parse_value()
|
||||
self.consume_optional_eol()
|
||||
self.consume_next_token(content=')', msg="Expected closing bracket")
|
||||
|
||||
value = self._inner_parse_value()
|
||||
|
||||
match self.tokens.peek():
|
||||
case OperatorToken(content='..'):
|
||||
self.consume_next_token(types=TokenType.Operator, content="..")
|
||||
right_hand_type = self.parse_type()
|
||||
return SpreadOperatorNode(value, right_hand_type, None)
|
||||
#case OperatorToken(content):
|
||||
# raise UnsupportedSyntaxError(self.tokens.peek(), f"'{content}' is not implemented yet!")
|
||||
case Token(kind=TokenType.LBracket, content="("):
|
||||
self.consume_next_token(content="(")
|
||||
self.consume_optional_eol()
|
||||
args = list(self.parse_inner_function_call_args())
|
||||
self.consume_next_token(content=')', msg="")
|
||||
return FunctionCallNode(value, args)
|
||||
|
||||
def _inner_parse_value(self) -> ASTNode:
|
||||
match self.tokens.peek():
|
||||
case IntegerLiteralToken(value, suffix, span):
|
||||
if suffix:
|
||||
type = Type(suffix, [])
|
||||
else:
|
||||
# assume widest signed integer type available
|
||||
type = Type('i64', [])
|
||||
self.consume_next_token()
|
||||
return IntegerImmediateNode(value, type, span)
|
||||
case Token(span, content, kind=TokenType.String):
|
||||
self.consume_next_token()
|
||||
return StringImmediateNode(content, Type("String", []), span)
|
||||
case Token(content="{", kind=TokenType.LBracket):
|
||||
return self.parse_structured_value()
|
||||
case IdentifierToken():
|
||||
return VariableNameNode(self.consume_next_token(TokenType.Identifier))
|
||||
case other:
|
||||
raise UnsupportedSyntaxError(other, "This type of value is not implemented yet!")
|
||||
|
||||
def parse_inner_function_call_args(self) -> Iterable[ASTNode]:
    """
    Lazily parse the arguments of a function call.

    Expects the opening '(' to be consumed already and stops in front of the
    closing ')', which is left for the caller to consume. Arguments are
    separated by commas, line breaks around them are skipped, and a trailing
    comma after the last argument is allowed.

    Parsing also stops when an argument is followed by neither ',' nor ')'.
    The unexpected token is left in the stream, so the caller's check for
    the closing ')' reports the error instead of the token being silently
    parsed as a further argument.

    Yields one node per argument, as returned by parse_value().
    """
    self.consume_optional_eol()

    while self.tokens.peek().content != ')':
        yield self.parse_value()
        self.consume_optional_eol()

        # an argument that is not followed by a comma must be the last one
        if self.tokens.peek().content != ',':
            break
        self.consume_next_token(content=",")

        # skip line breaks after the comma, so that a trailing comma directly
        # in front of the ')' is detected by the loop condition
        self.consume_optional_eol()
|
||||
|
||||
def parse_structured_value(self) -> ASTNode:
    """
    Parse a list or a struct initializer. Not supported yet: this always
    raises an UnsupportedSyntaxError for the upcoming token.

    A list initializer looks like this:
        const data: Vector<i64> = {1,2,3,4,5}

    A struct initializer looks like this:
        const data: MyStruct = {
            field1: 1
            field2: "Hello World"
            arrayField: {"test", "123", "these are strings"}
        }
    """
    offending_token = self.tokens.peek()
    raise UnsupportedSyntaxError(offending_token, "Structured values such as lists, dictionaries and structs!")
|
||||
|
||||
def parse_for_statement(self) -> ForLoopNode:
    """
    Parse a for loop of the form

        for <name> in <value> {
            <basic block>
        }

    Line breaks are skipped after the opening brace, around the closing
    brace and after the whole statement.

    Returns a ForLoopNode built from the loop variable token, the value that
    is iterated over and the list of parsed body elements.
    """
    self.consume_next_token(types=TokenType.Keyword, content='for')
    variable_token = self.consume_next_token(types=TokenType.Identifier, msg="Name of the loop variable expected!")
    self.consume_next_token(types=TokenType.Keyword, content="in", msg="for <name> in <value> format required!")

    iterated_value = self.parse_value()

    # loop body, enclosed in braces
    self.consume_next_token(content='{')
    self.consume_optional_eol()
    statements = [*self.parse_basic_block()]
    self.consume_optional_eol()
    self.consume_next_token(content='}')

    self.consume_optional_eol()

    return ForLoopNode(variable_token, iterated_value, statements)
|
21
example.pmp
Normal file
21
example.pmp
Normal file
@ -0,0 +1,21 @@
|
||||
/*struct something {
|
||||
i64 i
|
||||
Array<i64> data
|
||||
|
||||
private i64 total
|
||||
}*/
|
||||
|
||||
use std.string
|
||||
|
||||
function main(args: Array<String>) {
|
||||
const cars = 100
|
||||
|
||||
for x in 0x01..cars {
|
||||
print("{} cars are driving around the block", cars,)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
0
runtime_lib/ProgramQueue.cpp
Normal file
0
runtime_lib/ProgramQueue.cpp
Normal file
2
runtime_lib/ProgramQueue.h
Normal file
2
runtime_lib/ProgramQueue.h
Normal file
@ -0,0 +1,2 @@
|
||||
#pragma once
|
||||
|
25
test.py
Normal file
25
test.py
Normal file
@ -0,0 +1,25 @@
|
||||
from compiler.errors import *
|
||||
from compiler.lexer import Lexer, LexingContext
|
||||
from compiler.parser import Parser
|
||||
import os
|
||||
|
||||
# Small driver script: lexes and parses ./example.pmp and prints the compiler's
# context message if a CompilerError is raised along the way.
fname = os.path.abspath('./example.pmp')

# the context starts without any sources; the entrypoint is the example file
c = LexingContext(dict(), fname, dict(), dict())

try:
    # read the source as UTF-8 explicitly, so the result does not depend on
    # the default encoding of the platform
    with open(fname, 'r', encoding='utf-8') as f:
        c.sources[fname] = f.read()

    lex = Lexer(fname, c)

    a = Parser(lex)

    elems = a.parse()

except CompilerError as err:
    err.print_context_message()
|
25
zipfs.pmp
Normal file
25
zipfs.pmp
Normal file
@ -0,0 +1,25 @@
|
||||
use std.random randint
|
||||
|
||||
const ALPHABET = 26
|
||||
|
||||
function main() {
|
||||
|
||||
// generate a lot of words
|
||||
map const x in 0..10000 into words {
|
||||
let word = 0
|
||||
let len = 0
|
||||
let char = randint(ALPHABET)
|
||||
|
||||
while char != 0 {
|
||||
word *= ALPHABET
|
||||
word += char
|
||||
len++
|
||||
|
||||
char = randint(ALPHABET)
|
||||
}
|
||||
|
||||
yield word
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user