from dataclasses import dataclass, fields
from typing import Dict, Any, Literal
from enum import Enum, auto


@dataclass
class LexingContext:
    sources: Dict[str, str]
    entrypoint: str
    structs: Dict[int, Any]    # TODO: struct def type
    functions: Dict[str, Any]  # TODO: function types

    def get_nth_line_bounds(self, source_name: str, n: int):
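        """
        Return the (start, end) byte offsets of the n-th (0-based) line of the
        given source file, or None if the file has fewer lines.
        """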
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        start = 0
        source = self.sources[source_name]
        for _ in range(n):
            next_start = source.find('\n', start)
            if next_start == -1:
                return None
            start = next_start + 1
        end = source.find('\n', start)
        # a final line without a trailing newline ends at EOF
        return start, end if end != -1 else len(source)

    def get_lines_containing(self, span: 'Span'):
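        """
        Return a tuple (lines, start, line_no): the source lines the span
        covers, the byte offset at which the first of them begins, and its
        1-based line number; None if the span starts past the last newline.
        """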
        if span.source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(span.source_name))
        start = 0
        line_no = 0
        source = self.sources[span.source_name]
        while True:
            next_start = source.find('\n', start)
            line_no += 1
            # handle EOF
            if next_start == -1:
                return None
            # as long as the next newline comes before the span's start we are good
            if next_start < span.start:
                start = next_start + 1
                continue
            # if the whole span is on one line, we are good as well
            if next_start >= span.end:
                return [source[start:next_start]], start, line_no
            while next_start < span.end:
                next_start = source.find('\n', next_start + 1)
                if next_start == -1:
                    # span runs to EOF without a trailing newline
                    next_start = len(source)
                    break

            return source[start:next_start].split('\n'), start, line_no


@dataclass(frozen=True)
class Span:
    start: int
    """
    Start of the token's location in the source file, as a global byte offset
    """
    end: int
    """
    End of the token's location in the source file, as a global byte offset
    """
    source_name: str
    context: LexingContext

    def union(self, *spans: 'Span'):
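        """
        Return the smallest Span covering this span and all given spans.
        All spans must share the same source file and lexing context.
        """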
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        # wrap the candidates in a tuple so that union() without arguments
        # degenerates to a copy of self instead of a TypeError from min()
        return Span(
            start=min((self.start, *(span.start for span in spans))),
            end=max((self.end, *(span.end for span in spans))),
            source_name=self.source_name,
            context=self.context
        )

    def transform(self, start: int = 0, end: int = 0):
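        """
        Return a copy of this span with `start` bytes trimmed from the front
        and `end` bytes trimmed from the back (e.g. to drop string quotes).
        """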
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )


class TokenType(Enum):
    Keyword = auto()
    Integer = auto()
    Float = auto()
    Identifier = auto()
    String = auto()
    LBracket = auto()
    RBracket = auto()
    Operator = auto()
    LineComment = auto()
    MultiComment = auto()
    EOL = auto()  # End of Line
    EOI = auto()  # End of Input


@dataclass(frozen=True)
class Token:
    span: Span
    content: str
    kind: TokenType

    def __new__(cls, span: Span, content: str, kind: TokenType):
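        # Construct through the specialized subclass for this token kind where
        # one exists (see token_type_to_subclass_map below), so that plain
        # Token(...) calls transparently return e.g. an IntegerLiteralToken.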
        if kind in token_type_to_subclass_map and cls == Token:
            return token_type_to_subclass_map[kind].__new__(
                token_type_to_subclass_map[kind], span, content, kind
            )
        return super().__new__(cls)

    def __repr__(self):
        fields_to_print = [field for field in fields(self) if field.name not in ('span', 'context', 'kind')]
        if self.__class__ == Token:
            return "{}[{}]({})".format(
                self.__class__.__name__,
                self.kind.name,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )
        else:
            return "{}({})".format(
                self.__class__.__name__,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )


@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    value: int
    format: Literal['hex', 'bin', 'dec', 'oct']
    suffix: str | None

    def __init__(self, span: Span, content: str, kind: TokenType):
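        """
        Split the literal into an optional type suffix and its numeric value,
        e.g. '0xFFu8' yields value=255, format='hex', suffix='u8'.
        """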
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer

        suffix = None
        for suffix_ in integer_type_suffixes:
            if content.endswith(suffix_):
                suffix = suffix_
                # strings are immutable, so the stripped result must be reassigned
                content = content.removesuffix(suffix_)
                break

        format = 'dec'
        if content.startswith('0x') or content.startswith('0X'):
            value = int(content, 16)
            format = 'hex'
        elif content.startswith('0b'):
            value = int(content, 2)
            format = 'bin'
        else:
            value = int(content, 10)
        # frozen dataclass: fields must be set through object.__setattr__
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", format)


@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    pass


@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    pass


@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    pass


token_type_to_subclass_map = {
    TokenType.Integer: IntegerLiteralToken,
    TokenType.Keyword: KeywordToken,
    TokenType.Operator: OperatorToken,
    TokenType.Identifier: IdentifierToken
}

keywords = {
    'const', 'let', 'for', 'if', 'function',
    'true', 'false', 'in', 'not', 'or', 'and',
    'struct', 'private', 'public', 'return',
    'impure', 'while', 'use', 'do', 'continue',
    'break'
}

digits = {
    'bin': '01_',
    'hex': '0123456789abcdefABCDEF_',
    'dec': '0123456789_',
    'oct': '01234567_'  # TODO: implement octal literals?
}

operators = {
    '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||',
    '??', '%', '==', '!=', '<=', '>=', '..', '=>', '++', '--',
    '=', '*=', '+=', '/=', '-='
}

integer_type_suffixes = {
    'i8', 'i16', 'i32', 'i64',
    'u8', 'u16', 'u32', 'u64',
}

reserved_special_chars = {
    '#', '~', '`', '"', '\'', '@', '|', ';'
}

parens = '[]{}()<>'

identifier_terminating_chars = set((*operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars))
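

# Minimal usage sketch (illustrative, not part of the lexer proper): it shows
# the subclass dispatch performed by Token.__new__ together with the Span and
# LexingContext helpers above. The source name 'demo.src' and its contents are
# made up for the example.
if __name__ == '__main__':
    ctx = LexingContext(sources={'demo.src': 'let x = 0xFFu8\n'},
                        entrypoint='demo.src', structs={}, functions={})
    # span of the literal '0xFFu8': byte offsets 8 (inclusive) to 14 (exclusive)
    span = Span(8, 14, 'demo.src', ctx)
    tok = Token(span, '0xFFu8', TokenType.Integer)
    assert isinstance(tok, IntegerLiteralToken) and tok.value == 255
    print(tok)                             # IntegerLiteralToken(content='0xFFu8', value=255, ...)
    print(ctx.get_lines_containing(span))  # (['let x = 0xFFu8'], 0, 1)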