You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

213 lines
6.3 KiB

from dataclasses import dataclass, fields
from mimetypes import suffix_map
from ntpath import join
from typing import Dict, List, Iterable, Generator, Any, Literal
from enum import Enum, auto
class LexingContext:
    """
    Shared state for lexing a set of source texts.

    Keeps the full text of every known source, keyed by source name, and
    offers helpers that translate offsets into those texts into lines.
    """
    sources: Dict[str, str]  # source name -> complete source text
    entrypoint: str  # presumably the name of the main source in `sources` -- TODO confirm
    structs: Dict[int, Any]  # TODO: struct def type
    functions: Dict[str, Any]  # TODO: function types

    def _get_source(self, source_name: str) -> str:
        """Return the text registered under `source_name` or raise KeyError."""
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        return self.sources[source_name]

    def get_nth_line_bounds(self, source_name: str, n: int) -> 'tuple[int, int] | None':
        """
        Locate the n-th line (zero-based) of a source.

        :param source_name: key into `self.sources`
        :param n: zero-based line index
        :return: `(start, end)` where `start` is the offset of the first
            character of the line and `end` is the offset of its terminating
            newline (or the length of the source if the last line has no
            trailing newline). `None` if the source contains fewer than `n`
            newlines.
        :raises KeyError: if `source_name` is not a known source
        """
        source = self._get_source(source_name)
        start = 0
        for _ in range(n):
            newline = source.find('\n', start)
            if newline == -1:
                return None
            start = newline + 1
        end = source.find('\n', start)
        if end == -1:
            # Unterminated last line: report the end of the text instead of
            # leaking find()'s -1, which would silently drop the last
            # character when used as a slice bound.
            end = len(source)
        return start, end

    def get_lines_containing(self, span: 'Span') -> 'tuple[list[str], int, int] | None':
        """
        Return the complete source lines that a span touches.

        :param span: object with `start`, `end` and `source_name` attributes
        :return: `(lines, start, line_no)` where `lines` is the list of full
            lines covering the span, `start` is the offset of the first
            returned line and `line_no` is the one-based number of that line.
            `None` if the span starts beyond the end of the source.
        :raises KeyError: if `span.source_name` is not a known source
        """
        source = self._get_source(span.source_name)
        line_start = 0
        line_no = 0
        # Walk line by line until we reach the line on which the span starts.
        while True:
            line_end = source.find('\n', line_start)
            line_no += 1
            if line_end == -1:
                # Last line without a trailing newline ends at end of text.
                line_end = len(source)
                if line_end < span.start:
                    return None
                break
            if line_end < span.start:
                # This line ends before the span begins, skip it.
                line_start = line_end + 1
                continue
            break
        # Grow the window line by line until it also covers the span's end.
        while line_end < span.end:
            next_newline = source.find('\n', line_end + 1)
            if next_newline == -1:
                # Stop at end of text; searching again from -1 would wrap
                # around to the beginning and never terminate.
                line_end = len(source)
                break
            line_end = next_newline
        return source[line_start:line_end].split('\n'), line_start, line_no
@dataclass(frozen=True, repr=False)
class Span:
    """
    A region of a single source text, identified by start and end offsets.
    """
    # Start of the token's location in the source file (global offset in file).
    start: int
    # End of the token's location in the source file (global offset in file).
    end: int
    # Name of the source the offsets refer to.
    source_name: str
    # Lexing context this span was produced in (string annotation so the
    # class does not require LexingContext at definition time).
    context: 'LexingContext'

    def union(self, *spans: 'Span') -> 'Span':
        """
        Return the smallest span covering this span and all given spans.

        All spans must share this span's source name and context (checked
        with assertions). Calling it without arguments returns a span equal
        to this one.
        """
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        # Include self in the sequence so min()/max() always receive an
        # iterable, even when no other spans were given.
        members = (self, *spans)
        return Span(
            start=min(span.start for span in members),
            end=max(span.end for span in members),
            source_name=self.source_name,
            context=self.context,
        )

    def transform(self, start: int = 0, end: int = 0) -> 'Span':
        """
        Return a copy with `start` added to the start offset and `end`
        subtracted from the end offset.
        """
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )
class TokenType(Enum):
    """Kinds of tokens the lexer distinguishes."""
    Keyword = auto()
    Integer = auto()
    Float = auto()
    Identifier = auto()
    String = auto()
    LBracket = auto()
    RBracket = auto()
    Operator = auto()
    LineComment = auto()
    MultiComment = auto()
    EOL = auto()  # End of Line
    EOI = auto()  # End of Input
@dataclass(frozen=True, repr=False)
class Token:
    """
    A single lexed token.

    Constructing `Token(span, content, kind)` directly yields an instance of
    the subclass registered for `kind` in `token_type_to_subclass_map`, if
    there is one; otherwise a plain `Token`.
    """
    span: Span
    content: str
    kind: TokenType

    def __new__(cls, span: Span, content: str, kind: TokenType):
        # Only redirect when Token itself is being instantiated; subclasses
        # that are constructed directly must not be redirected again.
        if kind in token_type_to_subclass_map and cls is Token:
            subclass = token_type_to_subclass_map[kind]
            return subclass.__new__(subclass, span, content, kind)
        return super().__new__(cls)

    def __repr__(self):
        # 'span' and 'kind' are left out of the field listing (there is no
        # 'context' field on Token itself; the name is excluded defensively
        # for subclasses).
        fields_to_print = [
            field for field in fields(self)
            if field.name not in ('span', 'context', 'kind')
        ]
        printed_fields = ", ".join(
            "{}={}".format(field.name, repr(getattr(self, field.name)))
            for field in fields_to_print
        )
        if self.__class__ == Token:
            # Plain tokens carry no type information in their class name,
            # so the kind is shown in brackets.
            return "{}[{}]({})".format(
                self.__class__.__name__, self.kind.name, printed_fields
            )
        return "{}({})".format(self.__class__.__name__, printed_fields)
@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    """
    Token for an integer literal.

    In addition to the raw `content` it stores the parsed numeric `value`,
    the notation the literal was written in (`format`) and the integer type
    suffix (one of `integer_type_suffixes`) if the literal carries one.
    """
    value: int
    format: Literal['hex', 'bin', 'dec', 'oct']
    suffix: str | None

    def __init__(self, span: Span, content: str, kind: TokenType):
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer

        # Detect a trailing type suffix such as "u8" or "i32".
        suffix = next(
            (candidate for candidate in integer_type_suffixes if content.endswith(candidate)),
            None,
        )
        # int() cannot parse the suffix, so only hand it the digits.
        number_text = content if suffix is None else content[:-len(suffix)]

        # int() accepts the 0x / 0b prefix when given the matching base.
        prefix = number_text[:2].lower()
        if prefix == '0x':
            number_format = 'hex'
            value = int(number_text, 16)
        elif prefix == '0b':
            number_format = 'bin'
            value = int(number_text, 2)
        else:
            number_format = 'dec'
            value = int(number_text, 10)

        # The dataclass is frozen, so bypass the blocked __setattr__.
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", number_format)
@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    """Token subclass for keywords; adds no fields of its own."""
@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    """Token subclass for operators; adds no fields of its own."""
@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    """Token subclass for identifiers; adds no fields of its own."""
# Token kinds that have a dedicated Token subclass. Token.__new__ consults
# this table; kinds that are not listed stay plain Token instances.
token_type_to_subclass_map = {
    TokenType.Integer: IntegerLiteralToken,
    TokenType.Keyword: KeywordToken,
    TokenType.Operator: OperatorToken,
    TokenType.Identifier: IdentifierToken,
}
# Reserved words of the language.
# NOTE(review): 'else' and 'break' are not listed although 'if' and
# 'continue' are -- confirm this is intentional.
keywords = {
    'const', 'let', 'for', 'if', 'function',
    'true', 'false', 'in', 'not', 'or', 'and',
    'struct', 'private', 'public', 'return',
    'impure', 'while', 'use', 'do', 'continue',
}

# Characters permitted in an integer literal, per notation.
digits = {
    'bin': '01_',
    'hex': '0123456789abcdefABCDEF_',
    'dec': '0123456789_',
    'oct': '01234567_',  # TODO: implement octal literals?
}

# All operator spellings (single- and multi-character).
operators = {
    '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||',
    '??', '%', '==', '!=', '<=', '>=', '..', '=>', '++', '--',
    '=', '*=', '+=', '/=', '-=',
}

# Type suffixes that may follow an integer literal.
integer_type_suffixes = {
    'i8', 'i16', 'i32', 'i64',
    'u8', 'u16', 'u32', 'u64',
}

# Special characters listed as reserved.
reserved_special_chars = {
    '#', '~', '`', '"', '\'', '@', '|', ';',
}

# Bracket characters, as opening/closing pairs.
parens = '[]{}()<>'

# Operators, brackets, whitespace and reserved characters collected in one
# set; going by the name, hitting one of these ends an identifier.
identifier_terminating_chars = {
    *operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars,
}