from dataclasses import dataclass, fields from mimetypes import suffix_map from ntpath import join from typing import Dict, List, Iterable, Generator, Any, Literal from enum import Enum, auto @dataclass class LexingContext: sources: Dict[str,str] entrypoint: str structs: Dict[int, Any] # TODO: struct def type functions: Dict[str, Any] # TODO: function types def get_nth_line_bounds(self, source_name: str, n: int): if source_name not in self.sources: raise KeyError("Unknown source file \"{}\"!".format(source_name)) start = 0 source = self.sources[source_name] for i in range(n): next_start = source.find('\n', start) if next_start == -1: return None start = next_start + 1 return start, source.find('\n', start) def get_lines_containing(self, span: 'Span'): if span.source_name not in self.sources: raise KeyError("Unknown source file \"{}\"!".format(span.source_name)) start = 0 line_no = 0 source = self.sources[span.source_name] while True: next_start = source.find('\n', start) line_no += 1 # handle eof if next_start == -1: return None # as long as the next newline comes before the spans start we are good if next_start < span.start: start = next_start + 1 continue # if the whole span is on one line, we are good as well if next_start >= span.end: return [ source[start:next_start] ], start, line_no while next_start < span.end: next_start = source.find('\n', next_start+1) return source[start:next_start].split('\n'), start, line_no @dataclass(frozen=True) class Span: start: int """ Start of tokens location in source file, global byte offset in file """ end: int """ End of tokens location in source file, global byte offset in file """ source_name: str context: LexingContext def union(self, *spans: 'Span'): for span in spans: assert span.source_name == self.source_name assert span.context == self.context return Span( start=min(self.start, *(span.start for span in spans)), end=max(self.end, *(span.end for span in spans)), source_name=self.source_name, context=self.context ) def transform(self, start:int=0, end:int=0): return Span(self.start + start, self.end - end, self.source_name, self.context) def __repr__(self): return "{}(start={},end={},source_name={})".format( self.__class__.__name__, self.start, self.end, self.source_name ) class TokenType(Enum): Keyword = auto() Integer = auto() Float = auto() Identifier = auto() String = auto() LBracket = auto() RBracket = auto() Operator = auto() LineComment = auto() MultiComment = auto() EOL = auto() # End of Line EOI = auto() # End of Input @dataclass(frozen=True) class Token: span: Span content: str kind: TokenType def __new__(cls, span: Span, content: str, kind: TokenType): if kind in token_type_to_subclass_map and cls == Token: return token_type_to_subclass_map[kind].__new__( token_type_to_subclass_map[kind], span, content, kind ) return super().__new__(cls) def __repr__(self): fields_to_print = [field for field in fields(self) if field.name not in ('span', 'context', 'kind')] if self.__class__ == Token: return "{}[{}]({})".format( self.__class__.__name__, self.kind.name, ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print) ) else: return "{}({})".format( self.__class__.__name__, ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print) ) @dataclass(frozen=True, init=False, repr=False) class IntegerLiteralToken(Token): value: int format: Literal['hex', 'int', 'dec', 'oct'] suffix: str | None def __init__(self, span: Span, content: str, kind: TokenType): super().__init__(span, content, kind) assert kind == TokenType.Integer suffix = None for suffix_ in integer_type_suffixes: if content.endswith(suffix_): suffix = suffix_ content.removesuffix(suffix_) break format = 'dec' if content.startswith('0x') or content.startswith('0X'): value = int(content, 16) format = 'hex' elif content.startswith('0b'): value = int(content, 2) format = 'bin' else: value = int(content, 10) object.__setattr__(self, "value", value) object.__setattr__(self, "suffix", suffix) object.__setattr__(self, "format", format) @dataclass(frozen=True, repr=False) class KeywordToken(Token): pass @dataclass(frozen=True, repr=False) class OperatorToken(Token): pass @dataclass(frozen=True, repr=False) class IdentifierToken(Token): pass token_type_to_subclass_map = { TokenType.Integer: IntegerLiteralToken, TokenType.Keyword: KeywordToken, TokenType.Operator: OperatorToken, TokenType.Identifier: IdentifierToken } keywords = { 'const', 'let', 'for', 'if', 'function', 'true', 'false', 'in', 'not', 'or', 'and', 'struct', 'private', 'public', 'return', 'impure', 'while', 'use', 'do', 'continue', 'break' } digits = { 'bin': '01_', 'hex': '0123456789abcdefABCDEF_', 'dec': '0123456789_', 'oct': '01234567_' # TODO: implement octal literals? } operators = { '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||', '??', '%','==', '!=', '<=', '>=', '..', '=>', '++', '--', '=', '*=', '+=', '/=', '-=' } integer_type_suffixes = { 'i8', 'i16', 'i32', 'i64', 'u8', 'u16', 'u32', 'u64', } reserved_special_chars = { '#', '~', '`', '"', '\'', '@', '|', ';' } parens = '[]{}()<>' identifier_terminating_chars = set((*operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars))