You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

213 lines
6.3 KiB
Python

from dataclasses import dataclass, fields
from mimetypes import suffix_map
from ntpath import join
from typing import Dict, List, Iterable, Generator, Any, Literal
from enum import Enum, auto
@dataclass
class LexingContext:
    """Shared state for a lexing run: raw source texts plus collected definitions."""
    sources: Dict[str, str]     # source name -> full source text
    entrypoint: str             # name of the entry source file
    structs: Dict[int, Any]     # TODO: struct def type
    functions: Dict[str, Any]   # TODO: function types

    def get_nth_line_bounds(self, source_name: str, n: int):
        """Return (start, end) byte offsets of the n-th (0-based) line of a source.

        Returns None if the source has fewer than n+1 lines.
        Raises KeyError for an unknown source name.
        """
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        start = 0
        source = self.sources[source_name]
        for _ in range(n):
            next_start = source.find('\n', start)
            if next_start == -1:
                return None
            start = next_start + 1
        end = source.find('\n', start)
        # Fix: a final line without a trailing newline previously reported
        # end == -1, an invalid bound; clamp to the end of the source.
        return start, (end if end != -1 else len(source))

    def get_lines_containing(self, span: 'Span'):
        """Return (lines, offset, line_no) for the source lines covering *span*.

        lines   -- list of the full source lines the span touches
        offset  -- byte offset of the first returned line
        line_no -- 1-based number of the first returned line
        Returns None if the span starts beyond the last newline.
        Raises KeyError for an unknown source name.
        """
        if span.source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(span.source_name))
        start = 0
        line_no = 0
        source = self.sources[span.source_name]
        while True:
            next_start = source.find('\n', start)
            line_no += 1
            # handle eof
            if next_start == -1:
                return None
            # as long as the next newline comes before the span's start we are good
            if next_start < span.start:
                start = next_start + 1
                continue
            # if the whole span is on one line, we are good as well
            if next_start >= span.end:
                return [source[start:next_start]], start, line_no
            # Span spills onto later lines. Fix: when it runs past the last
            # newline, find() keeps returning -1 (always < span.end) and the
            # original looped forever; clamp to the end of the source instead.
            while next_start < span.end:
                next_newline = source.find('\n', next_start + 1)
                if next_newline == -1:
                    next_start = len(source)
                    break
                next_start = next_newline
            return source[start:next_start].split('\n'), start, line_no
@dataclass(frozen=True)
class Span:
    """A byte range [start, end) inside one source file of a LexingContext."""
    start: int
    """
    Start of tokens location in source file, global byte offset in file
    """
    end: int
    """
    End of tokens location in source file, global byte offset in file
    """
    source_name: str
    context: LexingContext

    def union(self, *spans: 'Span'):
        """Return the smallest Span covering self and all *spans*.

        All arguments must share this span's source file and context.
        Fix: calling union() with no arguments previously raised a
        TypeError (min() applied to a bare int); it now returns self.
        """
        if not spans:
            return self
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        return Span(
            start=min(self.start, *(span.start for span in spans)),
            end=max(self.end, *(span.end for span in spans)),
            source_name=self.source_name,
            context=self.context
        )

    def transform(self, start: int = 0, end: int = 0):
        """Return a copy with start moved forward by *start* and end moved back by *end*."""
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        # Deliberately omits context (it holds whole source texts).
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )
class TokenType(Enum):
    """Categories a lexed token can belong to."""
    # Values are written out explicitly; they match what auto() would
    # assign (1-based, in declaration order).
    Keyword = 1
    Integer = 2
    Float = 3
    Identifier = 4
    String = 5
    LBracket = 6
    RBracket = 7
    Operator = 8
    LineComment = 9
    MultiComment = 10
    EOL = 11  # End of Line
    EOI = 12  # End of Input
@dataclass(frozen=True)
class Token:
    """Base token produced by the lexer.

    Instantiating Token directly dispatches to a specialized subclass:
    __new__ consults the module-level token_type_to_subclass_map, so e.g.
    Token(span, content, TokenType.Integer) yields an IntegerLiteralToken.
    """
    span: Span       # location of the token in its source file
    content: str     # raw source text of the token
    kind: TokenType  # token category
    def __new__(cls, span: Span, content: str, kind: TokenType):
        # Redirect only when called on the base class itself; when the
        # subclass re-enters this __new__, cls != Token and construction
        # proceeds normally (preventing infinite recursion).
        if kind in token_type_to_subclass_map and cls == Token:
            return token_type_to_subclass_map[kind].__new__(
                token_type_to_subclass_map[kind], span, content, kind
            )
        return super().__new__(cls)
    def __repr__(self):
        # Skip span and kind in the field dump; 'context' is not a field of
        # Token itself — presumably filtered in case a subclass adds one.
        fields_to_print = [field for field in fields(self) if field.name not in ('span', 'context', 'kind')]
        if self.__class__ == Token:
            # Base Token: include the kind name, since the class name alone
            # does not identify the token category.
            return "{}[{}]({})".format(
                self.__class__.__name__,
                self.kind.name,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )
        else:
            # Subclasses encode the category in their class name already.
            return "{}({})".format(
                self.__class__.__name__,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )
@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    """Integer token carrying its parsed value, base format and type suffix."""
    value: int
    # Fix: the annotation previously listed 'int', but the code below only
    # ever assigns 'hex', 'bin', 'dec' (and 'oct' is reserved by `digits`).
    format: Literal['hex', 'bin', 'dec', 'oct']
    suffix: str | None  # e.g. 'i32' / 'u8', or None when unsuffixed

    def __init__(self, span: Span, content: str, kind: TokenType):
        """Parse *content* (full literal text incl. suffix) into value/format/suffix.

        Raises ValueError if the remaining digits are not valid for the
        detected base.
        """
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer
        # Strip a trailing type suffix, e.g. '42u8' -> '42' with suffix 'u8'.
        suffix = None
        for candidate in integer_type_suffixes:
            if content.endswith(candidate):
                suffix = candidate
                # Fix: str.removesuffix returns a new string; the original
                # discarded the result, so int() below failed on any
                # suffixed literal.
                content = content.removesuffix(candidate)
                break
        format = 'dec'
        if content.startswith(('0x', '0X')):
            value = int(content, 16)
            format = 'hex'
        elif content.startswith(('0b', '0B')):  # fix: accept uppercase '0B' like '0X'
            value = int(content, 2)
            format = 'bin'
        else:
            value = int(content, 10)
        # Frozen dataclass: fields must be assigned via object.__setattr__.
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", format)
@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    """Marker subclass for TokenType.Keyword tokens; adds no fields."""
    pass
@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    """Marker subclass for TokenType.Operator tokens; adds no fields."""
    pass
@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    """Marker subclass for TokenType.Identifier tokens; adds no fields."""
    pass
# Dispatch table used by Token.__new__: constructing a base Token with one of
# these kinds yields an instance of the mapped subclass instead.
token_type_to_subclass_map = {
    TokenType.Integer: IntegerLiteralToken,
    TokenType.Keyword: KeywordToken,
    TokenType.Operator: OperatorToken,
    TokenType.Identifier: IdentifierToken
}
# Reserved words of the language; identifiers matching one of these are
# lexed as TokenType.Keyword.
keywords = {
    'const', 'let', 'for', 'if', 'function',
    'true', 'false', 'in', 'not', 'or', 'and',
    'struct', 'private', 'public', 'return',
    'impure', 'while', 'use', 'do', 'continue',
    'break'
}
# Characters allowed in the digit part of an integer literal, per base.
# '_' is permitted as a digit separator in every base.
digits = {
    'bin': '01_',
    'hex': '0123456789abcdefABCDEF_',
    'dec': '0123456789_',
    'oct': '01234567_' # TODO: implement octal literals?
}
# All operator spellings, single- and multi-character.
operators = {
    '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||',
    '??', '%','==', '!=', '<=', '>=', '..', '=>', '++', '--',
    '=', '*=', '+=', '/=', '-='
}
# Type suffixes an integer literal may carry, e.g. '42u8'
# (see IntegerLiteralToken).
integer_type_suffixes = {
    'i8', 'i16', 'i32', 'i64',
    'u8', 'u16', 'u32', 'u64',
}
# Characters with no assigned meaning yet; reserved for future syntax.
reserved_special_chars = {
    '#', '~', '`', '"', '\'', '@', '|', ';'
}
# Bracket characters of every kind, as a plain string.
parens = '[]{}()<>'
# Any of these characters ends an identifier during lexing: operators,
# brackets, whitespace, and the reserved specials.
identifier_terminating_chars = set((*operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars))