from dataclasses import dataclass, fields
from typing import Dict, List, Iterable, Generator, Any, Literal
from enum import Enum, auto

@dataclass
class LexingContext:
    sources: Dict[str, str]
    entrypoint: str
    structs: Dict[int, Any]      # TODO: struct def type
    functions: Dict[str, Any]    # TODO: function types

    def get_nth_line_bounds(self, source_name: str, n: int):
        # Return (start, end) byte offsets of line n (0-based); end is the offset of the
        # line's terminating newline (-1 if absent). Returns None if the source has
        # fewer than n newlines.
        if source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(source_name))
        start = 0
        source = self.sources[source_name]
        for i in range(n):
            next_start = source.find('\n', start)
            if next_start == -1:
                return None
            start = next_start + 1
        return start, source.find('\n', start)

    def get_lines_containing(self, span: 'Span'):
        # Return (lines, start, line_no): the source lines covered by the span, the byte
        # offset of the first of those lines, and its 1-based line number. Returns None
        # if the span starts after the last newline in the source.
        if span.source_name not in self.sources:
            raise KeyError("Unknown source file \"{}\"!".format(span.source_name))
        start = 0
        line_no = 0
        source = self.sources[span.source_name]
        while True:
            next_start = source.find('\n', start)
            line_no += 1
            # handle eof
            if next_start == -1:
                return None
            # as long as the next newline comes before the span's start we are good
            if next_start < span.start:
                start = next_start + 1
                continue
            # if the whole span is on one line, we are good as well
            if next_start >= span.end:
                return [source[start:next_start]], start, line_no
            while next_start < span.end:
                next_start = source.find('\n', next_start + 1)

            return source[start:next_start].split('\n'), start, line_no

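# Usage sketch, not part of the lexer itself (file name and offsets below are made up,
# shown as comments only): get_lines_containing() is what error reporting would
# typically build on, e.g.
#
#   ctx = LexingContext(sources={"main.zl": "let x = 1\nlet y = 2\n"},
#                       entrypoint="main.zl", structs={}, functions={})
#   span = Span(start=10, end=19, source_name="main.zl", context=ctx)
#   ctx.get_lines_containing(span)   # -> (['let y = 2'], 10, 2)
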
@dataclass(frozen=True)
class Span:
    start: int
    """
    Start of the token's location in the source file, as a global byte offset into the file
    """
    end: int
    """
    End of the token's location in the source file, as a global byte offset into the file
    """
    source_name: str
    context: LexingContext

    def union(self, *spans: 'Span'):
        for span in spans:
            assert span.source_name == self.source_name
            assert span.context == self.context
        return Span(
            start=min(self.start, *(span.start for span in spans)),
            end=max(self.end, *(span.end for span in spans)),
            source_name=self.source_name,
            context=self.context
        )

    def transform(self, start: int = 0, end: int = 0):
        return Span(self.start + start, self.end - end, self.source_name, self.context)

    def __repr__(self):
        return "{}(start={},end={},source_name={})".format(
            self.__class__.__name__,
            self.start, self.end, self.source_name
        )

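# Usage sketch (hypothetical spans, shown as comments only): union() widens a span to
# cover all given spans, transform() trims the given number of bytes from the front
# and back, e.g.
#
#   a = Span(0, 3, "main.zl", ctx)
#   b = Span(8, 12, "main.zl", ctx)
#   a.union(b)          # -> Span(start=0,end=12,source_name=main.zl)
#   a.transform(1, 1)   # -> Span(start=1,end=2,source_name=main.zl)
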
class TokenType(Enum):
    Keyword = auto()
    Integer = auto()
    Float = auto()
    Identifier = auto()
    String = auto()
    LBracket = auto()
    RBracket = auto()
    Operator = auto()
    LineComment = auto()
    MultiComment = auto()
    EOL = auto()  # End of Line
    EOI = auto()  # End of Input

@dataclass(frozen=True)
class Token:
    span: Span
    content: str
    kind: TokenType

    def __new__(cls, span: Span, content: str, kind: TokenType):
        if kind in token_type_to_subclass_map and cls == Token:
            return token_type_to_subclass_map[kind].__new__(
                token_type_to_subclass_map[kind], span, content, kind
            )
        return super().__new__(cls)

    def __repr__(self):
        fields_to_print = [field for field in fields(self) if field.name not in ('span', 'context', 'kind')]
        if self.__class__ == Token:
            return "{}[{}]({})".format(
                self.__class__.__name__,
                self.kind.name,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )
        else:
            return "{}({})".format(
                self.__class__.__name__,
                ", ".join("{}={}".format(field.name, repr(getattr(self, field.name))) for field in fields_to_print)
            )

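# Construction note (the reprs below are what the code above produces; `span` stands
# for any valid Span): Token.__new__ consults token_type_to_subclass_map (defined
# further down) and hands construction over to the registered subclass, so callers
# can uniformly write Token(...) and still get the specialised token type back, e.g.
#
#   Token(span, 'return', TokenType.Keyword)   # -> KeywordToken(content='return')
#   Token(span, 'hello', TokenType.String)     # no subclass registered -> Token[String](content='hello')
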
@dataclass(frozen=True, init=False, repr=False)
class IntegerLiteralToken(Token):
    value: int
    format: Literal['hex', 'bin', 'dec', 'oct']
    suffix: str | None

    def __init__(self, span: Span, content: str, kind: TokenType):
        super().__init__(span, content, kind)
        assert kind == TokenType.Integer

        suffix = None
        for suffix_ in integer_type_suffixes:
            if content.endswith(suffix_):
                suffix = suffix_
                content = content.removesuffix(suffix_)
                break

        format = 'dec'
        if content.startswith('0x') or content.startswith('0X'):
            value = int(content, 16)
            format = 'hex'
        elif content.startswith('0b'):
            value = int(content, 2)
            format = 'bin'
        else:
            value = int(content, 10)
        object.__setattr__(self, "value", value)
        object.__setattr__(self, "suffix", suffix)
        object.__setattr__(self, "format", format)

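# Parsing sketch (hypothetical literals, shown as comments only; `span` stands for any
# valid Span): the constructor strips a known type suffix first, then picks the base
# from the prefix, so e.g.
#
#   Token(span, '0x1F', TokenType.Integer)      # value=31,   format='hex', suffix=None
#   Token(span, '0b1010', TokenType.Integer)    # value=10,   format='bin', suffix=None
#   Token(span, '1_000u32', TokenType.Integer)  # value=1000, format='dec', suffix='u32'
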
@dataclass(frozen=True, repr=False)
class KeywordToken(Token):
    pass


@dataclass(frozen=True, repr=False)
class OperatorToken(Token):
    pass


@dataclass(frozen=True, repr=False)
class IdentifierToken(Token):
    pass


# Defined after the subclasses so Token.__new__ can dispatch on it.
token_type_to_subclass_map = {
    TokenType.Integer: IntegerLiteralToken,
    TokenType.Keyword: KeywordToken,
    TokenType.Operator: OperatorToken,
    TokenType.Identifier: IdentifierToken
}

keywords = {
    'const', 'let', 'for', 'if', 'function',
    'true', 'false', 'in', 'not', 'or', 'and',
    'struct', 'private', 'public', 'return',
    'impure', 'while', 'use', 'do', 'continue',
    'break'
}

digits = {
    'bin': '01_',
    'hex': '0123456789abcdefABCDEF_',
    'dec': '0123456789_',
    'oct': '01234567_'  # TODO: implement octal literals?
}

operators = {
    '+', '-', '*', '/', '!', '.', ',', '<', '>', ':', '<<', '>>', '&&', '||',
    '??', '%', '==', '!=', '<=', '>=', '..', '=>', '++', '--',
    '=', '*=', '+=', '/=', '-='
}

integer_type_suffixes = {
    'i8', 'i16', 'i32', 'i64',
    'u8', 'u16', 'u32', 'u64',
}

reserved_special_chars = {
    '#', '~', '`', '"', '\'', '@', '|', ';'
}

parens = '[]{}()<>'

identifier_terminating_chars = set((*operators, *parens, ' ', '\n', '\t', '\r', *reserved_special_chars))
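
# The guarded block below is a minimal usage sketch (all names and values are made up)
# that doubles as a quick smoke test when editing this module; it is not part of the
# lexer itself.
if __name__ == '__main__':
    # Build a context over a single in-memory source and hand-construct one integer
    # literal token (a real lexer would compute the spans itself).
    demo_src = "let x = 0x1Fu8\n"
    ctx = LexingContext(sources={'<demo>': demo_src}, entrypoint='<demo>',
                        structs={}, functions={})
    lit_span = Span(start=8, end=14, source_name='<demo>', context=ctx)

    tok = Token(lit_span, demo_src[lit_span.start:lit_span.end], TokenType.Integer)
    print(tok)                                  # IntegerLiteralToken(content='0x1Fu8', value=31, format='hex', suffix='u8')
    print(ctx.get_lines_containing(lit_span))   # (['let x = 0x1Fu8'], 0, 1)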