# riscemu/riscemu/assembler.py

from enum import Enum, auto
from typing import List
from typing import Optional, Tuple, Union

from .colors import FMT_PARSE, FMT_NONE
from riscemu.types.exceptions import ParseException, ASSERT_LEN
from .helpers import parse_numeric_argument, align_addr, get_section_base_name
from .tokenizer import Token
from .types import (
    Program,
    T_RelativeAddress,
    InstructionContext,
    Instruction,
    BinaryDataMemorySection,
    InstructionMemorySection,
    Int32,
)

INSTRUCTION_SECTION_NAMES = (".text", ".init", ".fini")
"""
Base names of all sections which contain executable code (instead of data).

The first segment of a section name (for ".text.main" that is ".text") is checked
against this tuple to determine the type of the section.
"""


class MemorySectionType(Enum):
    """The kind of content a section holds while it is being assembled."""

    # section content is raw bytes
    Data = auto()
    # section content is a sequence of instructions
    Instructions = auto()


class CurrentSection:
    """
    The section that is currently being filled by the parser.

    ``data`` is a bytearray for data sections and a list of instructions for
    instruction sections; ``base`` is the address of the first entry.
    """

    name: str
    data: Union[List[Instruction], bytearray]
    type: MemorySectionType
    base: int

    def __init__(self, name: str, type: MemorySectionType, base: int = 0):
        self.name = name
        self.type = type
        self.base = base

        # pick the (empty) container matching the section type
        if type == MemorySectionType.Data:
            self.data = bytearray()
        elif type == MemorySectionType.Instructions:
            self.data = []
        else:
            raise ParseException("Unknown section type: {}".format(type))

    def current_address(self) -> T_RelativeAddress:
        """Return the address directly behind the content collected so far."""
        # a data entry is one byte wide, every instruction counts as four bytes
        entry_width = 1 if self.type == MemorySectionType.Data else 4
        return len(self.data) * entry_width + self.base

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(name={self.name},data={self.data},type={self.type.name})"


class ParseContext:
    """
    Mutable state of one assembly run.

    Holds the program that is being built, the program's instruction context
    (which stores the labels) and the section that is currently open, if any.
    """

    section: Optional[CurrentSection]
    context: InstructionContext
    program: Program

    def __init__(self, name: str):
        self.program = Program(name)
        self.context = self.program.context
        self.section = None

    def finalize(self) -> Program:
        """Close the open section (if there is one) and return the program."""
        self._finalize_section()
        return self.program

    def _finalize_section(self):
        """
        Turn the open section into a memory section, add it to the program and
        reset ``self.section`` to None. Does nothing when no section is open.
        """
        open_section = self.section
        if open_section is None:
            return

        if open_section.type == MemorySectionType.Data:
            section_cls = BinaryDataMemorySection
        elif open_section.type == MemorySectionType.Instructions:
            section_cls = InstructionMemorySection
        else:
            # a section of any other type is closed without being added
            section_cls = None

        if section_cls is not None:
            self.program.add_section(
                section_cls(
                    open_section.data,
                    open_section.name,
                    self.context,
                    self.program.name,
                    open_section.base,
                )
            )

        self.section = None

    def new_section(self, name: str, type: MemorySectionType, alignment: int = 4):
        """
        Close the open section and start a new, empty one.

        The new section's base is the current address aligned via ``align_addr``
        to ``alignment``.
        """
        # must be read before finalizing: afterwards no section is open and
        # current_address() would fall back to the program base
        base = align_addr(self.current_address(), alignment)

        self._finalize_section()
        self.section = CurrentSection(name, type, base)

    def add_label(
        self, name: str, value: int, is_global: bool = False, is_relative: bool = False
    ):
        """
        Store ``value`` under ``name`` in the context's labels and optionally
        register the name as a global and/or relative label of the program.
        """
        self.context.labels[name] = value
        if is_global:
            self.program.global_labels.add(name)
        if is_relative:
            self.program.relative_labels.add(name)

    def current_address(self):
        """
        Return the current address of the open section. Without an open section
        this is the program base, or 0 if the program has no base set.
        """
        if self.section is not None:
            return self.section.current_address()
        return self.program.base if self.program.base is not None else 0

    def __repr__(self):
        # the label used to read "setion"; fixed to "section"
        return "{}(\n\tsection={},\n\tprogram={}\n)".format(
            self.__class__.__name__, self.section, self.program
        )


def ASSERT_IN_SECTION_TYPE(context: ParseContext, type: MemorySectionType):
    """
    Ensure the parser is currently inside a section of the given type.

    :raises ParseException: if no section is open, or if the open section has
        a different type than ``type``
    """
    current = context.section
    if current is None:
        message = (
            "Error, expected to be in {} section, but no section is present...".format(
                type.name
            )
        )
    elif current.type != type:
        message = "Error, expected to be in {} section, but currently in {}...".format(
            type.name, current
        )
    else:
        # section is open and has the expected type
        return
    raise ParseException(message)


class AssemblerDirectives:
    """
    This class represents a collection of all assembler directives as documented by
    https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md#pseudo-ops

    All class methods prefixed with op_ are directly used as assembler directives.
    """

    @classmethod
    def op_align(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Pad the open data section with zero bytes until the current address is a
        multiple of the (single) argument.

        NOTE(review): the argument is used as a byte count here, while the manual
        linked in the class docstring describes ``.align`` as a power-of-two
        exponent -- confirm which behaviour is intended. An argument of 0 fails
        with a ZeroDivisionError in the modulo below.
        """
        ASSERT_LEN(args, 1)
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)
        align_to = parse_numeric_argument(args[0])
        current_mod = context.current_address() % align_to
        if current_mod == 0:
            # already aligned, nothing to pad
            return
        context.section.data += bytearray(align_to - current_mod)

    @classmethod
    def op_section(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Start a new section named ``args[0]``. It becomes an instruction section
        if its base name is listed in INSTRUCTION_SECTION_NAMES, otherwise a data
        section.
        """
        ASSERT_LEN(args, 1)
        if get_section_base_name(args[0]) in INSTRUCTION_SECTION_NAMES:
            context.new_section(args[0], MemorySectionType.Instructions)
        else:
            context.new_section(args[0], MemorySectionType.Data)

    @classmethod
    def op_globl(cls, token: Token, args: Tuple[str], context: ParseContext):
        """Register the label name ``args[0]`` as a global label of the program."""
        ASSERT_LEN(args, 1)
        context.program.global_labels.add(args[0])

    @classmethod
    def op_global(cls, token: Token, args: Tuple[str], context: ParseContext):
        """Alias for :meth:`op_globl`."""
        cls.op_globl(token, args, context)

    @classmethod
    def op_equ(cls, token: Token, args: Tuple[str], context: ParseContext):
        """Store the numeric value ``args[1]`` as a label named ``args[0]``."""
        ASSERT_LEN(args, 2)
        name = args[0]
        value = parse_numeric_argument(args[1])
        context.context.labels[name] = value

    @classmethod
    def op_space(cls, token: Token, args: Tuple[str], context: ParseContext):
        """Append ``args[0]`` zero bytes to the open data section."""
        ASSERT_LEN(args, 1)
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)

        size = parse_numeric_argument(args[0])
        # content None makes add_bytes emit `size` zero bytes
        cls.add_bytes(size, None, context)

    @classmethod
    def op_zero(cls, token: Token, args: Tuple[str], context: ParseContext):
        """Append ``args[0]`` zero bytes to the open data section."""
        ASSERT_LEN(args, 1)
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)
        size = parse_numeric_argument(args[0])
        cls.add_bytes(size, bytearray(size), context)

    @classmethod
    def add_bytes(
        cls, size: int, content: Union[None, int, bytearray], context: ParseContext
    ):
        """
        Append content to the open data section.

        :param size: number of bytes to emit; only used when ``content`` is None
            or an int
        :param content: None emits ``size`` zero bytes, an int is converted with
            ``Int32(content).to_bytes(size)``, a bytearray is appended unchanged
        :param context: the parse context whose open section receives the bytes
        :raises ParseException: if the parser is not inside a data section
        """
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)

        if content is None:
            content = bytearray(size)
        if isinstance(content, int):
            # NOTE(review): integers pass through Int32 even for the 8 byte
            # directives -- presumably values wider than 32 bit are not
            # preserved; confirm against Int32.to_bytes
            content = Int32(content).to_bytes(size)

        context.section.data += content

    @classmethod
    def add_text(cls, text: str, context: ParseContext, zero_terminate: bool = True):
        """
        Append ``text`` as ASCII bytes to the open data section.

        :param text: the string to emit; the escape sequences ``\\n`` and ``\\t``
            are replaced by a newline and a tab character
        :param context: the parse context whose open section receives the bytes
        :param zero_terminate: append a single zero byte after the text
        """
        # replace '\t' and '\n' escape sequences
        text = text.replace("\\n", "\n").replace("\\t", "\t")

        encoded_bytes = bytearray(text.encode("ascii"))
        if zero_terminate:
            encoded_bytes += bytearray(1)
        cls.add_bytes(len(encoded_bytes), encoded_bytes, context)

    @classmethod
    def handle_instruction(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Execute the assembler directive given by ``token``.

        The directive name is ``token.value`` without its first character. It is
        resolved in this order: a matching ``op_<name>`` method, the section
        shorthands, the string directives, the data directives listed in
        DATA_OP_SIZES. Anything else is reported on stdout and otherwise ignored.
        """
        op = token.value[1:]
        if hasattr(cls, "op_" + op):
            getattr(cls, "op_" + op)(token, args, context)
        elif op in ("text", "data", "rodata", "bss", "sbss"):
            # shorthand for `.section <token.value>`
            cls.op_section(token, (token.value,), context)
        elif op in ("string", "asciiz", "asciz", "ascii"):
            ASSERT_LEN(args, 1)
            # only the plain `.ascii` directive emits the text without a
            # terminating zero byte; this condition used to be inverted
            cls.add_text(args[0], context, zero_terminate=(op != "ascii"))
        elif op in DATA_OP_SIZES:
            size = DATA_OP_SIZES[op]
            # every argument becomes one value of `size` bytes
            for arg in args:
                cls.add_bytes(size, parse_numeric_argument(arg), context)
        else:
            print(
                FMT_PARSE
                + "Unknown assembler directive: {} {} in {}".format(
                    token, args, context
                )
                + FMT_NONE
            )


# Width in bytes of one value for each data directive; keys are the directive
# names without their leading character (e.g. "word" for ".word").
DATA_OP_SIZES = {
    **dict.fromkeys(("byte",), 1),
    **dict.fromkeys(("2byte", "half", "short"), 2),
    **dict.fromkeys(("4byte", "word", "long"), 4),
    **dict.fromkeys(("8byte", "dword", "quad"), 8),
}
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`from enum import Enum, auto`
Maor round of bugfixes and incremental improvements - fixed errors in TextIO and IOModule - moved to Int32 and UInt32 based arithmetic - added a lot of end-to-end and other tests 3 years ago			`from typing import List`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`from typing import Optional, Tuple, Union`

			`from .colors import FMT_PARSE, FMT_NONE`
[restructured] moved more types and exceptions to riscemu.types 3 years ago			`from riscemu.types.exceptions import ParseException, ASSERT_LEN`
Maor round of bugfixes and incremental improvements - fixed errors in TextIO and IOModule - moved to Int32 and UInt32 based arithmetic - added a lot of end-to-end and other tests 3 years ago			`from .helpers import parse_numeric_argument, align_addr, get_section_base_name`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`from .tokenizer import Token`
format black 2 years ago			`from .types import (`
			`Program,`
			`T_RelativeAddress,`
			`InstructionContext,`
			`Instruction,`
			`BinaryDataMemorySection,`
			`InstructionMemorySection,`
			`Int32,`
			`)`

			`INSTRUCTION_SECTION_NAMES = (".text", ".init", ".fini")`
started with base type overhaul 3 years ago			`"""`
			`A tuple containing all section names which contain executable code (instead of data)`

			`The first segment of each segment (first segment of ".text.main" is ".text") is checked`
			`against this list to determine the type of it.`
			`"""`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago

			`class MemorySectionType(Enum):`
			`Data = auto()`
			`Instructions = auto()`


			`class CurrentSection:`
			`name: str`
finished basic RISC-V parser 3 years ago			`data: Union[List[Instruction], bytearray]`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`type: MemorySectionType`
finished basic RISC-V parser 3 years ago			`base: int`

			`def __init__(self, name: str, type: MemorySectionType, base: int = 0):`
			`self.name = name`
			`self.type = type`
			`self.base = base`
			`if self.type == MemorySectionType.Data:`
			`self.data = bytearray()`
			`elif self.type == MemorySectionType.Instructions:`
			`self.data = list()`
			`else:`
			`raise ParseException("Unknown section type: {}".format(type))`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
			`def current_address(self) -> T_RelativeAddress:`
			`if self.type == MemorySectionType.Data:`
finished basic RISC-V parser 3 years ago			`return len(self.data) + self.base`
			`return len(self.data) * 4 + self.base`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
			`def __repr__(self):`
			`return "{}(name={},data={},type={})".format(`
format black 2 years ago			`self.__class__.__name__, self.name, self.data, self.type.name`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`)`


			`class ParseContext:`
			`section: Optional[CurrentSection]`
			`context: InstructionContext`
			`program: Program`

			`def __init__(self, name: str):`
			`self.program = Program(name)`
			`self.context = self.program.context`
			`self.section = None`

			`def finalize(self) -> Program:`
finished basic RISC-V parser 3 years ago			`self._finalize_section()`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`return self.program`

finished basic RISC-V parser 3 years ago			`def _finalize_section(self):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`if self.section is None:`
			`return`
Parser: fixed error when labels where used outside of sections 3 years ago
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`if self.section.type == MemorySectionType.Data:`
started with base type overhaul 3 years ago			`section = BinaryDataMemorySection(`
format black 2 years ago			`self.section.data,`
			`self.section.name,`
			`self.context,`
			`self.program.name,`
			`self.section.base,`
started with base type overhaul 3 years ago			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`self.program.add_section(section)`
			`elif self.section.type == MemorySectionType.Instructions:`
started with base type overhaul 3 years ago			`section = InstructionMemorySection(`
format black 2 years ago			`self.section.data,`
			`self.section.name,`
			`self.context,`
			`self.program.name,`
			`self.section.base,`
started with base type overhaul 3 years ago			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`self.program.add_section(section)`
Parser: fixed error when labels where used outside of sections 3 years ago
finished basic RISC-V parser 3 years ago			`self.section = None`

started with base type overhaul 3 years ago			`def new_section(self, name: str, type: MemorySectionType, alignment: int = 4):`
Parser: fixed error when labels where used outside of sections 3 years ago			`base = align_addr(self.current_address(), alignment)`

finished basic RISC-V parser 3 years ago			`self._finalize_section()`
			`self.section = CurrentSection(name, type, base)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
format black 2 years ago			`def add_label(`
			`self, name: str, value: int, is_global: bool = False, is_relative: bool = False`
			`):`
added interactive mode, fixed some bugs 3 years ago			`self.context.labels[name] = value`
			`if is_global:`
			`self.program.global_labels.add(name)`
			`if is_relative:`
			`self.program.relative_labels.add(name)`

Parser: fixed error when labels where used outside of sections 3 years ago			`def current_address(self):`
			`if self.section:`
			`return self.section.current_address()`
			`return self.program.base if self.program.base is not None else 0`

[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`def __repr__(self):`
			`return "{}(\n\tsetion={},\n\tprogram={}\n)".format(`
			`self.__class__.__name__, self.section, self.program`
			`)`


			`def ASSERT_IN_SECTION_TYPE(context: ParseContext, type: MemorySectionType):`
			`if context.section is None:`
format black 2 years ago			`raise ParseException(`
			`"Error, expected to be in {} section, but no section is present...".format(`
			`type.name`
			`)`
			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`if context.section.type != type:`
			`raise ParseException(`
format black 2 years ago			`"Error, expected to be in {} section, but currently in {}...".format(`
			`type.name, context.section`
			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`)`


			`class AssemblerDirectives:`
			`"""`
			`This class represents a collection of all assembler directives as documented by`
			`https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md#pseudo-ops`

			`All class methods prefixed with op_ are directly used as assembler directives.`
			`"""`

			`@classmethod`
			`def op_align(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 1)`
			`ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)`
			`align_to = parse_numeric_argument(args[0])`
Parser: fixed error when labels where used outside of sections 3 years ago			`current_mod = context.current_address() % align_to`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`if current_mod == 0:`
			`return`
			`context.section.data += bytearray(align_to - current_mod)`

			`@classmethod`
			`def op_section(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 1)`
			`if get_section_base_name(args[0]) in INSTRUCTION_SECTION_NAMES:`
finished basic RISC-V parser 3 years ago			`context.new_section(args[0], MemorySectionType.Instructions)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`else:`
finished basic RISC-V parser 3 years ago			`context.new_section(args[0], MemorySectionType.Data)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
			`@classmethod`
			`def op_globl(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 1)`
			`context.program.global_labels.add(args[0])`

finished basic RISC-V parser 3 years ago			`@classmethod`
			`def op_global(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`cls.op_globl(token, args, context)`

[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`@classmethod`
			`def op_equ(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 2)`
			`name = args[0]`
			`value = parse_numeric_argument(args[1])`
			`context.context.labels[name] = value`

finished basic RISC-V parser 3 years ago			`@classmethod`
			`def op_space(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 1)`
			`ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)`

			`size = parse_numeric_argument(args[0])`
			`cls.add_bytes(size, None, context)`

[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`@classmethod`
			`def op_zero(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`ASSERT_LEN(args, 1)`
			`ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)`
			`size = parse_numeric_argument(args[0])`
			`cls.add_bytes(size, bytearray(size), context)`

			`@classmethod`
format black 2 years ago			`def add_bytes(`
			`cls, size: int, content: Union[None, int, bytearray], context: ParseContext`
			`):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)`

			`if content is None:`
			`content = bytearray(size)`
various small bugfixes 3 years ago			`if isinstance(content, int):`
fixes #10 - fixed how preconfigured memory is handled 3 years ago			`content = Int32(content).to_bytes(size)`
various small bugfixes 3 years ago
finished basic RISC-V parser 3 years ago			`context.section.data += content`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
			`@classmethod`
			`def add_text(cls, text: str, context: ParseContext, zero_terminate: bool = True):`
fixed ascii escape sequences and section address calculation 3 years ago			`# replace '\t' and '\n' escape sequences`
format black 2 years ago			`text = text.replace("\\n", "\n").replace("\\t", "\t")`
fixed ascii escape sequences and section address calculation 3 years ago
format black 2 years ago			`encoded_bytes = bytearray(text.encode("ascii"))`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`if zero_terminate:`
			`encoded_bytes += bytearray(1)`
			`cls.add_bytes(len(encoded_bytes), encoded_bytes, context)`

			`@classmethod`
			`def handle_instruction(cls, token: Token, args: Tuple[str], context: ParseContext):`
			`op = token.value[1:]`
format black 2 years ago			`if hasattr(cls, "op_" + op):`
			`getattr(cls, "op_" + op)(token, args, context)`
			`elif op in ("text", "data", "rodata", "bss", "sbss"):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`cls.op_section(token, (token.value,), context)`
format black 2 years ago			`elif op in ("string", "asciiz", "asciz", "ascii"):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`ASSERT_LEN(args, 1)`
format black 2 years ago			`cls.add_text(args[0], context, op == "ascii")`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`elif op in DATA_OP_SIZES:`
			`size = DATA_OP_SIZES[op]`
			`for arg in args:`
			`cls.add_bytes(size, parse_numeric_argument(arg), context)`
			`else:`
format black 2 years ago			`print(`
			`FMT_PARSE`
			`+ "Unknown assembler directive: {} {} in {}".format(`
			`token, args, context`
			`)`
			`+ FMT_NONE`
			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago

			`DATA_OP_SIZES = {`
format black 2 years ago			`"byte": 1,`
			`"2byte": 2,`
			`"half": 2,`
			`"short": 2,`
			`"4byte": 4,`
			`"word": 4,`
			`"long": 4,`
			`"8byte": 8,`
			`"dword": 8,`
			`"quad": 8,`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`}`