riscemu/riscemu/ExecutableParser.py

"""
RiscEmu (c) 2021 Anton Lydike

SPDX-License-Identifier: MIT

This file holds the parser that parses the tokenizer output.
"""

from .helpers import parse_numeric_argument, int_to_bytes
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import *

from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken

from typing import Dict, Tuple, List, Optional


class ExecutableParser:
    """
    Parses output from the RiscVTokenizer

    The token stream is walked once. Instructions and data are collected into the named memory
    sections 'text', 'data' and 'rodata', symbols and globals are recorded along the way, and
    the result is bundled into an Executable by parse().
    """
    tokenizer: 'RiscVTokenizer'

    def __init__(self, tokenizer: 'RiscVTokenizer'):
        """
        :param tokenizer: the tokenizer whose tokens (and name) are consumed by parse()
        """
        # not written to by any method of this class; kept because it is a public attribute
        self.instructions: List[RiscVInstructionToken] = list()
        # symbol name -> (section name, position in that section), or ('_static_', value) for .set
        self.symbols: Dict[str, Tuple[str, int]] = dict()
        # section name -> section object, created lazily by _set_sec
        self.sections: Dict[str, MemorySection] = dict()
        self.tokenizer = tokenizer
        # name of the section new content is appended to; None until a section is selected
        self.active_section: Optional[str] = None
        # set to True once an instruction had to open the text section on its own
        self.implicit_sections = False
        # names passed to .global, in order of appearance
        self.globals: List[str] = list()

    def parse(self) -> Executable:
        """
        parse tokenizer output into an executable
        :return: the parsed executable
        :raise ParseException: Raises a ParseException when invalid input is read
        """
        for token in self.tokenizer.tokens:
            if isinstance(token, RiscVInstructionToken):
                self.parse_instruction(token)
            elif isinstance(token, RiscVSymbolToken):
                self.handle_symbol(token)
            elif isinstance(token, RiscVPseudoOpToken):
                self.handle_pseudo_op(token)
            # tokens of any other type are skipped
        return self._get_execuable()

    def _get_execuable(self) -> Executable:
        """
        Build the Executable from everything collected so far.

        The entry point is the '_start' symbol if present, otherwise 'main', otherwise
        position 0 of the text section.
        :return: the assembled executable
        """
        start_ptr = ('text', 0)
        for entry in ('_start', 'main'):
            if entry in self.symbols:
                start_ptr = self.symbols[entry]
                break
        return Executable(start_ptr, self.sections, self.symbols, self.globals, self.tokenizer.name)

    def parse_instruction(self, ins: 'RiscVInstructionToken') -> None:
        """
        parses an Instruction token
        :param ins: the instruction token
        """
        if self.active_section is None:
            # an instruction before any section directive implicitly opens the text section
            self.op_text()
            self.implicit_sections = True

        ASSERT_EQ(self.active_section, 'text')
        sec = self._curr_sec()
        if isinstance(sec, InstructionMemorySection):
            sec.add_insn(ins)
        else:
            # op_text always creates an InstructionMemorySection, so this is a safety net only
            raise ParseException("SHOULD NOT BE REACHED")

    def handle_symbol(self, token: 'RiscVSymbolToken'):
        """
        Handle a symbol token (such as 'main:')

        The symbol must not be defined yet and a section must be active. It is recorded as
        (active section name, current size of that section).
        :param token: the symbol token
        """
        ASSERT_NOT_IN(token.name, self.symbols)
        ASSERT_NOT_NULL(self.active_section)
        sec_pos = self._curr_sec().size
        self.symbols[token.name] = (self.active_section, sec_pos)

    def handle_pseudo_op(self, op: 'RiscVPseudoOpToken'):
        """
        Handle a pseudo op token (such as '.word 0xffaabbcc')

        Dispatches to the method named 'op_' + op.name.
        :param op: the pseudo-op token
        :raise ParseException: if no handler for the pseudo op exists
        """
        handler = getattr(self, 'op_' + op.name, None)
        if handler is None:
            raise ParseException("Unknown pseudo op: {}".format(op), (op,))
        handler(op)

    ## Pseudo op implementations:
    def op_section(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .section token

        Only '.data', '.rodata' and '.text' are accepted as section names.
        :param op: The token
        """
        ASSERT_LEN(op.args, 1)
        # Validate the complete argument including its leading dot. Blindly dropping the first
        # character would also let names such as 'xtext' pass as 'text'.
        ASSERT_IN(op.args[0], ('.data', '.rodata', '.text'))
        getattr(self, 'op_' + op.args[0][1:])(op)

    def op_text(self, op: Optional['RiscVPseudoOpToken'] = None):
        """
        handles a .text token. Selects the read-only, executable text section
        :param op: The token (unused, may be omitted)
        """
        self._set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection)

    def op_data(self, op: Optional['RiscVPseudoOpToken'] = None):
        """
        handles a .data token. Selects the writable, non-executable data section
        :param op: The token (unused, may be omitted)
        """
        self._set_sec('data', MemoryFlags(read_only=False, executable=False))

    def op_rodata(self, op: Optional['RiscVPseudoOpToken'] = None):
        """
        handles a .rodata token. Selects the read-only, non-executable rodata section
        :param op: The token (unused, may be omitted)
        """
        self._set_sec('rodata', MemoryFlags(read_only=True, executable=False))

    def op_space(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .space token. Inserts empty space into the current (data or rodata) section
        :param op: The token
        :raise ParseException: if the requested size is negative
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        size = parse_numeric_argument(op.args[0])
        if size < 0:
            # bytearray() would otherwise fail with a bare ValueError
            raise ParseException("Invalid size for .space: {}".format(size), (op,))
        self._curr_sec().add(bytearray(size))

    def op_ascii(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .ascii token. Inserts ascii encoded text into the current data section
        :param op: The token
        :raise ParseException: if the string literal cannot be decoded as ascii
        """
        self._curr_sec().add(self._decode_string_arg(op))

    def op_asciiz(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .asciiz token. Inserts null-terminated ascii encoded text into the current data section
        :param op: The token
        :raise ParseException: if the string literal cannot be decoded as ascii
        """
        data = self._decode_string_arg(op)
        data.append(0)  # terminating null byte
        self._curr_sec().add(data)

    def _decode_string_arg(self, op: 'RiscVPseudoOpToken') -> bytearray:
        """
        Shared argument handling for .ascii and .asciiz

        Checks that a data or rodata section is active and that exactly one argument was given,
        then turns that argument into its ascii bytes with backslash escapes resolved.
        :param op: The token
        :return: the decoded bytes, without a terminator
        :raise ParseException: if the text is not ascii or contains an invalid escape sequence
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        try:
            # drop the first and last character (presumably the enclosing quotes), then
            # resolve escape sequences such as \n
            text = op.args[0][1:-1].encode('ascii').decode('unicode_escape')
            return bytearray(text, 'ascii')
        except UnicodeError as err:
            # report bad literals as parse errors instead of leaking codec exceptions
            raise ParseException("Invalid string literal in {}: {}".format(op, err), (op,)) from err

    def op_global(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .global token. Marks the named symbol as global
        :param op: The token
        """
        ASSERT_LEN(op.args, 1)
        name = op.args[0]
        self.globals.append(name)

    def op_set(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .set name, val token. Sets the symbol name to val

        The symbol is stored under the pseudo section '_static_'. An existing symbol of the
        same name is overwritten.
        :param op: The token
        """
        ASSERT_LEN(op.args, 2)
        name = op.args[0]
        val = parse_numeric_argument(op.args[1])
        self.symbols[name] = ('_static_', val)

    def op_align(self, op: 'RiscVPseudoOpToken'):
        """
        handles an align token. Currently a nop (just not implemented fully yet, as linker handles most alignment tasks)
        :param op: The token
        """
        pass

    def op_word(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .word token. Appends the numeric argument as 4 bytes to the current section
        :param op: The token
        """
        ASSERT_LEN(op.args, 1)
        # without an active section _curr_sec() would fail with a bare KeyError
        # NOTE(review): unlike .space/.ascii this is not limited to data/rodata - confirm
        # whether .word inside the text section is meant to be supported
        ASSERT_NOT_NULL(self.active_section)
        val = parse_numeric_argument(op.args[0])
        self._curr_sec().add(int_to_bytes(val, 4))

    ## Section handler code
    def _set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection):
        """
        Make the named section the active one, creating it on first use
        :param name: the section name
        :param flags: flags for the section, only used when the section is created
        :param cls: section class to instantiate when the section is created
        """
        if name not in self.sections:
            self.sections[name] = cls(name, flags)
        self.active_section = name

    def _curr_sec(self):
        """
        :return: the section object of the currently active section
        """
        return self.sections[self.active_section]
added lots more documentation and copyright notices 4 years ago			`"""`
			`RiscEmu (c) 2021 Anton Lydike`

fixed errorneous license text in headers 4 years ago			`SPDX-License-Identifier: MIT`
added lots more documentation and copyright notices 4 years ago
			`This file holds the parser that parses the tokenizer output.`
			`"""`

added .word pseudo op 4 years ago			`from .helpers import parse_numeric_argument, int_to_bytes`
parsing of tokenized asm into MemorySections works 4 years ago			`from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags`
			`from .Exceptions import *`

parsing and simple running works somewhat 4 years ago			`from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`from typing import Dict, Tuple, List, Optional`
parsing of tokenized asm into MemorySections works 4 years ago

			`class ExecutableParser:`
added lots more documentation and copyright notices 4 years ago			`"""`
			`Parses output form the RiscVTokenizer`
			`"""`
parsing and simple running works somewhat 4 years ago			`tokenizer: 'RiscVTokenizer'`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def __init__(self, tokenizer: 'RiscVTokenizer'):`
parsing of tokenized asm into MemorySections works 4 years ago			`self.instructions: List[RiscVInstructionToken] = list()`
			`self.symbols: Dict[str, Tuple[str, int]] = dict()`
			`self.sections: Dict[str, MemorySection] = dict()`
			`self.tokenizer = tokenizer`
parsing and simple running works somewhat 4 years ago			`self.active_section: Optional[str] = None`
parsing of tokenized asm into MemorySections works 4 years ago			`self.implicit_sections = False`
added pseudo-op .set name val, and .global symb 4 years ago			`self.globals: List[str] = list()`
parsing of tokenized asm into MemorySections works 4 years ago
added lots more documentation and copyright notices 4 years ago			`def parse(self) -> Executable:`
			`"""`
			`parse tokenizer output into an executable`
			`:return: the parsed executable`
			`:raise ParseException: Raises a ParseException when invalid input is read`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`for token in self.tokenizer.tokens:`
			`if isinstance(token, RiscVInstructionToken):`
			`self.parse_instruction(token)`
			`elif isinstance(token, RiscVSymbolToken):`
			`self.handle_symbol(token)`
			`elif isinstance(token, RiscVPseudoOpToken):`
			`self.handle_pseudo_op(token)`
added lots more documentation and copyright notices 4 years ago			`return self._get_execuable()`
parsing of tokenized asm into MemorySections works 4 years ago
added lots more documentation and copyright notices 4 years ago			`def _get_execuable(self) -> Executable:`
parsing of tokenized asm into MemorySections works 4 years ago			`start_ptr = ('text', 0)`
			`if '_start' in self.symbols:`
			`start_ptr = self.symbols['_start']`
			`elif 'main' in self.symbols:`
			`start_ptr = self.symbols['main']`
removed stack pref pseudo-op in preperation for real stack impl 4 years ago			`return Executable(start_ptr, self.sections, self.symbols, self.globals, self.tokenizer.name)`
parsing of tokenized asm into MemorySections works 4 years ago
added lots more documentation and copyright notices 4 years ago			`def parse_instruction(self, ins: 'RiscVInstructionToken') -> None:`
			`"""`
			`parses an Instruction token`
			`:param ins: the instruction token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`if self.active_section is None:`
			`self.op_text()`
			`self.implicit_sections = True`

			`ASSERT_EQ(self.active_section, 'text')`
added lots more documentation and copyright notices 4 years ago			`sec = self._curr_sec()`
parsing of tokenized asm into MemorySections works 4 years ago			`if isinstance(sec, InstructionMemorySection):`
			`sec.add_insn(ins)`
			`else:`
			`raise ParseException("SHOULD NOT BE REACHED")`

parsing and simple running works somewhat 4 years ago			`def handle_symbol(self, token: 'RiscVSymbolToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`Handle a symbol token (such as 'main:')`
			`:param token: the symbol token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`ASSERT_NOT_IN(token.name, self.symbols)`
Minor fixes like imports and edge-case handling 4 years ago			`ASSERT_NOT_NULL(self.active_section)`
added lots more documentation and copyright notices 4 years ago			`sec_pos = self._curr_sec().size`
parsing of tokenized asm into MemorySections works 4 years ago			`self.symbols[token.name] = (self.active_section, sec_pos)`

parsing and simple running works somewhat 4 years ago			`def handle_pseudo_op(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`Handle a pseudo op token (such as '.word 0xffaabbcc')`
			`:param op: the peseudo-op token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`name = 'op_' + op.name`
			`if hasattr(self, name):`
			`getattr(self, name)(op)`
			`else:`
			`raise ParseException("Unknown pseudo op: {}".format(op), (op,))`

			`## Pseudo op implementations:`
parsing and simple running works somewhat 4 years ago			`def op_section(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .section token`
			`:param op: The token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`ASSERT_LEN(op.args, 1)`
			`name = op.args[0][1:]`
			`ASSERT_IN(name, ('data', 'rodata', 'text'))`
			`getattr(self, 'op_' + name)(op)`

parsing and simple running works somewhat 4 years ago			`def op_text(self, op: 'RiscVPseudoOpToken' = None):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .text token`
			`:param op: The token`
			`"""`
			`self._set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection)`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def op_data(self, op: 'RiscVPseudoOpToken' = None):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .data token`
			`:param op: The token`
			`"""`
			`self._set_sec('data', MemoryFlags(read_only=False, executable=False))`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def op_rodata(self, op: 'RiscVPseudoOpToken' = None):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .rodata token`
			`:param op: The token`
			`"""`
			`self._set_sec('rodata', MemoryFlags(read_only=True, executable=False))`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def op_space(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .space token. Inserts empty space into the current (data or rodata) section`
			`:param op: The token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`ASSERT_IN(self.active_section, ('data', 'rodata'))`
			`ASSERT_LEN(op.args, 1)`
			`size = parse_numeric_argument(op.args[0])`
added lots more documentation and copyright notices 4 years ago			`self._curr_sec().add(bytearray(size))`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def op_ascii(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .ascii token. Inserts ascii encoded text into the currrent data section`
			`:param op: The token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`ASSERT_IN(self.active_section, ('data', 'rodata'))`
			`ASSERT_LEN(op.args, 1)`
added escape character deocoding for ascii and asciiz pseudo-ops 4 years ago			`str = op.args[0][1:-1].encode('ascii').decode('unicode_escape')`
added lots more documentation and copyright notices 4 years ago			`self._curr_sec().add(bytearray(str, 'ascii'))`
parsing of tokenized asm into MemorySections works 4 years ago
parsing and simple running works somewhat 4 years ago			`def op_asciiz(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .ascii token. Inserts nullterminated ascii encoded text into the currrent data section`
			`:param op: The token`
			`"""`
parsing of tokenized asm into MemorySections works 4 years ago			`ASSERT_IN(self.active_section, ('data', 'rodata'))`
			`ASSERT_LEN(op.args, 1)`
added escape character deocoding for ascii and asciiz pseudo-ops 4 years ago			`str = op.args[0][1:-1].encode('ascii').decode('unicode_escape')`
added lots more documentation and copyright notices 4 years ago			`self._curr_sec().add(bytearray(str + '\0', 'ascii'))`
parsing of tokenized asm into MemorySections works 4 years ago
added pseudo-op .set name val, and .global symb 4 years ago			`def op_global(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .global token. Marks the token as global`
			`:param op: The token`
			`"""`
added pseudo-op .set name val, and .global symb 4 years ago			`ASSERT_LEN(op.args, 1)`
added global symbol support! 4 years ago			`name = op.args[0]`
added pseudo-op .set name val, and .global symb 4 years ago			`self.globals.append(name)`

			`def op_set(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles a .set name, val token. Sets the symbol name to val`
			`:param op: The token`
			`"""`
added pseudo-op .set name val, and .global symb 4 years ago			`ASSERT_LEN(op.args, 2)`
			`name = op.args[0]`
			`val = parse_numeric_argument(op.args[1])`
			`self.symbols[name] = ('_static_', val)`

added .space pseudo op, currently does nothing 4 years ago			`def op_align(self, op: 'RiscVPseudoOpToken'):`
added lots more documentation and copyright notices 4 years ago			`"""`
			`handles an align token. Currently a nop (just not implemented fully yet, as linker handles most alignement tasks)`
			`:param op: The token`
			`"""`
added .space pseudo op, currently does nothing 4 years ago			`pass`

added .word pseudo op 4 years ago			`def op_word(self, op: 'RiscVPseudoOpToken'):`
			`ASSERT_LEN(op.args, 1)`
			`val = parse_numeric_argument(op.args[0])`
added lots more documentation and copyright notices 4 years ago			`self._curr_sec().add(int_to_bytes(val, 4))`
added .word pseudo op 4 years ago
parsing of tokenized asm into MemorySections works 4 years ago			`## Section handler code`
added lots more documentation and copyright notices 4 years ago			`def _set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection):`
parsing of tokenized asm into MemorySections works 4 years ago			`if name not in self.sections:`
			`self.sections[name] = cls(name, flags)`
			`self.active_section = name`

added lots more documentation and copyright notices 4 years ago			`def _curr_sec(self):`
parsing of tokenized asm into MemorySections works 4 years ago			`return self.sections[self.active_section]`