[wip] almost done with the rework of the parser and internal data structure representation of programs

This commit is contained in:
Anton Lydike 2021-12-18 00:25:39 +01:00
parent 84562de98f
commit dc4dca6fea
25 changed files with 604 additions and 647 deletions

View File

@ -9,12 +9,12 @@ on them.
import sys
from typing import Tuple, List, Dict, Callable, Type
from .Executable import MemoryFlags
from .Syscall import SyscallInterface, get_syscall_symbols
from .Exceptions import RiscemuBaseException, LaunchDebuggerException
from .base_types import MemoryFlags
from .syscall import SyscallInterface, get_syscall_symbols
from .exceptions import RiscemuBaseException, LaunchDebuggerException
from .MMU import MMU
from .Config import RunConfig
from .Registers import Registers
from .config import RunConfig
from .registers import Registers
from .debug import launch_debug_session
from .colors import FMT_CPU, FMT_NONE, FMT_ERROR
@ -23,7 +23,7 @@ import riscemu
import typing
if typing.TYPE_CHECKING:
from . import Executable, LoadedExecutable, LoadedInstruction
from . import base_types, LoadedExecutable, LoadedInstruction
from .instructions.InstructionSet import InstructionSet
@ -78,7 +78,7 @@ class CPU:
"""
return RiscVTokenizer(tokenizer_input, self.all_instructions())
def load(self, e: riscemu.Executable):
def load(self, e: riscemu.base_types):
"""
Load an executable into Memory
"""

View File

@ -1,319 +0,0 @@
"""
RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
This file holds Executable and LoadedExecutable classes as well as loading and some linking code.
FIXME: refactor this code into muliple files
"""
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Union, Optional
from .Exceptions import *
from .helpers import *
from math import log
import typing
if typing.TYPE_CHECKING:
from .Tokenizer import RiscVInstructionToken
@dataclass(frozen=True)
class MemoryFlags:
    """
    Immutable pair of access flags attached to a memory section.
    """
    read_only: bool
    executable: bool

    def __repr__(self):
        # compact form such as MemoryFlags(ro,x) or MemoryFlags(rw,-)
        access = 'ro' if self.read_only else 'rw'
        execute = 'x' if self.executable else '-'
        return f"{self.__class__.__name__}({access},{execute})"
@dataclass
class MemorySection:
    """
    A named section of an executable, collected as a list of byte chunks.

    size is the total number of bytes added through add().
    """
    name: str
    flags: MemoryFlags
    size: int = 0
    content: List[bytearray] = field(default_factory=list)

    def add(self, data: bytearray):
        """
        Append a chunk of bytes to this section and grow size accordingly

        :param data: the bytes to append
        """
        self.content.append(data)
        self.size += len(data)

    def continuous_content(self, parent: 'LoadedExecutable'):
        """
        converts the content into one continuous bytearray

        :param parent: not used by this implementation
        :return: a newly allocated bytearray holding all chunks back to back
        """
        # Join into a fresh buffer. Extending self.content[0] with += would modify the
        # first chunk in place, so every further call would return duplicated data.
        return bytearray().join(self.content)
@dataclass
class InstructionMemorySection(MemorySection):
    """
    A memory section whose content is a list of instruction tokens.

    Every added instruction increases size by one.
    """
    content: List['RiscVInstructionToken'] = field(default_factory=list)

    def add_insn(self, insn: 'RiscVInstructionToken'):
        """
        Append an instruction token to this section

        :param insn: the token to append
        """
        self.size += 1
        self.content.append(insn)

    def continuous_content(self, parent: 'LoadedExecutable'):
        """
        Wrap every stored token in a LoadedInstruction bound to parent

        :param parent: the loaded executable handed to each LoadedInstruction
        :return: a list of LoadedInstruction objects in insertion order
        """
        loaded = []
        for token in self.content:
            loaded.append(LoadedInstruction(token.instruction, token.args, parent))
        return loaded
@dataclass()
class Executable:
    """
    Parsed representation of one assembly source.

    run_ptr and the values of symbols are (section name, offset) pairs.
    """
    run_ptr: Tuple[str, int]
    sections: Dict[str, MemorySection]
    symbols: Dict[str, Tuple[str, int]]
    exported_symbols: List[str]
    name: str

    def __repr__(self):
        section_names = " ".join(self.sections.keys())
        symbol_names = " ".join(self.symbols.keys())
        exported = ",".join(self.exported_symbols)
        return "{}(sections = {}, symbols = {}, run_ptr = {}, globals={})".format(
            self.__class__.__name__, section_names, symbol_names, self.run_ptr, exported
        )
### LOADING CODE
@dataclass(frozen=True)
class LoadedInstruction:
    """
    An instruction which is loaded into memory. It keeps a reference to the binary it belongs to
    so that symbol names used as arguments can be resolved.
    """
    name: str
    args: List[str]
    bin: 'LoadedExecutable'

    def _arg_at(self, num: int) -> str:
        """
        Return the raw argument at position num

        :raise ParseException: when the instruction has no argument at that position
        """
        if num >= len(self.args):
            raise ParseException("Instruction {} expected argument at {} (args: {})".format(self.name, num, self.args))
        return self.args[num]

    def get_imm(self, num: int):
        """
        parse and get immediate argument

        :param num: position of the argument
        :return: the symbol value if the argument names a known symbol, the parsed number otherwise
        """
        raw = self._arg_at(num)
        # symbols take precedence over numeric parsing
        if self.bin.has_symb(raw):
            return self.bin.lookup_symbol(raw)
        return parse_numeric_argument(raw)

    def get_imm_reg(self, num: int):
        """
        parse and get an argument imm(reg)

        :param num: position of the argument
        :return: a tuple (immediate value, register name)
        """
        raw = self._arg_at(num)
        ASSERT_IN("(", raw)
        # drop the last character (the closing parenthesis) and split at the opening one
        imm, reg = raw[:-1].split("(")
        if self.bin.has_symb(imm):
            value = self.bin.lookup_symbol(imm)
        else:
            value = parse_numeric_argument(imm)
        return value, reg

    def get_reg(self, num: int):
        """
        parse and get an register argument

        :param num: position of the argument
        :return: the argument string, unchanged
        """
        return self._arg_at(num)

    def __repr__(self):
        joined_args = ", ".join(self.args)
        return f"{self.name} {joined_args}"
@dataclass(frozen=True)
class LoadedMemorySection:
    """
    A section which is loaded into memory

    content is either a list of loaded instructions or a bytearray. All offsets taken by the
    methods below are relative to base.
    """
    name: str
    base: int
    size: int
    content: Union[List[LoadedInstruction], bytearray] = field(repr=False)
    flags: MemoryFlags
    owner: str

    def read(self, offset: int, size: int):
        """
        Read size items starting at offset

        :raise MemoryAccessException: when the range is not fully inside the section
        """
        if offset < 0:
            raise MemoryAccessException('Invalid offset {}'.format(offset), self.base + offset, size, 'read')
        if offset + size > self.size:
            raise MemoryAccessException('Outside section boundary of section {}'.format(self.name), self.base + offset,
                                        size, 'read')
        return self.content[offset: offset + size]

    def read_instruction(self, offset):
        """
        Read the single instruction stored at offset

        :raise MemoryAccessException: when the section is not executable or offset is out of range
        """
        if not self.flags.executable:
            raise MemoryAccessException('Section not executable!', self.base + offset, 1, 'read exec')
        if offset < 0:
            raise MemoryAccessException('Invalid offset {}'.format(offset), self.base + offset, 1, 'read exec')
        if offset >= self.size:
            raise MemoryAccessException('Outside section boundary of section {}'.format(self.name), self.base + offset,
                                        1, 'read exec')
        return self.content[offset]

    def write(self, offset, size, data):
        """
        Write the first size items of data into the section starting at offset

        :raise MemoryAccessException: when the section is read only or the range is not fully inside it
        """
        if self.flags.read_only:
            raise MemoryAccessException('Section not writeable {}'.format(self.name), self.base + offset, size, 'write')
        if offset < 0:
            raise MemoryAccessException('Invalid offset {}'.format(offset), self.base + offset, size, 'write')
        # Check the end of the range, not only its start: otherwise a write that begins inside the
        # section but runs past its end stores some bytes and then fails with an IndexError.
        if offset + size > self.size:
            raise MemoryAccessException('Outside section boundary of section {}'.format(self.name), self.base + offset,
                                        size, 'write')
        for i in range(size):
            self.content[offset + i] = data[i]

    def dump(self, at_addr=None, fmt='hex', max_rows=10, group=4, bytes_per_row=16, all=False):
        """
        Print the content of this section

        :param at_addr: absolute address to centre the output on and to highlight (default: section base, no highlight)
        :param fmt: format passed on to format_bytes
        :param max_rows: maximum number of rows (or instructions) to print
        :param group: byte group size passed on to format_bytes
        :param bytes_per_row: number of bytes printed per row for data sections
        :param all: print the whole section, ignoring max_rows
        """
        highlight = -1
        if at_addr is None:
            at_addr = self.base
        else:
            highlight = at_addr - self.base

        at_off = at_addr - self.base
        start = max(align_addr(at_off - ((max_rows * bytes_per_row) // 2), 8) - 8, 0)
        if all:
            end = self.size
            start = 0
        else:
            end = min(start + (max_rows * bytes_per_row), self.size)

        # pad addresses to the number of hex digits needed for the last printed address
        fmt_str = " 0x{:0" + str(ceil(log(self.base + end, 16))) + "X}: {}"

        if self.flags.executable:
            # this section holds instructions!
            start = 0 if all else max(at_off - (max_rows // 2), 0)
            end = self.size if all else min(self.size, start + max_rows)
            print(FMT_MEM + "{}, viewing {} instructions:".format(
                self, end - start
            ) + FMT_NONE)
            for i in range(start, end):
                if i == highlight:
                    ins = FMT_UNDERLINE + FMT_ORANGE + repr(self.content[i]) + FMT_NONE
                else:
                    ins = repr(self.content[i])
                print(fmt_str.format(self.base + i, ins))
        else:
            print(FMT_MEM + "{}, viewing {} bytes:".format(
                self, end - start
            ) + FMT_NONE)
            for i in range(0, end - start, bytes_per_row):
                data = self.content[start + i: min(start + i + bytes_per_row, end)]
                # upper bound is exclusive: an address equal to the start of the next row belongs to that row
                if start + i <= highlight < start + i + bytes_per_row:
                    # do highlight here!
                    hi_ind = (highlight - start - i) // group
                    print(fmt_str.format(self.base + start + i, format_bytes(data, fmt, group, highlight=hi_ind)))
                else:
                    print(fmt_str.format(self.base + start + i, format_bytes(data, fmt, group)))
        if end == self.size:
            print(FMT_MEM + "End of section!" + FMT_NONE)
        else:
            print(FMT_MEM + "More bytes ..." + FMT_NONE)

    def __repr__(self):
        return "{}[{}] at 0x{:08X} (size={}bytes, flags={}, owner={})".format(
            self.__class__.__name__,
            self.name,
            self.base,
            self.size,
            self.flags,
            self.owner
        )
class LoadedExecutable:
    """
    This represents an executable which is loaded into memory at address base_addr

    Sections are placed one after another starting at base_addr, and symbols are translated from
    (section, offset) pairs into absolute addresses. The symbol table is kept on this object and
    is not part of the emulated memory.
    """
    name: str
    base_addr: int
    sections_by_name: Dict[str, LoadedMemorySection]
    sections: List[LoadedMemorySection]
    symbols: Dict[str, int]
    run_ptr: int
    exported_symbols: Dict[str, int]
    global_symbol_table: Dict[str, int]

    def __init__(self, exe: Executable, base_addr: int, global_symbol_table: Dict[str, int]):
        """
        Load exe at base_addr

        :param exe: the parsed executable
        :param base_addr: address of the first section
        :param global_symbol_table: shared table consulted when a symbol is not defined locally
        """
        self.name = exe.name
        self.base_addr = base_addr
        self.sections = []
        self.sections_by_name = {}
        self.symbols = {}
        self.exported_symbols = {}
        self.global_symbol_table = global_symbol_table

        # lay out the sections back to back, aligning the start of each following one
        next_base = base_addr
        for sec in exe.sections.values():
            loaded_sec = LoadedMemorySection(
                sec.name, next_base, sec.size, sec.continuous_content(self), sec.flags, self.name
            )
            self.sections.append(loaded_sec)
            self.sections_by_name[loaded_sec.name] = loaded_sec
            next_base = align_addr(loaded_sec.size + next_base)

        # symbols in the pseudo section '_static_' already carry their final value
        for name, (sec_name, offset) in exe.symbols.items():
            if sec_name != '_static_':
                ASSERT_IN(sec_name, self.sections_by_name)
                offset = self.sections_by_name[sec_name].base + offset
            self.symbols[name] = offset

        for exported in exe.exported_symbols:
            self.exported_symbols[exported] = self.symbols[exported]

        self.size = next_base - base_addr

        # translate run_ptr from executable
        entry_section, entry_offset = exe.run_ptr
        self.run_ptr = self.sections_by_name[entry_section].base + entry_offset

    def lookup_symbol(self, name):
        """
        Resolve name, preferring local symbols over the global symbol table

        :raise LinkerException: when the symbol is found in neither table
        """
        for table in (self.symbols, self.global_symbol_table):
            if name in table:
                return table[name]
        raise LinkerException('Symbol {} not found!'.format(name), (self,))

    def __repr__(self):
        section_names = " ".join(self.sections_by_name.keys())
        return '{}[{}](base=0x{:08X}, size={}bytes, sections={}, run_ptr=0x{:08X})'.format(
            self.__class__.__name__, self.name, self.base_addr, self.size, section_names, self.run_ptr
        )

    def has_symb(self, arg):
        """
        Tell whether arg is a known local or global symbol
        """
        return any(arg in table for table in (self.symbols, self.global_symbol_table))

View File

@ -1,193 +0,0 @@
"""
RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
This file holds the parser that parses the tokenizer output.
"""
from .helpers import parse_numeric_argument, int_to_bytes
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import *
from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE
from typing import Dict, Tuple, List, Optional
class ExecutableParser:
    """
    Parses output from the RiscVTokenizer
    """
    tokenizer: 'RiscVTokenizer'

    # Maps token class names to the method handling them. The token classes are not imported in
    # this module, so dispatch happens on class names; isinstance() with a string as second
    # argument raises a TypeError.
    _TOKEN_HANDLERS = {
        'RiscVInstructionToken': 'parse_instruction',
        'RiscVSymbolToken': 'handle_symbol',
        'RiscVPseudoOpToken': 'handle_pseudo_op',
    }

    def __init__(self, tokenizer: 'RiscVTokenizer'):
        self.instructions: List['RiscVInstructionToken'] = list()
        self.symbols: Dict[str, Tuple[str, int]] = dict()
        self.sections: Dict[str, MemorySection] = dict()
        self.tokenizer = tokenizer
        self.active_section: Optional[str] = None
        self.implicit_sections = False
        self.globals: List[str] = list()

    def parse(self) -> Executable:
        """
        parse tokenizer output into an executable

        Tokens of any other kind are ignored.

        :return: the parsed executable
        :raise ParseException: Raises a ParseException when invalid input is read
        """
        for token in self.tokenizer.tokens:
            # walk the MRO so that subclasses of the known token classes are handled as well
            for klass in type(token).__mro__:
                handler_name = self._TOKEN_HANDLERS.get(klass.__name__)
                if handler_name is not None:
                    getattr(self, handler_name)(token)
                    break
        return self._get_execuable()

    def _get_execuable(self) -> Executable:
        """
        Build the Executable from the collected state

        The entry point is '_start' if defined, else 'main', else the beginning of the text section.
        """
        start_ptr = ('text', 0)
        if '_start' in self.symbols:
            start_ptr = self.symbols['_start']
        elif 'main' in self.symbols:
            start_ptr = self.symbols['main']
        return Executable(start_ptr, self.sections, self.symbols, self.globals, self.tokenizer.name)

    def parse_instruction(self, ins: 'RiscVInstructionToken') -> None:
        """
        parses an Instruction token

        :param ins: the instruction token
        """
        if self.active_section is None:
            # instructions before any section directive implicitly open the text section
            self.op_text()
            self.implicit_sections = True

        ASSERT_EQ(self.active_section, 'text')
        sec = self._curr_sec()
        if isinstance(sec, InstructionMemorySection):
            sec.add_insn(ins)
        else:
            raise ParseException("SHOULD NOT BE REACHED")

    def handle_symbol(self, token: 'RiscVSymbolToken'):
        """
        Handle a symbol token (such as 'main:')

        The symbol is recorded at the current end of the active section.

        :param token: the symbol token
        """
        ASSERT_NOT_IN(token.name, self.symbols)
        ASSERT_NOT_NULL(self.active_section)
        sec_pos = self._curr_sec().size
        self.symbols[token.name] = (self.active_section, sec_pos)

    def handle_pseudo_op(self, op: 'RiscVPseudoOpToken'):
        """
        Handle a pseudo op token (such as '.word 0xffaabbcc')

        Dispatches to the method named 'op_' + op.name.

        :param op: the pseudo-op token
        :raise ParseException: when no such method exists
        """
        name = 'op_' + op.name
        if hasattr(self, name):
            getattr(self, name)(op)
        else:
            raise ParseException("Unknown pseudo op: {}".format(op), (op,))

    ## Pseudo op implementations:
    def op_section(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .section token

        :param op: The token
        """
        ASSERT_LEN(op.args, 1)
        # drop the first character of the argument (e.g. '.data' -> 'data')
        name = op.args[0][1:]
        ASSERT_IN(name, ('data', 'rodata', 'text'))
        getattr(self, 'op_' + name)(op)

    def op_text(self, op: 'RiscVPseudoOpToken' = None):
        """
        handles a .text token

        :param op: The token
        """
        self._set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection)

    def op_data(self, op: 'RiscVPseudoOpToken' = None):
        """
        handles a .data token

        :param op: The token
        """
        self._set_sec('data', MemoryFlags(read_only=False, executable=False))

    def op_rodata(self, op: 'RiscVPseudoOpToken' = None):
        """
        handles a .rodata token

        :param op: The token
        """
        self._set_sec('rodata', MemoryFlags(read_only=True, executable=False))

    def op_space(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .space token. Inserts empty space into the current (data or rodata) section

        :param op: The token
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        size = parse_numeric_argument(op.args[0])
        self._curr_sec().add(bytearray(size))

    def op_ascii(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .ascii token. Inserts ascii encoded text into the current data section

        :param op: The token
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        # strip the surrounding quote characters, then resolve escape sequences such as \n
        text = op.args[0][1:-1].encode('ascii').decode('unicode_escape')
        self._curr_sec().add(bytearray(text, 'ascii'))

    def op_asciiz(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .asciiz token. Inserts null-terminated ascii encoded text into the current data section

        :param op: The token
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        text = op.args[0][1:-1].encode('ascii').decode('unicode_escape')
        self._curr_sec().add(bytearray(text + '\0', 'ascii'))

    def op_global(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .global token. Marks the named symbol as global

        :param op: The token
        """
        ASSERT_LEN(op.args, 1)
        name = op.args[0]
        self.globals.append(name)

    def op_set(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .set name, val token. Sets the symbol name to val

        :param op: The token
        """
        ASSERT_LEN(op.args, 2)
        name = op.args[0]
        val = parse_numeric_argument(op.args[1])
        # '_static_' marks a symbol whose value is final and not relative to any section
        self.symbols[name] = ('_static_', val)

    def op_align(self, op: 'RiscVPseudoOpToken'):
        """
        handles an align token. Currently a no-op (not implemented yet)

        :param op: The token
        """
        pass

    def op_word(self, op: 'RiscVPseudoOpToken'):
        """
        handles a .word token. Inserts a 4 byte value into the current (data or rodata) section

        :param op: The token
        """
        # same guard as the other data directives: without it a .word outside of a data section
        # fails with a KeyError or appends bytes to the instruction list
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        val = parse_numeric_argument(op.args[0])
        self._curr_sec().add(int_to_bytes(val, 4))

    ## Section handler code
    def _set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection):
        """
        Make name the active section, creating it with flags on first use

        :param cls: the section class to instantiate when the section does not exist yet
        """
        if name not in self.sections:
            self.sections[name] = cls(name, flags)
        self.active_section = name

    def _curr_sec(self):
        """
        Return the section object of the active section
        """
        return self.sections[self.active_section]

View File

@ -4,10 +4,10 @@ RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
"""
from .Config import RunConfig
from .Executable import Executable, LoadedExecutable, LoadedMemorySection, LoadedInstruction, MemoryFlags
from .base_types import InstructionContext, Instruction, MemorySection, MemoryFlags, T_RelativeAddress, T_AbsoluteAddress, \
Program
from .helpers import align_addr, int_from_bytes
from .Exceptions import OutOfMemoryException, InvalidAllocationException
from .exceptions import OutOfMemoryException, InvalidAllocationException
from .colors import *
from typing import Dict, List, Tuple, Optional
@ -27,19 +27,14 @@ class MMU:
No single allocation can be bigger than 64 MB
"""
sections: List[LoadedMemorySection]
sections: List[MemorySection]
"""
A list of all loaded memory sections
"""
binaries: List[LoadedExecutable]
programs: List[Program]
"""
A list of all loaded executables
"""
last_bin: Optional[LoadedExecutable] = None
"""
The last loaded executable (the next executable is inserted directly after this one)
A list of all loaded programs
"""
global_symbols: Dict[str, int]
@ -47,79 +42,14 @@ class MMU:
The global symbol table
"""
last_ins_sec: Optional[LoadedMemorySection]
def __init__(self, conf: RunConfig):
def __init__(self):
"""
Create a new MMU, respecting the active RunConfiguration
:param conf: The config to respect
Create a new MMU
"""
self.sections: List[LoadedMemorySection] = list()
self.binaries: List[LoadedExecutable] = list()
self.first_free_addr: int = 0x100
self.conf: RunConfig = conf
self.global_symbols: Dict[str, int] = dict()
self.last_ins_sec = None
self.sections = list()
self.global_symbols = dict()
def load_bin(self, exe: Executable) -> LoadedExecutable:
"""
Load an executable into memory
:param exe: the executable to load
:return: A LoadedExecutable
:raises OutOfMemoryException: When all memory is used
"""
# align to 8 byte word
addr = align_addr(self.first_free_addr)
loaded_bin = LoadedExecutable(exe, addr, self.global_symbols)
if loaded_bin.size + addr > self.max_size:
raise OutOfMemoryException('load of executable')
self.binaries.append(loaded_bin)
self.first_free_addr = loaded_bin.base_addr + loaded_bin.size
# read sections into sec dict
for sec in loaded_bin.sections:
self.sections.append(sec)
self.global_symbols.update(loaded_bin.exported_symbols)
print(FMT_MEM + "[MMU] Successfully loaded{}: {}".format(FMT_NONE, loaded_bin))
return loaded_bin
def allocate_section(self, name: str, req_size: int, flag: MemoryFlags):
"""
Used to allocate a memory region (data only). Use `load_bin` if you want to load a binary, this is used for
stack and maybe malloc in the future.
:param name: Name of the section to allocate
:param req_size: The requested size
:param flag: The flags protecting this memory section
:return: The LoadedMemorySection
"""
if flag.executable:
raise InvalidAllocationException('cannot allocate executable section', name, req_size, flag)
if req_size < 0:
raise InvalidAllocationException('Invalid size request', name, req_size, flag)
if req_size > self.max_alloc_size:
raise InvalidAllocationException('Cannot allocate more than {} bytes at a time'.format(self.max_alloc_size),
name, req_size, flag)
base = align_addr(self.first_free_addr)
size = align_addr(req_size)
sec = LoadedMemorySection(name, base, size, bytearray(size), flag, "<runtime>")
self.sections.append(sec)
self.first_free_addr = base + size
return sec
def get_sec_containing(self, addr: int) -> Optional[LoadedMemorySection]:
def get_sec_containing(self, addr: T_AbsoluteAddress) -> Optional[MemorySection]:
"""
Returns the section that contains the address addr
@ -131,29 +61,25 @@ class MMU:
return sec
return None
def get_bin_containing(self, addr: int) -> Optional[LoadedExecutable]:
def get_bin_containing(self, addr: T_AbsoluteAddress) -> Optional[Program]:
for exe in self.binaries:
if exe.base_addr <= addr < exe.base_addr + exe.size:
return exe
return None
def read_ins(self, addr: int) -> LoadedInstruction:
def read_ins(self, addr: T_AbsoluteAddress) -> Instruction:
"""
Read a single instruction located at addr
:param addr: The location
:return: The Instruction
"""
sec = self.last_ins_sec
if sec is not None and sec.base <= addr < sec.base + sec.size:
return sec.read_instruction(addr - sec.base)
sec = self.get_sec_containing(addr)
self.last_ins_sec = sec
if sec is None:
print(FMT_MEM + "[MMU] Trying to read instruction form invalid region! "
"Have you forgotten an exit syscall or ret statement?" + FMT_NONE)
raise RuntimeError("No next instruction available!")
return sec.read_instruction(addr - sec.base)
return sec.read_ins(addr - sec.base)
def read(self, addr: int, size: int) -> bytearray:
"""
@ -164,6 +90,9 @@ class MMU:
:return: The bytearray at addr
"""
sec = self.get_sec_containing(addr)
if sec is None:
print(FMT_MEM + "[MMU] Trying to read data form invalid region at 0x{:x}! ".format(addr) + FMT_NONE)
raise RuntimeError("Reading from uninitialized memory region!")
return sec.read(addr - sec.base, size)
def write(self, addr: int, size: int, data):
@ -176,7 +105,7 @@ class MMU:
"""
sec = self.get_sec_containing(addr)
if sec is None:
print(FMT_MEM + '[MMU] Invalid write into non-initialized section at 0x{:08X}'.format(addr) + FMT_NONE)
print(FMT_MEM + '[MMU] Invalid write into non-initialized region at 0x{:08X}'.format(addr) + FMT_NONE)
raise RuntimeError("No write pls")
return sec.write(addr - sec.base, size, data)
@ -195,7 +124,7 @@ class MMU:
return
sec.dump(addr, *args, **kwargs)
def symbol(self, symb: str):
def label(self, symb: str):
"""
Look up the symbol symb in all local symbol tables (and the global one)
@ -204,9 +133,9 @@ class MMU:
print(FMT_MEM + "[MMU] Lookup for symbol {}:".format(symb) + FMT_NONE)
if symb in self.global_symbols:
print(" Found global symbol {}: 0x{:X}".format(symb, self.global_symbols[symb]))
for b in self.binaries:
if symb in b.symbols:
print(" Found local symbol {}: 0x{:X} in {}".format(symb, b.symbols[symb], b.name))
for section in self.sections:
if symb in section.context.labels:
print(" Found local labels {}: 0x{:X} in {}".format(symb, section.context.labels[symb], section.name))
def read_int(self, addr: int) -> int:
return int_from_bytes(self.read(addr, 4))

View File

@ -8,21 +8,19 @@ This package aims at providing an all-round usable RISC-V emulator and debugger
It contains everything needed to run assembly files, so you don't need any custom compilers or toolchains
"""
from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
from .exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException
from .Executable import Executable, LoadedExecutable, LoadedMemorySection
from .ExecutableParser import ExecutableParser
from .base_types import Executable, LoadedExecutable, LoadedMemorySection
from .instructions import *
from .MMU import MMU
from .Registers import Registers
from .Syscall import SyscallInterface, Syscall
from .registers import Registers
from .syscall import SyscallInterface, Syscall
from .CPU import CPU
from .Config import RunConfig
from .config import RunConfig
__author__ = "Anton Lydike <Anton@Lydike.com>"
__copyright__ = "Copyright 2021 Anton Lydike"

169
riscemu/assembler.py Normal file
View File

@ -0,0 +1,169 @@
from typing import Optional, Tuple, Union
from enum import Enum, auto
from typing import Optional, Tuple, Union
from helpers import parse_numeric_argument
from .base_types import Program, T_RelativeAddress, InstructionContext
from .colors import FMT_PARSE, FMT_NONE
from .exceptions import ParseException
from .helpers import ASSERT_LEN
from .tokenizer import Token
from .types import BinaryDataMemorySection, InstructionMemorySection
# Section base names that are assembled as instruction sections; op_section treats every
# other section as a data section.
INSTRUCTION_SECTION_NAMES = ('.text', '.init', '.fini')
class MemorySectionType(Enum):
    """
    The kind of content collected for the section that is currently being assembled
    """
    # content is a bytearray
    Data = auto()
    # content is a list of instructions
    Instructions = auto()
class CurrentSection:
    """
    Mutable record describing the section that is currently being assembled.

    The attributes are assigned by the code using this class; there is no constructor.
    """
    name: str
    data: Union[list, bytearray]
    type: MemorySectionType

    def current_address(self) -> T_RelativeAddress:
        """
        Section-relative address of the next item to be added

        :return: the length of data for data sections, four times the length of data otherwise
        """
        if self.type != MemorySectionType.Data:
            return 4 * len(self.data)
        return len(self.data)

    def __repr__(self):
        return f"{self.__class__.__name__}(name={self.name},data={self.data},type={self.type.name})"
class ParseContext:
    """
    State carried through the assembly of one program: the program being built, its instruction
    context and the section currently being filled (None before the first section).
    """
    section: Optional[CurrentSection]
    context: InstructionContext
    program: Program

    def __init__(self, name: str):
        """
        :param name: name given to the program that is created
        """
        program = Program(name)
        self.program = program
        self.context = program.context
        self.section = None

    def finalize(self) -> Program:
        """
        Finish the current section and return the program
        """
        self.finalize_section()
        return self.program

    def finalize_section(self):
        """
        Convert the current section, if any, into a memory section and add it to the program
        """
        current = self.section
        if current is None:
            return
        if current.type == MemorySectionType.Data:
            section_cls = BinaryDataMemorySection
        elif current.type == MemorySectionType.Instructions:
            section_cls = InstructionMemorySection
        else:
            # unknown section types are ignored
            return
        self.program.add_section(section_cls(current.data, current.name, self.context))

    def __repr__(self):
        return "{}(\n\tsetion={},\n\tprogram={}\n)".format(
            self.__class__.__name__, self.section, self.program
        )
def ASSERT_IN_SECTION_TYPE(context: ParseContext, type: MemorySectionType):
    """
    Ensure that the parser is currently inside a section of the given type

    :param context: the parse context to inspect
    :param type: the required section type
    :raise ParseException: when there is no current section or it has a different type
    """
    current = context.section
    if current is None:
        raise ParseException('Error, expected to be in {} section, but no section is present...'.format(type.name))
    mismatch = current.type != type
    if mismatch:
        raise ParseException(
            'Error, expected to be in {} section, but currently in {}...'.format(type.name, current)
        )
def get_section_base_name(section_name: str) -> str:
    """
    Reduce a section name to its first component, e.g. '.text.startup' -> '.text'

    Names given without the leading dot are accepted as well ('text' -> '.text'), instead of
    failing with an IndexError or returning the second component.

    :param section_name: the full section name
    :return: the first dot-separated component, prefixed with a dot
    """
    stem = section_name[1:] if section_name.startswith('.') else section_name
    return '.' + stem.split('.', 1)[0]
class AssemblerDirectives:
    """
    This class represents a collection of all assembler directives as documented by
    https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md#pseudo-ops

    All class methods prefixed with op_ are directly used as assembler directives.
    """

    @classmethod
    def op_align(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Pad the current data section with zero bytes up to the next multiple of the argument
        """
        ASSERT_LEN(args, 1)
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)
        # NOTE(review): the argument is used as a byte count here, while the manual linked above
        # lists .align as an alias for .p2align (power of two) - confirm the intended semantics
        align_to = parse_numeric_argument(args[0])
        current_mod = context.section.current_address() % align_to
        if current_mod == 0:
            return
        context.section.data += bytearray(align_to - current_mod)

    @classmethod
    def op_section(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Finish the current section and start a new one named args[0]
        """
        ASSERT_LEN(args, 1)
        context.finalize_section()

        # Always start from a fresh record: context.section is None before the first section
        # directive, so assigning attributes on it directly fails with an AttributeError.
        section = CurrentSection()
        section.name = args[0]
        if get_section_base_name(args[0]) in INSTRUCTION_SECTION_NAMES:
            section.type = MemorySectionType.Instructions
            section.data = list()
        else:
            section.type = MemorySectionType.Data
            section.data = bytearray()
        context.section = section

    @classmethod
    def op_globl(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Mark the label args[0] as global
        """
        ASSERT_LEN(args, 1)
        context.program.global_labels.add(args[0])

    @classmethod
    def op_equ(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Define the label args[0] with the numeric value args[1]
        """
        ASSERT_LEN(args, 2)
        name = args[0]
        value = parse_numeric_argument(args[1])
        context.context.labels[name] = value

    @classmethod
    def op_zero(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Append args[0] zero bytes to the current data section
        """
        ASSERT_LEN(args, 1)
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)
        size = parse_numeric_argument(args[0])
        cls.add_bytes(size, bytearray(size), context)

    @classmethod
    def add_bytes(cls, size: int, content: Union[None, int, bytearray], context: ParseContext):
        """
        Append data to the current data section

        :param size: number of bytes to add when content is None or an int
        :param content: None for zero bytes, an int that is encoded into size bytes, or the bytes themselves
        :param context: the parse context whose current section receives the bytes
        """
        ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data)
        if content is None:
            content = bytearray(size)
        elif isinstance(content, int):
            # Reduce modulo 2^(8*size) so that negative values end up in two's complement form.
            # NOTE(review): little-endian byte order assumed - keep in sync with helpers.int_to_bytes
            content = bytearray((content % (1 << (size * 8))).to_bytes(size, 'little'))
        # the data has to be stored, otherwise every data directive is silently dropped
        context.section.data += content

    @classmethod
    def add_text(cls, text: str, context: ParseContext, zero_terminate: bool = True):
        """
        Append ascii encoded text to the current data section

        :param zero_terminate: append a single zero byte after the text
        """
        encoded_bytes = bytearray(text.encode('ascii'))
        if zero_terminate:
            encoded_bytes += bytearray(1)
        cls.add_bytes(len(encoded_bytes), encoded_bytes, context)

    @classmethod
    def handle_instruction(cls, token: Token, args: Tuple[str], context: ParseContext):
        """
        Execute the assembler directive described by token

        Unknown directives are reported on stdout and otherwise ignored.

        :param token: the directive token; its value without the first character is the directive name
        :param args: the arguments of the directive
        :param context: the parse context to operate on
        """
        op = token.value[1:]
        if hasattr(cls, 'op_' + op):
            getattr(cls, 'op_' + op)(token, args, context)
        elif op in ('text', 'data', 'rodata', 'bss', 'sbss'):
            # shorthand for .section with the directive itself as section name
            cls.op_section(token, (token.value,), context)
        elif op in ('string', 'asciiz', 'asciz', 'ascii'):
            ASSERT_LEN(args, 1)
            # .ascii is the only text directive without a terminating zero byte
            cls.add_text(args[0], context, op != 'ascii')
        elif op in DATA_OP_SIZES:
            size = DATA_OP_SIZES[op]
            for arg in args:
                cls.add_bytes(size, parse_numeric_argument(arg), context)
        else:
            print(FMT_PARSE + "Unknown assembler directive: {} {} in {}".format(token, args, context) + FMT_NONE)
# Number of bytes emitted per argument for each data directive. Keys are the directive names as
# looked up by AssemblerDirectives.handle_instruction (token value without its first character).
DATA_OP_SIZES = {
'byte': 1,
'2byte': 2, 'half': 2, 'short': 2,
'4byte': 4, 'word': 4, 'long': 4,
'8byte': 8, 'dword': 8, 'quad': 8,
}

186
riscemu/base_types.py Normal file
View File

@ -0,0 +1,186 @@
"""
RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
This file contains base classes which represent loaded programs
"""
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Set

from .helpers import *
# Alias used in annotations for addresses relative to a section or instruction block
T_RelativeAddress = int
# Alias used in annotations for absolute addresses
T_AbsoluteAddress = int
# Matches numbered label references: one or more digits followed by 'f' or 'b' (e.g. "1f", "12b")
NUMBER_SYMBOL_PATTERN = re.compile(r'^\d+[fb]$')
@dataclass(frozen=True)
class MemoryFlags:
    """
    Immutable pair of access flags attached to a memory section.
    """
    read_only: bool
    executable: bool

    def __repr__(self):
        # compact form such as MemoryFlags(ro,x) or MemoryFlags(rw,-)
        access = 'ro' if self.read_only else 'rw'
        execute = 'x' if self.executable else '-'
        return f"{self.__class__.__name__}({access},{execute})"
class InstructionContext:
    base_address: T_AbsoluteAddress
    """
    The address where the instruction block is placed
    """

    labels: Dict[str, T_RelativeAddress]
    """
    This dictionary maps all labels to their relative position of the instruction block
    """

    numbered_labels: Dict[str, List[T_RelativeAddress]]
    """
    This dictionary maps numbered labels (which can occur multiple times) to a list of (block-relative) addresses where
    the label was placed
    """

    def __init__(self):
        self.base_address = 0
        self.labels = dict()
        self.numbered_labels = defaultdict(list)

    def resolve_label(self, symbol: str, address_at: Optional[T_RelativeAddress] = None) -> Optional[T_RelativeAddress]:
        """
        Look up the address of a label

        Plain labels are looked up in labels. Numbered references (digits followed by 'b' or 'f')
        resolve to the closest placement of that number strictly before ('b') or strictly after
        ('f') address_at.

        :param symbol: the label name or numbered reference
        :param address_at: the address the reference is resolved from, required for numbered references
        :return: the address, or None if no matching label exists
        :raise ParseException: when a numbered reference is resolved without address_at
        """
        if not NUMBER_SYMBOL_PATTERN.match(symbol):
            return self.labels.get(symbol, None)

        if address_at is None:
            raise ParseException("Cannot resolve relative symbol {} without an address!".format(symbol))

        number, direction = symbol[:-1], symbol[-1]
        placements = self.numbered_labels.get(number, [])
        if direction == 'b':
            return max((addr for addr in placements if addr < address_at), default=None)
        return min((addr for addr in placements if addr > address_at), default=None)
class Instruction(ABC):
    """
    Abstract interface of an instruction consisting of a name and its arguments
    """
    name: str
    args: tuple

    @abstractmethod
    def get_imm(self, num: int) -> int:
        """
        parse and get immediate argument

        :param num: position of the argument
        """

    @abstractmethod
    def get_imm_reg(self, num: int) -> Tuple[int, str]:
        """
        parse and get an argument imm(reg)

        :param num: position of the argument
        """

    @abstractmethod
    def get_reg(self, num: int) -> str:
        """
        parse and get an register argument

        :param num: position of the argument
        """

    def __repr__(self):
        joined_args = ", ".join(self.args)
        return f"{self.name} {joined_args}"
@dataclass
class MemorySection(ABC):
    """
    Abstract base of a section placed in memory at address base.

    Offsets taken by the methods are relative to base.
    """
    name: str
    flags: MemoryFlags
    size: int
    base: T_AbsoluteAddress
    owner: str
    context: InstructionContext

    @abstractmethod
    def read(self, offset: T_RelativeAddress, size: int) -> bytearray:
        pass

    @abstractmethod
    def write(self, offset: T_RelativeAddress, size: int, data: bytearray):
        pass

    @abstractmethod
    def read_ins(self, offset: T_RelativeAddress) -> Instruction:
        pass

    def dump(self, start: T_RelativeAddress, end: Optional[T_RelativeAddress] = None, fmt: str = 'hex',
             bytes_per_row: int = 16, rows: int = 10, group: int = 4):
        """
        Print the content of this section

        :param start: offset of the first item to print; if end is None this offset is highlighted and
                      about rows rows around it are printed instead
        :param end: offset to stop at (exclusive), or None to print a window around start
        :param fmt: format passed on to format_bytes
        :param bytes_per_row: bytes printed per row (always 4 for executable sections)
        :param rows: number of rows of the window when end is None
        :param group: byte group size passed on to format_bytes
        """
        if self.flags.executable:
            bytes_per_row = 4
        highlight = None
        if end is None:
            half_window = bytes_per_row * (rows // 2)
            highlight = start
            end = start + half_window
            start = start - half_window

        # stay inside the section: the window around an offset near either end of the section
        # would otherwise begin at a negative offset or run past the last byte
        start = max(start, 0)
        end = min(end, self.size)
        span = max(end - start, 0)

        if self.flags.executable:
            print(FMT_MEM + "{}, viewing {} instructions:".format(
                self, span // 4
            ) + FMT_NONE)
            for addr in range(start, end, 4):
                if addr == highlight:
                    print(FMT_UNDERLINE + FMT_ORANGE, end='')
                print("0x{:x}: {}{}".format(
                    self.base + addr, self.read_ins(addr), FMT_NONE
                ))
        else:
            print(FMT_MEM + "{}, viewing {} bytes:".format(
                self, span
            ) + FMT_NONE)
            for addr in range(start, end, bytes_per_row):
                # the last row may be shorter than bytes_per_row
                row = self.read(addr, min(bytes_per_row, end - addr))
                # highlight is None when an explicit end was given; only the row containing the
                # highlighted offset gets a highlight index
                if highlight is not None and addr <= highlight < addr + bytes_per_row:
                    formatted = format_bytes(row, fmt, group, (highlight - addr) // group)
                else:
                    formatted = format_bytes(row, fmt, group)
                print("0x{:x}: {}{}".format(
                    self.base + addr, formatted, FMT_NONE
                ))

    def __repr__(self):
        return "{}[{}] at 0x{:08X} (size={}bytes, flags={}, owner={})".format(
            self.__class__.__name__,
            self.name,
            self.base,
            self.size,
            self.flags,
            self.owner
        )
class Program:
    """
    A program made up of memory sections that share one instruction context.
    """
    name: str
    context: InstructionContext
    global_labels: Set[str]
    sections: List[MemorySection]
    base: T_AbsoluteAddress = 0

    def __init__(self, name: str, base: int = 0):
        """
        :param name: the name of the program
        :param base: the base address of the program
        """
        self.name = name
        self.base = base
        self.context = InstructionContext()
        self.global_labels = set()
        self.sections = []

    def add_section(self, sec: MemorySection):
        """
        Append sec to the sections of this program
        """
        self.sections.append(sec)

    def __repr__(self):
        section_names = [section.name for section in self.sections]
        return "{}(name={},context={},globals={},sections={},base={})".format(
            self.__class__.__name__, self.name, self.context, self.global_labels, section_names, self.base
        )

View File

@ -10,7 +10,7 @@ from typing import Optional
@dataclass(frozen=True, init=True)
class RunConfig:
stack_size: int = 8 * 1024 * 64 # for 8KB stack
stack_size: int = 8 * 1024 * 64 # for 8KB stack
include_scall_symbols: bool = True
add_accept_imm: bool = False
# debugging
@ -21,3 +21,5 @@ class RunConfig:
scall_fs: bool = False
verbosity: int = 0
CONFIG = RunConfig()

View File

@ -5,9 +5,9 @@ SPDX-License-Identifier: MIT
"""
import typing
from .Registers import Registers
from .registers import Registers
from .colors import FMT_DEBUG, FMT_NONE
from .Executable import LoadedInstruction
from .base_types import Instruction
from .helpers import *
if typing.TYPE_CHECKING:
@ -50,7 +50,7 @@ def launch_debug_session(cpu: 'CPU', mmu: 'MMU', reg: 'Registers', prompt=""):
return
bin = mmu.get_bin_containing(cpu.pc)
ins = LoadedInstruction(name, list(args), bin)
ins = Instruction(name, list(args), bin)
print(FMT_DEBUG + "Running instruction " + ins + FMT_NONE)
cpu.run_instruction(ins)

View File

@ -7,11 +7,9 @@ SPDX-License-Identifier: MIT
import typing
from abc import abstractmethod
from .base_types import Instruction
from .colors import *
if typing.TYPE_CHECKING:
from .Executable import LoadedInstruction
class RiscemuBaseException(BaseException):
@abstractmethod
@ -116,7 +114,7 @@ class InvalidAllocationException(RiscemuBaseException):
class UnimplementedInstruction(RiscemuBaseException):
def __init__(self, ins: 'LoadedInstruction'):
def __init__(self, ins: Instruction):
self.ins = ins
def message(self):

View File

@ -5,7 +5,8 @@ SPDX-License-Identifier: MIT
"""
from math import log10, ceil
from .Exceptions import *
from .exceptions import *
from typing import Iterable, Iterator, TypeVar, Generic, List
def align_addr(addr: int, to_bytes: int = 8) -> int:
@ -105,3 +106,36 @@ def bind_twos_complement(val):
elif val > 2147483647:
return val - 4294967296
return val
T = TypeVar('T')


class Peekable(Generic[T], Iterator[T]):
    """
    Iterator wrapper which allows looking at the next item without consuming it and
    pushing items back to the front of the stream.
    """

    def __init__(self, iterable: Iterable[T]):
        """
        :param iterable: the iterable to wrap
        """
        self.iterable = iter(iterable)
        # items that were peeked at or pushed back; index 0 is the next item to hand out
        self.cache: List[T] = list()

    def __iter__(self) -> Iterator[T]:
        return self

    def __next__(self) -> T:
        """
        Return the next item, serving cached items first.

        :raises StopIteration: if the cache and the wrapped iterable are both exhausted
        """
        if self.cache:
            # take from the front: that is the item peek() reports and push_back() inserts
            return self.cache.pop(0)
        return next(self.iterable)

    def peek(self) -> T:
        """
        Return the next item without consuming it.

        :return: the next item, or None if there are no more items
        """
        if self.cache:
            return self.cache[0]
        try:
            item = next(self.iterable)
        except StopIteration:
            return None
        self.cache.append(item)
        return item

    def push_back(self, item: T):
        """
        Put ``item`` in front of the stream, it is the next item handed out.
        """
        self.cache.insert(0, item)

    def is_empty(self) -> bool:
        """
        Check whether there are items left, without consuming any.

        This does not go through peek() so that an item which is None is not mistaken
        for the end of the stream.
        """
        if self.cache:
            return False
        try:
            self.cache.append(next(self.iterable))
        except StopIteration:
            return True
        return False

View File

@ -9,7 +9,7 @@ from typing import Tuple, Callable, Dict
from abc import ABC
from ..CPU import CPU
from ..helpers import ASSERT_LEN, ASSERT_IN, to_unsigned
from ..Executable import LoadedInstruction
from ..base_types import LoadedInstruction
class InstructionSet(ABC):

View File

@ -1,5 +1,5 @@
from .InstructionSet import InstructionSet, LoadedInstruction
from ..Exceptions import INS_NOT_IMPLEMENTED
from ..exceptions import INS_NOT_IMPLEMENTED
from ..helpers import int_from_bytes, int_to_bytes, to_unsigned, to_signed

View File

@ -9,9 +9,9 @@ from .InstructionSet import *
from ..helpers import int_from_bytes, int_to_bytes, to_unsigned, to_signed
from ..colors import FMT_DEBUG, FMT_NONE
from ..debug import launch_debug_session
from ..Exceptions import LaunchDebuggerException
from ..Syscall import Syscall
from ..Executable import LoadedInstruction
from ..exceptions import LaunchDebuggerException
from ..syscall import Syscall
from ..base_types import LoadedInstruction
class RV32I(InstructionSet):

View File

@ -5,7 +5,7 @@ SPDX-License-Identifier: MIT
"""
from .InstructionSet import *
from ..Exceptions import INS_NOT_IMPLEMENTED
from ..exceptions import INS_NOT_IMPLEMENTED
class RV32M(InstructionSet):

79
riscemu/parser.py Normal file
View File

@ -0,0 +1,79 @@
"""
RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
"""
import re
from typing import Dict, Tuple, Iterable, Callable
from helpers import Peekable
from .assembler import MemorySectionType, ParseContext, AssemblerDirectives
from .base_types import Program
from .colors import FMT_PARSE
from .exceptions import ParseException
from .tokenizer import Token, TokenType
from .types import SimpleInstruction
def parse_instruction(token: Token, args: Tuple[str], context: ParseContext):
    """
    Append the instruction described by ``token`` and ``args`` to the active section.

    :param token: the instruction name token, its value becomes the instruction name
    :param args: the arguments of the instruction
    :param context: the parse context holding the active section
    :raises ParseException: if there is no active section or it does not hold instructions
    """
    section = context.section
    holds_instructions = section is not None and section.type == MemorySectionType.Instructions
    if not holds_instructions:
        raise ParseException("{} {} encountered in invalid context: {}".format(token, args, context))

    address = context.section.current_address()
    instruction = SimpleInstruction(token.value, args, context.context, address)
    context.section.data.append(instruction)
def parse_label(token: Token, args: Tuple[str], context: ParseContext):
    """
    Record the label named by ``token`` at the current address of the active section.

    The last character of ``token.value`` is stripped to get the name (presumably the
    trailing colon - confirm against the tokenizer). Names made up of digits only are
    appended to ``numbered_labels``, so they may be defined several times. Any other
    name is stored in ``labels``; redefining one prints a warning and overwrites the
    earlier address.

    :param token: the label token
    :param args: the arguments following the label (not used here)
    :param context: the parse context holding the active section and the label tables
    :raises ParseException: if there is no active section to take the address from
    """
    if context.section is None:
        # same failure mode as parse_instruction, instead of an AttributeError on None
        raise ParseException("{} {} encountered in invalid context: {}".format(token, args, context))

    name = token.value[:-1]
    address = context.section.current_address()
    ins_context = context.context

    if re.match(r'^\d+$', name):
        # relative label:
        ins_context.numbered_labels[name].append(address)
        return

    if name in ins_context.labels:
        print(FMT_PARSE + 'Warn: Symbol {} defined twice!'.format(name))
    ins_context.labels[name] = address
# Dispatch table used by parse_tokens: maps the type of the token that starts a
# statement to the function handling it. Handlers are called as
# handler(token, args, context).
PARSERS: Dict[TokenType, Callable[[Token, Tuple[str], ParseContext], None]] = {
    TokenType.PSEUDO_OP: AssemblerDirectives.handle_instruction,
    TokenType.LABEL: parse_label,
    TokenType.INSTRUCTION_NAME: parse_instruction
}
def parse_tokens(name: str, tokens_iter: Iterable[Token]) -> Program:
    """
    Parse a stream of tokens into a program.

    Every (token, arguments) pair produced by composite_tokenizer is handed to the
    handler registered in PARSERS for the type of the token.

    :param name: name handed to the ParseContext that is created
    :param tokens_iter: the tokens to parse
    :return: whatever ``finalize()`` of the parse context returns
    :raises ParseException: if no handler is registered for the type of a token
    """
    context = ParseContext(name)
    statements = composite_tokenizer(Peekable[Token](tokens_iter))

    for token, args in statements:
        handler = PARSERS.get(token.type)
        if handler is None:
            raise ParseException("Unexpected token type: {}, {}".format(token, args))
        handler(token, args, context)

    return context.finalize()
def composite_tokenizer(tokens_iter: Iterable[Token]) -> Iterable[Tuple[Token, Tuple[str]]]:
    """
    Group a flat stream of tokens into (token, arguments) pairs.

    Pseudo ops, labels and instruction names each start a pair, their arguments are
    gathered by take_arguments. Any other token encountered between two pairs is
    consumed and dropped.

    :param tokens_iter: the tokens to group
    :return: a generator over (token, arguments) tuples
    """
    statement_starters = (TokenType.PSEUDO_OP, TokenType.LABEL, TokenType.INSTRUCTION_NAME)
    tokens: Peekable[Token] = Peekable[Token](tokens_iter)

    while True:
        if tokens.is_empty():
            return
        head = next(tokens)
        if head.type not in statement_starters:
            continue
        yield head, tuple(take_arguments(tokens))
def take_arguments(tokens: Peekable[Token]) -> Iterable[str]:
    """
    Consumes (argument comma)* and yields argument.value until newline is reached
    If an argument is not followed by either a newline or a comma, a parse exception is raised
    The newline at the end is consumed

    Running out of tokens is treated like reaching a newline.

    :param tokens: A Peekable iterator over some Tokens
    :raises ParseException: if a token other than an argument or a newline is found where an
                            argument is expected, or an argument is followed by something other
                            than a comma or a newline
    """
    while True:
        token = tokens.peek()
        if token is None:
            # end of input: peek() has nothing left to offer, so the line ends here
            return
        if token.type == TokenType.NEWLINE:
            next(tokens)
            return
        if token.type != TokenType.ARGUMENT:
            raise ParseException("Expected newline, instead got {}".format(token))
        yield next(tokens).value

        separator = tokens.peek()
        if separator is None or separator.type == TokenType.NEWLINE:
            # the top of the loop consumes the newline and stops
            continue
        if separator.type != TokenType.COMMA:
            raise ParseException("Expected newline, instead got {}".format(separator))
        next(tokens)

View File

@ -2,8 +2,8 @@ from dataclasses import dataclass
from typing import List, Dict, Tuple
from .Exceptions import *
from ..Exceptions import RiscemuBaseException
from ..Executable import MemoryFlags, LoadedMemorySection
from ..exceptions import RiscemuBaseException
from ..base_types import MemoryFlags, LoadedMemorySection
from ..decoder import decode, RISCV_REGS, format_ins
from ..helpers import FMT_PARSE, FMT_NONE, FMT_GREEN, FMT_BOLD

View File

@ -8,8 +8,8 @@ from typing import Dict, List, Optional, TYPE_CHECKING
from .ElfLoader import ElfInstruction, ElfLoadedMemorySection, InstructionAccessFault, InstructionAddressMisalignedTrap
from .PrivMMU import PrivMMU
from ..Config import RunConfig
from ..Executable import LoadedMemorySection, MemoryFlags
from ..config import RunConfig
from ..base_types import LoadedMemorySection, MemoryFlags
from ..IO.IOModule import IOModule
from ..colors import FMT_ERROR, FMT_NONE, FMT_MEM
from ..decoder import decode
@ -117,7 +117,7 @@ class MemoryImageMMU(PrivMMU):
return "{}{:+x} ({}:{})".format(sym, addr - val, sec.owner, sec.name)
return "{}:{}{:+x}".format(sec.owner, sec.name, addr - sec.base)
def symbol(self, symb: str):
def label(self, symb: str):
print(FMT_MEM + "Looking up symbol {}".format(symb))
for owner, symbs in self.debug_info['symbols'].items():
if symb in symbs:

View File

@ -15,7 +15,7 @@ from ..IO import TextIO
from ..instructions import RV32A, RV32M
if typing.TYPE_CHECKING:
from riscemu import Executable, LoadedExecutable, LoadedInstruction
from riscemu import base_types, LoadedExecutable, LoadedInstruction
from riscemu.instructions.InstructionSet import InstructionSet
@ -95,7 +95,7 @@ class PrivCPU(CPU):
print()
print(FMT_CPU + "Program stopped without exiting - perhaps you stopped the debugger?" + FMT_NONE)
def load(self, e: riscemu.Executable):
def load(self, e: riscemu.base_types):
raise NotImplementedError("Not supported!")
def run_loaded(self, le: 'riscemu.LoadedExecutable'):

View File

@ -5,7 +5,7 @@ SPDX-License-Identifier: MIT
"""
from ..instructions.RV32I import *
from ..Exceptions import INS_NOT_IMPLEMENTED
from ..exceptions import INS_NOT_IMPLEMENTED
from .Exceptions import *
from .privmodes import PrivModes
from ..colors import FMT_CPU, FMT_NONE

View File

@ -4,10 +4,10 @@ RiscEmu (c) 2021 Anton Lydike
SPDX-License-Identifier: MIT
"""
from .Config import RunConfig
from .config import RunConfig
from .helpers import *
from collections import defaultdict
from .Exceptions import InvalidRegisterException
from .exceptions import InvalidRegisterException
class Registers:
"""

View File

@ -10,15 +10,16 @@ from enum import Enum, auto
from typing import List, Iterable
from riscemu.decoder import RISCV_REGS
from .Exceptions import ParseException
from .exceptions import ParseException
LINE_COMMENT_STARTERS = ('#', ';', '//')
WHITESPACE_PATTERN = re.compile(r'\s+')
MEMORY_ADDRESS_PATTERN = re.compile('^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$')
MEMORY_ADDRESS_PATTERN = re.compile(r'^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$')
REGISTER_NAMES = RISCV_REGS
I = lambda x: x
class TokenType(Enum):
COMMA = auto()
ARGUMENT = auto()
@ -40,6 +41,7 @@ class Token:
return ', '
return '{}({}) '.format(self.type.name[0:3], self.value)
NEWLINE = Token(TokenType.NEWLINE, '\n')
COMMA = Token(TokenType.COMMA, ',')

72
riscemu/types.py Normal file
View File

@ -0,0 +1,72 @@
from typing import List, Tuple
from .exceptions import MemoryAccessException
from .helpers import parse_numeric_argument
from .base_types import Instruction, MemorySection, MemoryFlags, InstructionContext, T_RelativeAddress, \
T_AbsoluteAddress
class SimpleInstruction(Instruction):
    """
    Instruction that keeps its arguments as strings and interprets them on demand.
    """

    def __init__(self, name: str, args: Tuple[str], context: InstructionContext, addr: T_RelativeAddress):
        """
        :param name: name of the instruction
        :param args: the arguments of the instruction
        :param context: context used to resolve labels
        :param addr: address of this instruction, handed to the context when resolving labels
        """
        self.name, self.args = name, args
        self.context, self.addr = context, addr

    def get_imm(self, num: int) -> int:
        """
        Interpret argument ``num`` as an immediate.

        The argument is first looked up as a label; only if the context returns None
        for it is it parsed as a number.
        """
        raw = self.args[num]
        label_value = self.context.resolve_label(raw, self.addr)
        if label_value is not None:
            return label_value
        return parse_numeric_argument(raw)

    def get_imm_reg(self, num: int) -> Tuple[int, str]:
        """
        Return the immediate at position ``num + 1`` together with the register at position ``num``.
        """
        immediate = self.get_imm(num + 1)
        register = self.get_reg(num)
        return immediate, register

    def get_reg(self, num: int) -> str:
        """
        Return argument ``num`` unchanged.
        """
        return self.args[num]
class InstructionMemorySection(MemorySection):
    """
    Memory section backed by a list of instructions, each of them taking up four bytes.

    Raw byte access is not supported, instructions are fetched with read_ins.
    """

    def __init__(self, instructions: List[Instruction], name: str, context: InstructionContext, base: int = 0,
                 owner: str = ''):
        """
        :param instructions: the instructions making up this section
        :param name: name of the section
        :param context: the instruction context of this section
        :param base: base address of the section
        :param owner: who this section belongs to; it has to be set because
                      MemorySection.__repr__ reads it
        """
        self.name = name
        self.base = base
        self.context = context
        self.owner = owner
        self.size = len(instructions) * 4
        self.flags = MemoryFlags(True, True)
        self.instructions = instructions

    def read(self, offset: T_RelativeAddress, size: int) -> bytearray:
        """
        Not supported, always raises MemoryAccessException.
        """
        raise MemoryAccessException("Cannot read raw bytes from instruction section", self.base + offset, size, 'read')

    def write(self, offset: T_RelativeAddress, size: int, data: bytearray):
        """
        Not supported, always raises MemoryAccessException.
        """
        raise MemoryAccessException("Cannot write raw bytes to instruction section", self.base + offset, size, 'write')

    def read_ins(self, offset: T_RelativeAddress) -> Instruction:
        """
        Return the instruction at ``offset``.

        :param offset: offset into this section, must be a multiple of four
        :raises MemoryAccessException: if ``offset`` is unaligned or outside of this section
        """
        if offset % 4 != 0:
            raise MemoryAccessException("Unaligned instruction fetch!", self.base + offset, 4, 'instruction fetch')
        if not 0 <= offset < self.size:
            # a negative offset would silently wrap around to the end of the list
            raise MemoryAccessException("Out of bounds instruction fetch in {}".format(self), self.base + offset, 4,
                                        'instruction fetch')
        return self.instructions[offset // 4]
class BinaryDataMemorySection(MemorySection):
    """
    Memory section backed by a bytearray.

    Bytes can be read and written, fetching instructions from it is an error.
    """

    def __init__(self, data: bytearray, name: str, context: InstructionContext, base: int = 0, owner: str = ''):
        """
        :param data: the content of this section, its length is the size of the section
        :param name: name of the section
        :param context: the instruction context of this section
        :param base: base address of the section
        :param owner: who this section belongs to; it has to be set because
                      MemorySection.__repr__ (used in the error messages below) reads it
        """
        self.name = name
        self.base = base
        self.context = context
        self.owner = owner
        self.size = len(data)
        self.flags = MemoryFlags(False, False)
        self.data = data

    def read(self, offset: T_RelativeAddress, size: int) -> bytearray:
        """
        Return ``size`` bytes starting at ``offset``.

        :raises MemoryAccessException: if the requested range is not inside of this section
        """
        # a negative offset would be interpreted by the slice as counting from the end
        if offset < 0 or offset + size > self.size:
            raise MemoryAccessException("Out of bounds access in {}".format(self), offset, size, 'read')
        return self.data[offset:offset + size]

    def write(self, offset: T_RelativeAddress, size: int, data: bytearray):
        """
        Overwrite ``size`` bytes starting at ``offset`` with the first ``size`` bytes of ``data``.

        :raises MemoryAccessException: if the range is not inside of this section, or ``data``
                                       holds fewer than ``size`` bytes
        """
        if offset < 0 or offset + size > self.size:
            raise MemoryAccessException("Out of bounds access in {}".format(self), offset, size, 'write')
        chunk = data[0:size]
        if len(chunk) != size:
            raise MemoryAccessException("Invalid write parameter sizing", offset, size, 'write')
        self.data[offset:offset + size] = chunk

    def read_ins(self, offset: T_RelativeAddress) -> Instruction:
        """
        Not supported, always raises MemoryAccessException.
        """
        raise MemoryAccessException("Tried reading instruction on non-executable section {}".format(self),
                                    offset, 4, 'instruction fetch')

View File

@ -1,6 +1,6 @@
from unittest import TestCase
from riscemu.Tokenizer import tokenize, print_tokens, Token, TokenType, NEWLINE, COMMA
from riscemu.tokenizer import tokenize, print_tokens, Token, TokenType, NEWLINE, COMMA
def ins(name: str) -> Token: