From da4ae7c4c18fe9cfd16755ff022497b141ac1eb7 Mon Sep 17 00:00:00 2001 From: Anton Lydike Date: Sat, 17 Apr 2021 13:39:56 +0200 Subject: [PATCH] parsing of tokenized asm into MemorySections works --- .idea/misc.xml | 2 +- .idea/riscemu.iml | 4 +- riscemu/Exceptions.py | 33 ++++++++ riscemu/Executable.py | 33 ++++++++ riscemu/ExecutableParser.py | 106 +++++++++++++++++++++++++ riscemu/MMU.py | 15 ++++ riscemu/{tokenizer.py => Tokenizer.py} | 3 +- riscemu/__init__.py | 10 ++- riscemu/main.py | 2 +- run.py | 7 ++ 10 files changed, 209 insertions(+), 6 deletions(-) create mode 100644 riscemu/Exceptions.py create mode 100644 riscemu/Executable.py create mode 100644 riscemu/ExecutableParser.py create mode 100644 riscemu/MMU.py rename riscemu/{tokenizer.py => Tokenizer.py} (99%) diff --git a/.idea/misc.xml b/.idea/misc.xml index d1e22ec..a15ea67 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/.idea/riscemu.iml b/.idea/riscemu.iml index 8b8c395..8e5446a 100644 --- a/.idea/riscemu.iml +++ b/.idea/riscemu.iml @@ -1,7 +1,9 @@ - + + + diff --git a/riscemu/Exceptions.py b/riscemu/Exceptions.py new file mode 100644 index 0000000..7bcb159 --- /dev/null +++ b/riscemu/Exceptions.py @@ -0,0 +1,33 @@ +class ParseException(BaseException): + def __init__(self, msg, data=None): + super().__init__() + self.msg = msg + self.data = data + + def message(self): + return "{}(\"{}\", data={})".format(self.__class__.__name__, self.msg, self.data) + + +def ASSERT_EQ(a1, a2): + if a1 != a2: + raise ParseException("ASSERTION_FAILED: Expected elements to be equal!", (a1, a2)) + + +def ASSERT_LEN(a1, size): + if len(a1) != size: + raise ParseException("ASSERTION_FAILED: Expected {} to be of length {}".format(a1, size), (a1, size)) + + +def ASSERT_NOT_NULL(a1): + if a1 is None: + raise ParseException("ASSERTION_FAILED: Expected {} to be non null".format(a1), (a1,)) + + +def ASSERT_NOT_IN(a1, a2): + if a1 in a2: + raise ParseException("ASSERTION_FAILED: Expected {} to not be in {}".format(a1, a2), (a1,a2)) + + +def ASSERT_IN(a1, a2): + if a1 not in a2: + raise ParseException("ASSERTION_FAILED: Expected {} to not be in {}".format(a1, a2), (a1,a2)) diff --git a/riscemu/Executable.py b/riscemu/Executable.py new file mode 100644 index 0000000..94fb995 --- /dev/null +++ b/riscemu/Executable.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple +from . import MemoryFlags, RiscVInstructionToken, RiscVTokenizer, RiscVSymbolToken, RiscVPseudoOpToken +from .Exceptions import * + + +@dataclass +class MemorySection: + name: str + flags: MemoryFlags + size: int = 0 + start: int = -1 + content: List[bytearray] = field(default_factory=list) + + def add(self, data: bytearray): + self.content.append(data) + self.size += len(data) + + +class InstructionMemorySection(MemorySection): + insn: List[RiscVInstructionToken] = field(default_factory=list) + + def add_insn(self, insn: RiscVInstructionToken): + self.insn.append(insn) + self.size += 4 + + +@dataclass +class Executable: + run_ptr: Tuple[str, int] + sections: Dict[str, MemorySection] + symbols: Dict[str, Tuple[str, int]] + diff --git a/riscemu/ExecutableParser.py b/riscemu/ExecutableParser.py new file mode 100644 index 0000000..5eacdc2 --- /dev/null +++ b/riscemu/ExecutableParser.py @@ -0,0 +1,106 @@ +from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags +from .Exceptions import * +from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken + +from typing import Dict, Tuple, List + + +def parse_numeric_argument(arg: str): + if arg.startswith('0x') or arg.startswith('0X'): + return int(arg, 16) + return int(arg) + +class ExecutableParser: + tokenizer: RiscVTokenizer + + def __init__(self, tokenizer: RiscVTokenizer): + self.instructions: List[RiscVInstructionToken] = list() + self.symbols: Dict[str, Tuple[str, int]] = dict() + self.sections: Dict[str, MemorySection] = dict() + self.tokenizer = tokenizer + self.active_section = None + self.implicit_sections = False + + def parse(self): + for token in self.tokenizer.tokens: + if isinstance(token, RiscVInstructionToken): + self.parse_instruction(token) + elif isinstance(token, RiscVSymbolToken): + self.handle_symbol(token) + elif isinstance(token, RiscVPseudoOpToken): + self.handle_pseudo_op(token) + + def get_execuable(self): + start_ptr = ('text', 0) + if '_start' in self.symbols: + start_ptr = self.symbols['_start'] + elif 'main' in self.symbols: + start_ptr = self.symbols['main'] + return Executable(start_ptr, self.sections, self.symbols) + + def parse_instruction(self, ins: RiscVInstructionToken): + if self.active_section is None: + self.op_text() + self.implicit_sections = True + + ASSERT_EQ(self.active_section, 'text') + sec = self.curr_sec() + if isinstance(sec, InstructionMemorySection): + sec.add_insn(ins) + else: + raise ParseException("SHOULD NOT BE REACHED") + + def handle_symbol(self, token: RiscVSymbolToken): + ASSERT_NOT_IN(token.name, self.symbols) + sec_pos = self.curr_sec().size + self.symbols[token.name] = (self.active_section, sec_pos) + + def handle_pseudo_op(self, op: RiscVPseudoOpToken): + name = 'op_' + op.name + if hasattr(self, name): + getattr(self, name)(op) + else: + raise ParseException("Unknown pseudo op: {}".format(op), (op,)) + + ## Pseudo op implementations: + def op_section(self, op: RiscVPseudoOpToken): + ASSERT_LEN(op.args, 1) + name = op.args[0][1:] + ASSERT_IN(name, ('data', 'rodata', 'text')) + getattr(self, 'op_' + name)(op) + + def op_text(self, op: RiscVPseudoOpToken = None): + self.set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection) + + def op_data(self, op: RiscVPseudoOpToken = None): + self.set_sec('data', MemoryFlags(read_only=False, executable=False)) + + def op_rodata(self, op: RiscVPseudoOpToken = None): + self.set_sec('rodata', MemoryFlags(read_only=True, executable=False)) + + def op_space(self, op: RiscVPseudoOpToken): + ASSERT_IN(self.active_section, ('data', 'rodata')) + ASSERT_LEN(op.args, 1) + size = parse_numeric_argument(op.args[0]) + self.curr_sec().add(bytearray(size)) + + def op_ascii(self, op: RiscVPseudoOpToken): + ASSERT_IN(self.active_section, ('data', 'rodata')) + ASSERT_LEN(op.args, 1) + str = op.args[0][1:-1] + self.curr_sec().add(bytearray(str, 'ascii')) + + def op_asciiz(self, op: RiscVPseudoOpToken): + ASSERT_IN(self.active_section, ('data', 'rodata')) + ASSERT_LEN(op.args, 1) + str = op.args[0][1:-1] + self.curr_sec().add(bytearray(str + '\0', 'ascii')) + + ## Section handler code + def set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection): + if name not in self.sections: + self.sections[name] = cls(name, flags) + self.active_section = name + + def curr_sec(self): + return self.sections[self.active_section] diff --git a/riscemu/MMU.py b/riscemu/MMU.py new file mode 100644 index 0000000..c8805b8 --- /dev/null +++ b/riscemu/MMU.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + +@dataclass(frozen=True) +class MemoryFlags: + read_only: bool + executable: bool + +class MemoryRegion: + addr:int + len:int + flags: MemoryFlags + + +class MMU: + def __init__(self): diff --git a/riscemu/tokenizer.py b/riscemu/Tokenizer.py similarity index 99% rename from riscemu/tokenizer.py rename to riscemu/Tokenizer.py index 9d5e499..00c3d16 100644 --- a/riscemu/tokenizer.py +++ b/riscemu/Tokenizer.py @@ -231,7 +231,7 @@ class RiscVPseudoOpToken(RiscVToken): class RiscVTokenizer: def __init__(self, input: RiscVInput): self.input = input - self.tokens = [] + self.tokens: List[RiscVToken] = [] def tokenize(self): while self.input.has_next(): @@ -296,4 +296,3 @@ class RiscVTokenizer: def parse_comment(self): # just consume the rest self.input.consume(regex=REG_UNTIL_NEWLINE) - diff --git a/riscemu/__init__.py b/riscemu/__init__.py index 7db1ca4..da9547e 100644 --- a/riscemu/__init__.py +++ b/riscemu/__init__.py @@ -1,2 +1,10 @@ from .CPU import CPU, Registers, Syscall, SyscallInterface -from .tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken + +from .Tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, \ + RiscVPseudoOpToken, TokenType + +from .MMU import MemoryFlags, MemoryRegion, MMU + +from .Exceptions import ASSERT_NOT_NULL, ASSERT_LEN, ASSERT_IN, ASSERT_EQ, ASSERT_NOT_IN + +from .Executable import ExecutableParser, Executable diff --git a/riscemu/main.py b/riscemu/main.py index 96420ae..66dd644 100644 --- a/riscemu/main.py +++ b/riscemu/main.py @@ -1,3 +1,3 @@ from .CPU import * -from .tokenizer import * +from .Tokenizer import * diff --git a/run.py b/run.py index 85cda37..96c492a 100644 --- a/run.py +++ b/run.py @@ -26,5 +26,12 @@ loop: tk = RiscVTokenizer(RiscVInput(example_progr)) tk.tokenize() + print("tokens:") for token in tk.tokens: print(token) + + ep = ExecutableParser(tk) + ep.parse() + + print(ep) +