parsing of tokenized asm into MemorySections works

This commit is contained in:
Anton Lydike 2021-04-17 13:39:56 +02:00
parent 2cee60a17c
commit da4ae7c4c1
10 changed files with 209 additions and 6 deletions

2
.idea/misc.xml generated
View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (riscemu)" project-jdk-type="Python SDK" />
</project>

4
.idea/riscemu.iml generated
View File

@ -1,7 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>

33
riscemu/Exceptions.py Normal file
View File

@ -0,0 +1,33 @@
class ParseException(BaseException):
    """Error raised when parsing fails or a parser assertion does not hold.

    Attributes:
        msg: Human readable description of the problem.
        data: Optional extra context (for example the offending values).
    """

    # NOTE(review): this derives from BaseException, so ``except Exception``
    # will not catch it. The base class is kept as-is to avoid changing which
    # handlers see this exception.
    def __init__(self, msg, data=None):
        # Hand the message to the base class so that str(exc) and tracebacks
        # show it instead of an empty string.
        super().__init__(msg)
        self.msg = msg
        self.data = data

    def message(self):
        """Return a one-line description: class name, message and data."""
        return "{}(\"{}\", data={})".format(self.__class__.__name__, self.msg, self.data)
def ASSERT_EQ(a1, a2):
    """Raise a ParseException unless ``a1`` and ``a2`` compare equal.

    The exception carries both values as its ``data`` tuple.
    """
    differs = a1 != a2
    if not differs:
        return
    raise ParseException("ASSERTION_FAILED: Expected elements to be equal!", (a1, a2))
def ASSERT_LEN(a1, size):
    """Raise a ParseException unless ``len(a1)`` equals ``size``.

    The exception carries ``(a1, size)`` as its ``data`` tuple.
    """
    actual = len(a1)
    if actual != size:
        template = "ASSERTION_FAILED: Expected {} to be of length {}"
        raise ParseException(template.format(a1, size), (a1, size))
def ASSERT_NOT_NULL(a1):
    """Raise a ParseException when ``a1`` is ``None``; otherwise do nothing.

    The exception carries ``(a1,)`` as its ``data`` tuple.
    """
    if a1 is not None:
        return
    raise ParseException("ASSERTION_FAILED: Expected {} to be non null".format(a1), (a1,))
def ASSERT_NOT_IN(a1, a2):
    """Raise a ParseException when ``a1`` is contained in ``a2``.

    The exception carries ``(a1, a2)`` as its ``data`` tuple.
    """
    if a1 not in a2:
        return
    raise ParseException("ASSERTION_FAILED: Expected {} to not be in {}".format(a1, a2), (a1,a2))
def ASSERT_IN(a1, a2):
    """Raise a ParseException unless ``a1`` is contained in ``a2``.

    The exception carries ``(a1, a2)`` as its ``data`` tuple.
    """
    if a1 not in a2:
        # The message states what was expected: membership. It previously
        # read "to not be in", which described the opposite check.
        raise ParseException("ASSERTION_FAILED: Expected {} to be in {}".format(a1, a2), (a1, a2))

33
riscemu/Executable.py Normal file
View File

@ -0,0 +1,33 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
from . import MemoryFlags, RiscVInstructionToken, RiscVTokenizer, RiscVSymbolToken, RiscVPseudoOpToken
from .Exceptions import *
@dataclass
class MemorySection:
    """A named section that collects chunks of raw data and tracks their total size."""

    # Section name.
    name: str
    # Access flags attached to this section.
    flags: MemoryFlags
    # Running total of the lengths of everything added so far.
    size: int = 0
    # Defaults to -1; nothing in this class assigns it.
    start: int = -1
    # Added chunks, in insertion order.
    content: List[bytearray] = field(default_factory=list)

    def add(self, data: bytearray):
        """Append ``data`` as a new chunk and grow ``size`` by its length."""
        chunk_length = len(data)
        self.content.append(data)
        self.size = self.size + chunk_length
# The decorator is required: without it ``insn`` is never turned into a
# per-instance list and stays a class-level dataclasses.Field object, so
# add_insn() fails with AttributeError on ``.append``.
@dataclass
class InstructionMemorySection(MemorySection):
    """A memory section that additionally records instruction tokens."""

    # Instruction tokens in the order they were added.
    insn: List[RiscVInstructionToken] = field(default_factory=list)

    def add_insn(self, insn: RiscVInstructionToken):
        """Append one instruction token and grow the section size by 4."""
        self.insn.append(insn)
        # Each instruction is accounted for with a fixed size of 4.
        self.size += 4
@dataclass
class Executable:
    """Plain container bundling an entry pointer, the sections and the symbol table."""

    # Entry point as a (str, int) pair.
    run_ptr: Tuple[str, int]
    # Sections, keyed by a string.
    sections: Dict[str, MemorySection]
    # Symbols, each mapped to a (str, int) pair.
    symbols: Dict[str, Tuple[str, int]]

106
riscemu/ExecutableParser.py Normal file
View File

@ -0,0 +1,106 @@
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import *
from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
from typing import Dict, Tuple, List
def parse_numeric_argument(arg: str):
    """Convert a numeric argument string to an ``int``.

    Accepts decimal numbers and prefixed literals (``0x``/``0X`` hex, and also
    ``0b`` binary and ``0o`` octal), each with an optional sign, so ``-0x10``
    parses to ``-16``. Decimal numbers with leading zeros (``"010"``) are
    still read as decimal.

    Raises:
        ValueError: if ``arg`` is not a valid integer literal.
    """
    try:
        # Base 0 makes int() honour the literal's own prefix and sign.
        return int(arg, 0)
    except ValueError:
        # Base 0 rejects zero-padded decimals such as "010"; fall back to
        # plain decimal parsing so those keep working.
        return int(arg)
class ExecutableParser:
    """Build memory sections and a symbol table from a tokenizer's tokens.

    Instruction tokens are added to the ``text`` section, symbol tokens are
    recorded as ``(section name, offset)`` pairs and pseudo-op tokens are
    dispatched to the matching ``op_<name>`` method.
    """
    tokenizer: RiscVTokenizer

    def __init__(self, tokenizer: RiscVTokenizer):
        """Create a parser for ``tokenizer``; nothing is parsed until ``parse()``."""
        self.instructions: List[RiscVInstructionToken] = list()
        self.symbols: Dict[str, Tuple[str, int]] = dict()
        self.sections: Dict[str, MemorySection] = dict()
        self.tokenizer = tokenizer
        # Name of the section currently being filled; None until one is opened.
        self.active_section = None
        # Set once a section had to be opened without an explicit directive.
        self.implicit_sections = False

    def parse(self):
        """Process every token of the tokenizer in order.

        Tokens that are neither instruction, symbol nor pseudo-op tokens are
        ignored.

        Raises:
            ParseException: from the individual handlers on malformed input.
        """
        for token in self.tokenizer.tokens:
            if isinstance(token, RiscVInstructionToken):
                self.parse_instruction(token)
            elif isinstance(token, RiscVSymbolToken):
                self.handle_symbol(token)
            elif isinstance(token, RiscVPseudoOpToken):
                self.handle_pseudo_op(token)

    def get_executable(self):
        """Return an ``Executable`` built from the parsed sections and symbols.

        The entry pointer is the ``_start`` symbol if present, otherwise
        ``main``, otherwise ``('text', 0)``.
        """
        start_ptr = ('text', 0)
        if '_start' in self.symbols:
            start_ptr = self.symbols['_start']
        elif 'main' in self.symbols:
            start_ptr = self.symbols['main']
        return Executable(start_ptr, self.sections, self.symbols)

    def get_execuable(self):
        """Misspelled alias of ``get_executable``, kept for existing callers."""
        return self.get_executable()

    def parse_instruction(self, ins: RiscVInstructionToken):
        """Add ``ins`` to the text section, opening it implicitly if needed.

        Raises:
            ParseException: if the active section is not ``text``.
        """
        if self.active_section is None:
            self.op_text()
            self.implicit_sections = True

        ASSERT_EQ(self.active_section, 'text')
        sec = self.curr_sec()
        if isinstance(sec, InstructionMemorySection):
            sec.add_insn(ins)
        else:
            raise ParseException("SHOULD NOT BE REACHED")

    def handle_symbol(self, token: RiscVSymbolToken):
        """Record ``token.name`` at the current end of the active section.

        Raises:
            ParseException: if a symbol with the same name already exists.
        """
        ASSERT_NOT_IN(token.name, self.symbols)
        if self.active_section is None:
            # A label may come before any section directive or instruction.
            # Open the text section implicitly, exactly as parse_instruction
            # does, instead of failing with KeyError(None) in curr_sec().
            self.op_text()
            self.implicit_sections = True
        sec_pos = self.curr_sec().size
        self.symbols[token.name] = (self.active_section, sec_pos)

    def handle_pseudo_op(self, op: RiscVPseudoOpToken):
        """Dispatch ``op`` to the method named ``op_<op.name>``.

        Raises:
            ParseException: if no such method exists.
        """
        name = 'op_' + op.name
        if hasattr(self, name):
            getattr(self, name)(op)
        else:
            raise ParseException("Unknown pseudo op: {}".format(op), (op,))

    ## Pseudo op implementations:
    def op_section(self, op: RiscVPseudoOpToken):
        """Switch to the section named by the single argument of ``op``.

        Raises:
            ParseException: if there is not exactly one argument or the name
                is not one of ``data``, ``rodata`` or ``text``.
        """
        ASSERT_LEN(op.args, 1)
        # Drop the first character of the argument (presumably the leading
        # '.' of names like '.text').
        name = op.args[0][1:]
        ASSERT_IN(name, ('data', 'rodata', 'text'))
        getattr(self, 'op_' + name)(op)

    def op_text(self, op: RiscVPseudoOpToken = None):
        """Activate the read-only, executable ``text`` section."""
        self.set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection)

    def op_data(self, op: RiscVPseudoOpToken = None):
        """Activate the writable, non-executable ``data`` section."""
        self.set_sec('data', MemoryFlags(read_only=False, executable=False))

    def op_rodata(self, op: RiscVPseudoOpToken = None):
        """Activate the read-only, non-executable ``rodata`` section."""
        self.set_sec('rodata', MemoryFlags(read_only=True, executable=False))

    def op_space(self, op: RiscVPseudoOpToken):
        """Append a zero-filled chunk whose length is the single argument.

        Raises:
            ParseException: if the active section is not ``data``/``rodata``
                or there is not exactly one argument.
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        size = parse_numeric_argument(op.args[0])
        self.curr_sec().add(bytearray(size))

    def op_ascii(self, op: RiscVPseudoOpToken):
        """Append the string argument of ``op`` as ASCII bytes."""
        self._add_string(op, '')

    def op_asciiz(self, op: RiscVPseudoOpToken):
        """Append the string argument of ``op`` as ASCII bytes plus a NUL byte."""
        self._add_string(op, '\0')

    def _add_string(self, op: RiscVPseudoOpToken, terminator: str):
        """Shared body of the string directives.

        Raises:
            ParseException: if the active section is not ``data``/``rodata``
                or there is not exactly one argument.
        """
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        # Strip the first and last character of the argument (presumably the
        # surrounding quotes).
        text = op.args[0][1:-1]
        self.curr_sec().add(bytearray(text + terminator, 'ascii'))

    ## Section handler code
    def set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection):
        """Make ``name`` the active section, creating it with ``cls`` on first use."""
        if name not in self.sections:
            self.sections[name] = cls(name, flags)
        self.active_section = name

    def curr_sec(self):
        """Return the active section object.

        Raises:
            KeyError: if no section is active.
        """
        return self.sections[self.active_section]

15
riscemu/MMU.py Normal file
View File

@ -0,0 +1,15 @@
from dataclasses import dataclass
@dataclass(frozen=True)
class MemoryFlags:
    """Immutable pair of boolean flags: ``read_only`` and ``executable``."""

    # Read-only flag.
    read_only: bool
    # Executable flag.
    executable: bool
class MemoryRegion:
    """Declares the attributes of a memory region; no values are assigned here."""

    # Declared as int; only annotated, no default value.
    addr: int
    # Declared as int; only annotated, no default value.
    len: int
    # Flags of the region; only annotated, no default value.
    flags: MemoryFlags
class MMU:
def __init__(self):

View File

@ -231,7 +231,7 @@ class RiscVPseudoOpToken(RiscVToken):
class RiscVTokenizer:
def __init__(self, input: RiscVInput):
self.input = input
self.tokens = []
self.tokens: List[RiscVToken] = []
def tokenize(self):
while self.input.has_next():
@ -296,4 +296,3 @@ class RiscVTokenizer:
def parse_comment(self):
# just consume the rest
self.input.consume(regex=REG_UNTIL_NEWLINE)

View File

@ -1,2 +1,10 @@
from .CPU import CPU, Registers, Syscall, SyscallInterface
from .tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
from .Tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, \
RiscVPseudoOpToken, TokenType
from .MMU import MemoryFlags, MemoryRegion, MMU
from .Exceptions import ASSERT_NOT_NULL, ASSERT_LEN, ASSERT_IN, ASSERT_EQ, ASSERT_NOT_IN
from .Executable import ExecutableParser, Executable

View File

@ -1,3 +1,3 @@
from .CPU import *
from .tokenizer import *
from .Tokenizer import *

7
run.py
View File

@ -26,5 +26,12 @@ loop:
tk = RiscVTokenizer(RiscVInput(example_progr))
tk.tokenize()
print("tokens:")
for token in tk.tokens:
print(token)
ep = ExecutableParser(tk)
ep.parse()
print(ep)