parsing of tokenized asm into MemorySections works

float_support
Anton Lydike 4 years ago
parent 2cee60a17c
commit da4ae7c4c1

@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (riscemu)" project-jdk-type="Python SDK" />
 </project>

@@ -1,7 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>

@@ -0,0 +1,33 @@
class ParseException(BaseException):
    def __init__(self, msg, data=None):
        super().__init__()
        self.msg = msg
        self.data = data

    def message(self):
        return "{}(\"{}\", data={})".format(self.__class__.__name__, self.msg, self.data)


def ASSERT_EQ(a1, a2):
    if a1 != a2:
        raise ParseException("ASSERTION_FAILED: Expected elements to be equal!", (a1, a2))


def ASSERT_LEN(a1, size):
    if len(a1) != size:
        raise ParseException("ASSERTION_FAILED: Expected {} to be of length {}".format(a1, size), (a1, size))


def ASSERT_NOT_NULL(a1):
    if a1 is None:
        raise ParseException("ASSERTION_FAILED: Expected {} to be non null".format(a1), (a1,))


def ASSERT_NOT_IN(a1, a2):
    if a1 in a2:
        raise ParseException("ASSERTION_FAILED: Expected {} to not be in {}".format(a1, a2), (a1, a2))


def ASSERT_IN(a1, a2):
    if a1 not in a2:
        raise ParseException("ASSERTION_FAILED: Expected {} to be in {}".format(a1, a2), (a1, a2))
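
A minimal usage sketch of these helpers (the module path riscemu.Exceptions and the try/except framing are assumptions for illustration, not part of the commit):

from riscemu.Exceptions import ASSERT_LEN, ASSERT_IN, ParseException

args = ['.data']                                  # e.g. the arguments of a ".section .data" pseudo op
try:
    ASSERT_LEN(args, 1)                           # passes: exactly one argument
    ASSERT_IN('bss', ('data', 'rodata', 'text'))  # raises: 'bss' is not a recognized section
except ParseException as pe:
    print(pe.message())                           # ParseException("ASSERTION_FAILED: ...", data=(...))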

@@ -0,0 +1,33 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

from . import MemoryFlags, RiscVInstructionToken, RiscVTokenizer, RiscVSymbolToken, RiscVPseudoOpToken
from .Exceptions import *


@dataclass
class MemorySection:
    name: str
    flags: MemoryFlags
    size: int = 0
    start: int = -1
    content: List[bytearray] = field(default_factory=list)

    def add(self, data: bytearray):
        self.content.append(data)
        self.size += len(data)


@dataclass
class InstructionMemorySection(MemorySection):
    insn: List[RiscVInstructionToken] = field(default_factory=list)

    def add_insn(self, insn: RiscVInstructionToken):
        self.insn.append(insn)
        self.size += 4


@dataclass
class Executable:
    run_ptr: Tuple[str, int]
    sections: Dict[str, MemorySection]
    symbols: Dict[str, Tuple[str, int]]
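
A hedged sketch of how these dataclasses compose (the module path riscemu.Executable is assumed; values are illustrative only):

from riscemu.Executable import MemorySection, InstructionMemorySection, Executable, MemoryFlags

data = MemorySection('data', MemoryFlags(read_only=False, executable=False))
data.add(bytearray(16))                 # like ".space 16": size grows by 16
data.add(bytearray('hi\0', 'ascii'))    # like '.asciiz "hi"': size grows by 3

text = InstructionMemorySection('text', MemoryFlags(read_only=True, executable=True))
# text.add_insn(<RiscVInstructionToken>) would append an instruction and bump size by 4

exe = Executable(('text', 0), {'data': data, 'text': text}, {'_start': ('text', 0)})
print(exe.run_ptr, data.size)           # ('text', 0) 19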

@@ -0,0 +1,106 @@
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import *
from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
from typing import Dict, Tuple, List


def parse_numeric_argument(arg: str):
    if arg.startswith('0x') or arg.startswith('0X'):
        return int(arg, 16)
    return int(arg)


class ExecutableParser:
    tokenizer: RiscVTokenizer

    def __init__(self, tokenizer: RiscVTokenizer):
        self.instructions: List[RiscVInstructionToken] = list()
        self.symbols: Dict[str, Tuple[str, int]] = dict()
        self.sections: Dict[str, MemorySection] = dict()
        self.tokenizer = tokenizer
        self.active_section = None
        self.implicit_sections = False

    def parse(self):
        for token in self.tokenizer.tokens:
            if isinstance(token, RiscVInstructionToken):
                self.parse_instruction(token)
            elif isinstance(token, RiscVSymbolToken):
                self.handle_symbol(token)
            elif isinstance(token, RiscVPseudoOpToken):
                self.handle_pseudo_op(token)

    def get_execuable(self):
        start_ptr = ('text', 0)
        if '_start' in self.symbols:
            start_ptr = self.symbols['_start']
        elif 'main' in self.symbols:
            start_ptr = self.symbols['main']
        return Executable(start_ptr, self.sections, self.symbols)

    def parse_instruction(self, ins: RiscVInstructionToken):
        if self.active_section is None:
            self.op_text()
            self.implicit_sections = True
        ASSERT_EQ(self.active_section, 'text')
        sec = self.curr_sec()
        if isinstance(sec, InstructionMemorySection):
            sec.add_insn(ins)
        else:
            raise ParseException("SHOULD NOT BE REACHED")

    def handle_symbol(self, token: RiscVSymbolToken):
        ASSERT_NOT_IN(token.name, self.symbols)
        sec_pos = self.curr_sec().size
        self.symbols[token.name] = (self.active_section, sec_pos)

    def handle_pseudo_op(self, op: RiscVPseudoOpToken):
        name = 'op_' + op.name
        if hasattr(self, name):
            getattr(self, name)(op)
        else:
            raise ParseException("Unknown pseudo op: {}".format(op), (op,))

    ## Pseudo op implementations:
    def op_section(self, op: RiscVPseudoOpToken):
        ASSERT_LEN(op.args, 1)
        name = op.args[0][1:]
        ASSERT_IN(name, ('data', 'rodata', 'text'))
        getattr(self, 'op_' + name)(op)

    def op_text(self, op: RiscVPseudoOpToken = None):
        self.set_sec('text', MemoryFlags(read_only=True, executable=True), cls=InstructionMemorySection)

    def op_data(self, op: RiscVPseudoOpToken = None):
        self.set_sec('data', MemoryFlags(read_only=False, executable=False))

    def op_rodata(self, op: RiscVPseudoOpToken = None):
        self.set_sec('rodata', MemoryFlags(read_only=True, executable=False))

    def op_space(self, op: RiscVPseudoOpToken):
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        size = parse_numeric_argument(op.args[0])
        self.curr_sec().add(bytearray(size))

    def op_ascii(self, op: RiscVPseudoOpToken):
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        string = op.args[0][1:-1]
        self.curr_sec().add(bytearray(string, 'ascii'))

    def op_asciiz(self, op: RiscVPseudoOpToken):
        ASSERT_IN(self.active_section, ('data', 'rodata'))
        ASSERT_LEN(op.args, 1)
        string = op.args[0][1:-1]
        self.curr_sec().add(bytearray(string + '\0', 'ascii'))

    ## Section handler code
    def set_sec(self, name: str, flags: MemoryFlags, cls=MemorySection):
        if name not in self.sections:
            self.sections[name] = cls(name, flags)
        self.active_section = name

    def curr_sec(self):
        return self.sections[self.active_section]
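
An end-to-end sketch of the intended flow (the sample program and the printed values are illustrative; exact tokenizer syntax and the package-level imports are assumptions based on the __init__.py changes further down):

from riscemu import RiscVInput, RiscVTokenizer, ExecutableParser

prog = """
.data
msg:    .asciiz "Hi"

.text
_start: addi    a0, zero, 0
"""

tk = RiscVTokenizer(RiscVInput(prog))
tk.tokenize()

ep = ExecutableParser(tk)
ep.parse()
exe = ep.get_execuable()

print(exe.run_ptr)    # start pointer, taken from '_start' (or 'main') if the symbol exists
print(exe.sections)   # {'data': MemorySection(...), 'text': InstructionMemorySection(...)}
print(exe.symbols)    # {'msg': ('data', 0), '_start': ('text', 0)} -- (section, offset) pairs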

@@ -0,0 +1,15 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class MemoryFlags:
    read_only: bool
    executable: bool


class MemoryRegion:
    addr: int
    len: int
    flags: MemoryFlags


class MMU:
    def __init__(self):
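
MMU.__init__ is cut off in this hunk; the following is only a hedged sketch of how MemoryFlags and MemoryRegion could be combined (the check_write helper is hypothetical and not part of the commit):

from riscemu.MMU import MemoryFlags, MemoryRegion

flags = MemoryFlags(read_only=True, executable=True)   # frozen dataclass, safe to share between regions
region = MemoryRegion()
region.addr, region.len, region.flags = 0x100, 1024, flags

def check_write(r: MemoryRegion, addr: int) -> bool:
    # hypothetical helper: a write is only valid inside a region that is not read-only
    return r.addr <= addr < r.addr + r.len and not r.flags.read_only

print(check_write(region, 0x104))   # False: the region is read-only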

@@ -231,7 +231,7 @@ class RiscVPseudoOpToken(RiscVToken):
 class RiscVTokenizer:
     def __init__(self, input: RiscVInput):
         self.input = input
-        self.tokens = []
+        self.tokens: List[RiscVToken] = []

     def tokenize(self):
         while self.input.has_next():
@@ -296,4 +296,3 @@ class RiscVTokenizer:
     def parse_comment(self):
         # just consume the rest
         self.input.consume(regex=REG_UNTIL_NEWLINE)
-

@@ -1,2 +1,10 @@
 from .CPU import CPU, Registers, Syscall, SyscallInterface
-from .tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
+from .Tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, \
+    RiscVPseudoOpToken, TokenType
+from .MMU import MemoryFlags, MemoryRegion, MMU
+from .Exceptions import ASSERT_NOT_NULL, ASSERT_LEN, ASSERT_IN, ASSERT_EQ, ASSERT_NOT_IN
+from .Executable import Executable
+from .ExecutableParser import ExecutableParser

@@ -1,3 +1,3 @@
 from .CPU import *
-from .tokenizer import *
+from .Tokenizer import *

@@ -26,5 +26,12 @@ loop:
 tk = RiscVTokenizer(RiscVInput(example_progr))
 tk.tokenize()
+print("tokens:")
 for token in tk.tokens:
     print(token)
+
+ep = ExecutableParser(tk)
+ep.parse()
+print(ep)
