commit 2cee60a17c6698ac347e32e7033bbb508592cce8 Author: Anton Lydike Date: Fri Apr 16 23:52:13 2021 +0200 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d75edea --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +__pycache__ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d1e22ec --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a40226d --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/riscemu.iml b/.idea/riscemu.iml new file mode 100644 index 0000000..8b8c395 --- /dev/null +++ b/.idea/riscemu.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d24fa1 --- /dev/null +++ b/README.md @@ -0,0 +1,42 @@ +# RiscV (userspace) emulator in python + +Implementing a basic RISC-V emulator, aimed at being easily extendable. 
+ +Currently supported (but not implemented) instructions: + +```` +lb, lh, lw, lbu, lhu, sb, sh, sw, sll, slli, srl, srli, sra, +srai, add, addi, sub, lui, auipc, xor, xori, or, ori, and, +andi, slt, slti, sltu, sltiu, beq, bne, blt, bge, bltu, bgeu, +j, jr, jal, jalr, ret, scall, break, nop +```` + +Current register implementation (should be all standard userspace registers): +``` +Registers[read,written]( + zero=0x00000000 ra =0x00000000 sp =0x00000000 gp =0x00000000 tp =0x00000000 fp =0x00000000 + --------------- --------------- --------------- --------------- --------------- --------------- + a0 =0x00000000 s0 =0x00000000 t0 =0x00000000 ft0 =0x00000000 fa0 =0x00000000 fs0 =0x00000000 + a1 =0x00000000 s1 =0x00000000 t1 =0x00000000 ft1 =0x00000000 fa1 =0x00000000 fs1 =0x00000000 + a2 =0x00000000 s2 =0x00000000 t2 =0x00000000 ft2 =0x00000000 fa2 =0x00000000 fs2 =0x00000000 + a3 =0x00000000 s3 =0x00000000 t3 =0x00000000 ft3 =0x00000000 fa3 =0x00000000 fs3 =0x00000000 + a4 =0x00000000 s4 =0x00000000 t4 =0x00000000 ft4 =0x00000000 fa4 =0x00000000 fs4 =0x00000000 + a5 =0x00000000 s5 =0x00000000 t5 =0x00000000 ft5 =0x00000000 fa5 =0x00000000 fs5 =0x00000000 + a6 =0x00000000 s6 =0x00000000 t6 =0x00000000 ft6 =0x00000000 fa6 =0x00000000 fs6 =0x00000000 + a7 =0x00000000 s7 =0x00000000 ft7 =0x00000000 fa7 =0x00000000 fs7 =0x00000000 + s8 =0x00000000 fs8 =0x00000000 + s9 =0x00000000 fs9 =0x00000000 + s10 =0x00000000 fs10=0x00000000 + s11 =0x00000000 fs11=0x00000000 +) +``` + +Current pseudo ops: +``` +.align, .ascii, .asciiz, .byte, .data, .double, .extern, +.float, .globl, .half, .kdata, .ktext, .set, .space, .text, .word +``` + +## Resources: + * Pseudo ops: https://www.codetd.com/article/8981522 + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/riscemu/CPU.py b/riscemu/CPU.py new file mode 100644 index 0000000..969d72d --- /dev/null +++ b/riscemu/CPU.py @@ -0,0 +1,256 @@ +from 
from dataclasses import dataclass
from collections import defaultdict

COLOR = True

# ANSI escape sequences used to highlight registers in Registers.dump().
FMT_ORANGE = '\033[33m'
FMT_GRAY = '\033[37m'
FMT_BOLD = '\033[1m'
FMT_NONE = '\033[0m'
FMT_UNDERLINE = '\033[4m'


class Registers:
    """
    Register file of the emulated CPU.

    Values live in ``self.vals`` (missing entries read as 0). ``fp`` is an
    alias of ``s0``: both names access the same stored value.
    """

    def __init__(self, cpu: 'CPU'):
        self.cpu = cpu
        self.vals = defaultdict(lambda: 0)
        # NOTE(review): these initial values look like placeholders used to
        # exercise the highlighting in dump(); nothing in this module updates
        # last_access yet -- confirm before relying on it.
        self.last_mod = 'ft0'
        self.last_access = 'a3'

    def dump(self, small=False):
        """
        Print all registers as a table.

        :param small: if True, only print the integer registers (a, s, t)
            and break the named registers into two rows of three.
        """
        named_regs = [self.reg_repr(reg) for reg in Registers.named_registers()]

        # (name prefix, number of registers with that prefix), one per column
        if small:
            columns = [('a', 8), ('s', 12), ('t', 7)]
        else:
            columns = [('a', 8), ('s', 12), ('t', 7), ('ft', 8), ('fa', 8), ('fs', 12)]

        # the longest column has 12 entries; shorter columns are padded
        rows = [[] for _ in range(12)]
        for prefix, count in columns:
            for index, row in enumerate(rows):
                if index < count:
                    row.append(self.reg_repr('{}{}'.format(prefix, index)))
                else:
                    # same width as one formatted register cell
                    row.append(" " * 15)

        print("Registers[{},{}](".format(
            FMT_ORANGE + FMT_UNDERLINE + 'read' + FMT_NONE,
            FMT_ORANGE + FMT_BOLD + 'written' + FMT_NONE
        ))
        if small:
            print("\t" + " ".join(named_regs[0:3]))
            print("\t" + " ".join(named_regs[3:]))
            print("\t" + "--------------- " * 3)
        else:
            print("\t" + " ".join(named_regs))
            print("\t" + "--------------- " * 6)
        for row in rows:
            print("\t" + " ".join(row))
        print(")")

    def reg_repr(self, reg):
        """
        Format one register as ``name=0xVALUE``, wrapped in ANSI colors:
        bold orange if it equals ``last_mod``, underlined orange if it equals
        ``last_access``, gray if it holds 0 and is not a named register.
        """
        txt = '{:4}=0x{:08X}'.format(reg, self.get(reg))
        if reg == 'fp':
            reg = 's0'
        if reg == self.last_mod:
            return FMT_ORANGE + FMT_BOLD + txt + FMT_NONE
        if reg == self.last_access:
            return FMT_ORANGE + FMT_UNDERLINE + txt + FMT_NONE
        if reg == 'zero':
            return txt
        if self.get(reg) == 0 and reg not in Registers.named_registers():
            return FMT_GRAY + txt + FMT_NONE
        return txt

    def set(self, reg, val):
        """
        Write ``val`` to register ``reg``.

        :return: True if the value was stored, False if the register is
            read-only (``zero``) or unknown. A message is printed on failure.
        """
        if reg == 'zero':
            print("[Registers.set] trying to set read-only register: {}".format(reg))
            return False
        if reg not in Registers.all_registers():
            print("[Registers.set] invalid register name: {}".format(reg))
            return False
        # fp and s0 are the same register; get() resolves fp to s0 as well
        if reg == 'fp':
            reg = 's0'
        self.last_mod = reg
        # store where get() reads from (not as an instance attribute)
        self.vals[reg] = val
        return True

    def get(self, reg):
        """
        Read register ``reg``.

        :return: the stored value, or 0 (after printing a message) if the
            register name is unknown.
        """
        if reg not in Registers.all_registers():
            print("[Registers.get] invalid register name: {}".format(reg))
            return 0
        if reg == 'fp':
            reg = 's0'
        return self.vals[reg]

    @staticmethod
    def all_registers():
        """Return the list of every accepted register name."""
        return ['zero', 'ra', 'sp', 'gp', 'tp', 's0', 'fp',
                't0', 't1', 't2', 't3', 't4', 't5', 't6',
                's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
                'a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7',
                'ft0', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7',
                'fs0', 'fs1', 'fs2', 'fs3', 'fs4', 'fs5', 'fs6', 'fs7', 'fs8', 'fs9', 'fs10', 'fs11',
                'fa0', 'fa1', 'fa2', 'fa3', 'fa4', 'fa5', 'fa6', 'fa7']

    @staticmethod
    def named_registers():
        """Return the registers printed in the header row of dump()."""
        return ['zero', 'ra', 'sp', 'gp', 'tp', 'fp']


class CPU:
    """
    Instruction handlers of the emulated CPU.

    Every method named ``instruction_<name>`` declares the instruction
    ``<name>`` (see all_instructions()). All handlers are still empty stubs.
    """

    def instruction_lb(self, instruction):
        pass

    def instruction_lh(self, instruction):
        pass

    def instruction_lw(self, instruction):
        pass

    def instruction_lbu(self, instruction):
        pass

    def instruction_lhu(self, instruction):
        pass

    def instruction_sb(self, instruction):
        pass

    def instruction_sh(self, instruction):
        pass

    def instruction_sw(self, instruction):
        pass

    def instruction_sll(self, instruction):
        pass

    def instruction_slli(self, instruction):
        pass

    def instruction_srl(self, instruction):
        pass

    def instruction_srli(self, instruction):
        pass

    def instruction_sra(self, instruction):
        pass

    def instruction_srai(self, instruction):
        pass

    def instruction_add(self, instruction):
        pass

    def instruction_addi(self, instruction):
        pass

    def instruction_sub(self, instruction):
        pass

    def instruction_lui(self, instruction):
        pass

    def instruction_auipc(self, instruction):
        pass

    def instruction_xor(self, instruction):
        pass

    def instruction_xori(self, instruction):
        pass

    def instruction_or(self, instruction):
        pass

    def instruction_ori(self, instruction):
        pass

    def instruction_and(self, instruction):
        pass

    def instruction_andi(self, instruction):
        pass

    def instruction_slt(self, instruction):
        pass

    def instruction_slti(self, instruction):
        pass

    def instruction_sltu(self, instruction):
        pass

    def instruction_sltiu(self, instruction):
        pass

    def instruction_beq(self, instruction):
        pass

    def instruction_bne(self, instruction):
        pass

    def instruction_blt(self, instruction):
        pass

    def instruction_bge(self, instruction):
        pass

    def instruction_bltu(self, instruction):
        pass

    def instruction_bgeu(self, instruction):
        pass

    def instruction_j(self, instruction):
        pass

    def instruction_jr(self, instruction):
        pass

    def instruction_jal(self, instruction):
        pass

    def instruction_jalr(self, instruction):
        pass

    def instruction_ret(self, instruction):
        pass

    def instruction_scall(self, instruction):
        pass

    def instruction_break(self, instruction):
        pass

    def instruction_nop(self, instruction):
        pass

    @staticmethod
    def all_instructions():
        """Yield the name of every instruction, in definition order."""
        prefix = 'instruction_'
        for method in vars(CPU):
            if method.startswith(prefix):
                yield method[len(prefix):]


@dataclass(frozen=True)
class Syscall:
    """Immutable description of one system call request."""
    id: int
    registers: Registers


class SyscallInterface:
    """Handler for system calls; handle_syscall is still an empty stub."""

    def handle_syscall(self, scall: Syscall):
        pass


if __name__ == '__main__':
    # Manual smoke check. Guarded so that importing the module (the package
    # __init__ imports it) no longer prints a register dump.
    a = Registers(None)
    a.dump()

# NOTE: the same commit also adds riscemu/__init__.py (re-exporting CPU,
# Registers, Syscall, SyscallInterface and the tokenizer classes) and
# riscemu/main.py (star-imports of .CPU and .tokenizer).
import re
from enum import IntEnum
from typing import List

from . import CPU, Registers

REGISTERS = list(Registers.all_registers())

# Longest names first: keyword matching is prefix-based, so 'lbu' has to be
# tried before 'lb', 'addi' before 'add', 'jalr' before 'jal' before 'j', ...
INSTRUCTIONS = sorted(CPU.all_instructions(), key=len, reverse=True)

# Kept ordered longest-first for the same reason ('.asciiz' before '.ascii').
PSEUDO_OPS = [
    '.asciiz',
    '.double',
    '.extern',
    '.global',
    '.align',
    '.float',
    '.globl',  # spelling used in the project README
    '.kdata',
    '.ktext',
    '.space',
    '.ascii',
    '.byte',
    '.data',
    '.half',
    '.text',
    '.word',
    '.set',
]

COMMENT_START = ["#", ";"]

# NOTE: [A-Za-z] instead of [A-z]; the latter also matches [ \ ] ^ _ and `.
REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-Za-z_.][A-Za-z_0-9.]*[A-Za-z_0-9]|[A-Za-z_]):')

REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')

REG_WHITESPACE = re.compile(r'^\s*')

REG_NONWHITESPACE = re.compile(r'^[^\s]*')

REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')

REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')

# number or identifier, optionally followed by a parenthesised identifier,
# e.g. "42", "-0x1F", "loop", "fibs(s1)"
REG_VALID_ARGUMENT = re.compile(
    r'^([+-]?(0x[0-9A-Fa-f]+|[0-9]+)|[A-Za-z_.][A-Za-z0-9_.]*[A-Za-z_0-9]|[A-Za-z_])'
    r'(\(([A-Za-z_.][A-Za-z_0-9.]*[A-Za-z_0-9]|[A-Za-z_])\))?'
)

REG_ARG_SPLIT = re.compile(r'^,[ \t]*')

# characters that may continue a keyword; used for the word-boundary check
REG_WORD_CHAR = re.compile(r'[A-Za-z0-9_.]')


def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
    """
    Split an argument string at every match of ``at`` that is not inside quotes.

    Splitting stops at the first unquoted comment character. A quoted section
    is only closed by the same quote character that opened it. Escaped quotes
    are not supported.

    :param string: text to split, may be None or empty
    :param at: compiled regex (anchored with ^) matching one separator
    :param quotes: characters that open/close a quoted section
    :return: list of pieces (quotes are kept); empty list if there is nothing to split
    """
    pieces = []
    if not string:
        return pieces

    pos = 0
    piece_start = 0
    open_quote = None
    while pos < len(string):
        char = string[pos]
        if open_quote is not None:
            # inside quotes everything is literal until the matching quote
            if char == open_quote:
                open_quote = None
            pos += 1
            continue

        match = at.match(string[pos:])
        if match is not None and match.group(0):
            pieces.append(string[piece_start:pos])
            pos += len(match.group(0))
            piece_start = pos
        elif char in quotes:
            open_quote = char
            pos += 1
        elif char in COMMENT_START:  # entering comment
            break
        else:
            pos += 1

    tail = string[piece_start:pos]
    if open_quote is not None:
        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))
    else:
        # drop the whitespace between the last argument and a comment / line end
        tail = tail.rstrip()
    # a lone empty tail (empty or comment-only input) is not an argument
    if tail or pieces:
        pieces.append(tail)
    return pieces


class RiscVInput:
    """Cursor over the source text with peek/consume helpers."""

    def __init__(self, content: str):
        self.content = content
        self.pos = 0
        self.len = len(content)

    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Look at the input at the cursor (plus ``offset``) without moving it.

        Exactly one mode is used, checked in this order:

        * ``regex``: return the text of ``regex_group`` or None if there is no match
        * ``text``: return ``text`` if the input continues with it, else False
        * otherwise: return the next ``size`` characters
        """
        at = self.pos + offset

        if regex:
            if not isinstance(regex, re.Pattern):
                print("uncompiled regex passed to peek!")
                regex = re.compile(regex)
            match = regex.match(self.content[at:])
            if match is None:
                return None

            found = match.group(regex_group)
            if regex_group != 0 and (found is None or not match.group(0).startswith(found)):
                print("Cannot peek regex group that does not start at match start!")
                return None
            return found
        if text:
            if self.content[at:].startswith(text):
                return self.content[at:at + len(text)]
            return False
        return self.content[at:at + size]

    def peek_one_of(self, options: List[str]):
        """Return the first option the input continues with, or False."""
        for text in options:
            if self.peek(text=text):
                return text
        return False

    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Like peek(), but advance the cursor past whatever is returned.

        :return: the consumed text, or None if ``regex`` / ``text`` did not
            match (the cursor is not moved in that case)
        """
        at = self.pos

        if regex:
            if not isinstance(regex, re.Pattern):
                print("uncompiled regex passed to consume!")
                regex = re.compile(regex)
            match = regex.match(self.content[at:])
            if match is None:
                print("Regex matched none at {}!".format(self.context()))
                return None

            found = match.group(regex_group)
            if regex_group != 0 and (found is None or not match.group(0).startswith(found)):
                print("Cannot consume regex group that does not start at match start!")
                return None
            self.pos += len(found)
            return found

        if text:
            if self.content[at:].startswith(text):
                self.pos += len(text)
                return text
            return None

        # never move the cursor past the end of the input
        self.pos = min(at + size, self.len)
        return self.content[at:at + size]

    def consume_one_of(self, options: List[str]):
        """Consume and return the first option the input continues with, or False."""
        for text in options:
            if self.peek(text=text):
                return self.consume(text=text)
        return False

    def seek_newline(self):
        """Consume the whitespace captured in front of a newline, if any."""
        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)

    def consume_whitespace(self, linebreak=True):
        """
        Consume whitespace at the cursor.

        :param linebreak: if False, only spaces and tabs are consumed
        :return: the consumed whitespace (may be empty)
        """
        if linebreak:
            return self.consume(regex=REG_WHITESPACE)
        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)

    def has_next(self):
        """Return True while the cursor has not reached the end of the input."""
        return self.pos < self.len

    def context(self, size: int = 5):
        """
        Return up to ``size`` characters before and after the cursor,
        with a '|' marking the cursor position.
        """
        start = max(self.pos - size, 0)
        # slice ends are exclusive, so the upper bound is len, not len - 1
        end = min(self.pos + size, self.len)

        return self.content[start:self.pos] + '|' + self.content[self.pos:end]


class TokenType(IntEnum):
    """Kind of a token produced by the tokenizer."""
    SYMBOL = 0
    INSTRUCTION = 1
    PSEUDO_OP = 2

    def __repr__(self):
        return self.name

    def __str__(self):
        return self.name


class RiscVToken:
    """Base class of all tokens."""
    type: TokenType

    def __init__(self, t_type: TokenType):
        self.type = t_type

    def __repr__(self):
        return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text())

    def text(self):
        """
        create text representation of instruction
        """
        return "unknown"


class RiscVInstructionToken(RiscVToken):
    """An instruction with its (up to three) arguments."""

    def __init__(self, name, args):
        super().__init__(TokenType.INSTRUCTION)
        self.instruction = name
        self.args = args

    def text(self):
        """Return the instruction in assembly syntax, e.g. ``add a0, a1, a2``."""
        if len(self.args) == 0:
            return self.instruction
        # the tokenizer collects at most three arguments
        return "{} {}".format(self.instruction, ", ".join(self.args[:3]))


class RiscVSymbolToken(RiscVToken):
    """A symbol (label) declaration such as ``main:``; name excludes the colon."""

    def __init__(self, name):
        super().__init__(TokenType.SYMBOL)
        self.name = name

    def text(self):
        return self.name


class RiscVPseudoOpToken(RiscVToken):
    """A pseudo op such as ``.space 56``; name excludes the leading dot."""

    def __init__(self, name, args):
        super().__init__(TokenType.PSEUDO_OP)
        self.name = name
        self.args = args

    def text(self):
        return "{} {}".format(self.name, self.args)


class RiscVTokenizer:
    """Turns a RiscVInput into a list of tokens (``self.tokens``)."""

    def __init__(self, input: RiscVInput):
        self.input = input
        self.tokens = []

    def _peek_keyword(self, options: List[str]):
        """
        Return the option found at the cursor that is not followed by another
        word character (so 'j' does not match 'jump'), or None.
        """
        for text in options:
            if self.input.peek(text=text):
                following = self.input.peek(offset=len(text))
                if not following or not REG_WORD_CHAR.match(following):
                    return text
        return None

    def tokenize(self):
        """Tokenize the whole input, appending the tokens to ``self.tokens``."""
        while self.input.has_next():
            # remove leading whitespaces, place cursor at text start
            self.input.consume_whitespace()
            if not self.input.has_next():
                # only trailing whitespace was left
                break

            # check if we have a pseudo op
            if self._peek_keyword(PSEUDO_OPS):
                self.parse_pseudo_op()

            # check if we have a symbol (like main:)
            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
                self.parse_symbol()

            # comment
            elif self.input.peek() in COMMENT_START:
                self.parse_comment()

            # must be instruction
            elif self._peek_keyword(INSTRUCTIONS):
                self.parse_instruction()
            else:
                token = self.input.peek(size=5)
                print("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))
                # skip the rest of the line: guarantees progress (the cursor
                # is on a non-whitespace character) and avoids follow-up errors
                self.input.consume(regex=REG_UNTIL_NEWLINE)

    def parse_pseudo_op(self):
        """Parse a pseudo op and the rest of its line as arguments."""
        name = self.input.consume_one_of(PSEUDO_OPS)
        self.input.consume_whitespace(linebreak=False)

        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
        if not arg_str:
            args = []
        else:
            args = split_accepting_quotes(arg_str)

        self.tokens.append(RiscVPseudoOpToken(name[1:], args))

    def parse_symbol(self):
        """Parse a symbol declaration (``name:``)."""
        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
        self.tokens.append(RiscVSymbolToken(name[:-1]))
        if not self.input.consume_whitespace():
            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
                self.input.context()))

    def parse_instruction(self):
        """Parse an instruction followed by up to three comma separated arguments."""
        ins = self.input.consume_one_of(INSTRUCTIONS)
        args = []
        self.input.consume_whitespace(linebreak=False)
        while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3:
            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
            args.append(arg)
            if self.input.peek(text=','):
                self.input.consume(text=',')
                self.input.consume_whitespace(linebreak=False)
            else:
                break
        self.tokens.append(RiscVInstructionToken(ins, args))

    def parse_comment(self):
        """Skip a comment."""
        # just consume the rest
        self.input.consume(regex=REG_UNTIL_NEWLINE)


# Demo driver; in the original commit this is run.py (which star-imports riscemu).
if __name__ == '__main__':
    example_progr = """
        .data 0x200
fibs:   .space 56

        .text
main:
        add s1, zero, 0     # storage index
        add s2, zero, 56    # last storage index
        add t0, zero, 1     # t0 = F_{i}
        add t1, zero, 1     # t1 = F_{i+1}
loop:
        sw t0, fibs(s1)     # save
        add t2, t1, t0      # t2 = F_{i+2}
        add t0, t1, 0       # t0 = t1
        add t1, t2, 0       # t1 = t2
        add s1, s1, 4       # increment storage pointer
        blt s1, s2, loop    # loop as long as we did not reach array length
        # exit gracefully
        add a0, zero, 0
        add a7, zero, 93
        scall               # exit with code 0
    """
    tk = RiscVTokenizer(RiscVInput(example_progr))
    tk.tokenize()

    for token in tk.tokens:
        print(token)