initial commit

4 years ago · 2cee60a17c
commit 2cee60a17c
14 changed files with 678 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+venv
+__pycache__
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/riscemu.iml" filepath="$PROJECT_DIR$/.idea/riscemu.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/riscemu.iml
+++ b/.idea/riscemu.iml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/README.md
+++ b/README.md
@ -0,0 +1,42 @@
+# RISC-V (userspace) emulator in Python
+
+Implementing a basic RISC-V emulator, aimed at being easily extendable.
+
+Instructions currently recognized by the tokenizer (their execution is not implemented yet):
+
+````
+lb, lh, lw, lbu, lhu, sb, sh, sw, sll, slli, srl, srli, sra, 
+srai, add, addi, sub, lui, auipc, xor, xori, or, ori, and, 
+andi, slt, slti, sltu, sltiu, beq, bne, blt, bge, bltu, bgeu, 
+j, jr, jal, jalr, ret, scall, break, nop
+````
+
+Current register implementation (should be all standard userspace registers): 
+```
+Registers[read,written](
+        zero=0x00000000 ra  =0x00000000 sp  =0x00000000 gp  =0x00000000 tp  =0x00000000 fp  =0x00000000
+        --------------- --------------- --------------- --------------- --------------- --------------- 
+        a0  =0x00000000 s0  =0x00000000 t0  =0x00000000 ft0 =0x00000000 fa0 =0x00000000 fs0 =0x00000000
+        a1  =0x00000000 s1  =0x00000000 t1  =0x00000000 ft1 =0x00000000 fa1 =0x00000000 fs1 =0x00000000
+        a2  =0x00000000 s2  =0x00000000 t2  =0x00000000 ft2 =0x00000000 fa2 =0x00000000 fs2 =0x00000000
+        a3  =0x00000000 s3  =0x00000000 t3  =0x00000000 ft3 =0x00000000 fa3 =0x00000000 fs3 =0x00000000
+        a4  =0x00000000 s4  =0x00000000 t4  =0x00000000 ft4 =0x00000000 fa4 =0x00000000 fs4 =0x00000000
+        a5  =0x00000000 s5  =0x00000000 t5  =0x00000000 ft5 =0x00000000 fa5 =0x00000000 fs5 =0x00000000
+        a6  =0x00000000 s6  =0x00000000 t6  =0x00000000 ft6 =0x00000000 fa6 =0x00000000 fs6 =0x00000000
+        a7  =0x00000000 s7  =0x00000000                 ft7 =0x00000000 fa7 =0x00000000 fs7 =0x00000000
+                        s8  =0x00000000                                                 fs8 =0x00000000
+                        s9  =0x00000000                                                 fs9 =0x00000000
+                        s10 =0x00000000                                                 fs10=0x00000000
+                        s11 =0x00000000                                                 fs11=0x00000000
+)
+```
+
+Current pseudo ops:
+```
+.align, .ascii, .asciiz, .byte, .data, .double, .extern,
+.float, .globl, .half, .kdata, .ktext, .set, .space, .text, .word
+```
+
+## Resources:
+  * Pseudo ops: https://www.codetd.com/article/8981522
+  
--- a/requirements.txt
+++ b/requirements.txt
--- a/riscemu/CPU.py
+++ b/riscemu/CPU.py
@ -0,0 +1,256 @@
+from dataclasses import dataclass
+from collections import defaultdict
+
+# NOTE(review): COLOR is not referenced anywhere in the visible code -- presumably
+# meant to switch the ANSI highlighting below on/off; confirm before relying on it
+COLOR = True
+
+# ANSI escape sequences (SGR codes) used to highlight registers in Registers.reg_repr()
+FMT_ORANGE = '\033[33m'  # SGR 33: yellow foreground
+FMT_GRAY = '\033[37m'  # SGR 37: white foreground
+FMT_BOLD = '\033[1m'  # SGR 1: bold
+FMT_NONE = '\033[0m'  # SGR 0: reset all attributes
+FMT_UNDERLINE = '\033[4m'  # SGR 4: underline
+
+
+class Registers:
+    def __init__(self, cpu: 'CPU'):
+        self.cpu = cpu
+        self.vals = defaultdict(lambda: 0)
+        self.last_mod = 'ft0'
+        self.last_access = 'a3'
+
+    def dump(self, small=False):
+        named_regs = [self.reg_repr(reg) for reg in Registers.named_registers()]
+
+        lines = [[] for i in range(12)]
+        if small:
+            regs = [('a', 8), ('s', 12), ('t', 7)]
+        else:
+            regs = [
+                ('a', 8),
+                ('s', 12),
+                ('t', 7),
+                ('ft', 8),
+                ('fa', 8),
+                ('fs', 12),
+            ]
+        for name, s in regs:
+            for i in range(12):
+                if i >= s:
+                    lines[i].append(" " * 15)
+                else:
+                    reg = '{}{}'.format(name, i)
+                    lines[i].append(self.reg_repr(reg))
+
+        print("Registers[{},{}](".format(
+            FMT_ORANGE + FMT_UNDERLINE + 'read' + FMT_NONE,
+            FMT_ORANGE + FMT_BOLD + 'written' + FMT_NONE
+        ))
+        if small:
+            print("\t" + " ".join(named_regs[0:3]))
+            print("\t" + " ".join(named_regs[3:]))
+            print("\t" + "--------------- " * 3)
+        else:
+            print("\t" + " ".join(named_regs))
+            print("\t" + "--------------- " * 6)
+        for line in lines:
+            print("\t" + " ".join(line))
+        print(")")
+
+    def reg_repr(self, reg):
+        txt = '{:4}=0x{:08X}'.format(reg, self.get(reg))
+        if reg == 'fp':
+            reg = 's0'
+        if reg == self.last_mod:
+            return FMT_ORANGE + FMT_BOLD + txt + FMT_NONE
+        if reg == self.last_access:
+            return FMT_ORANGE + FMT_UNDERLINE + txt + FMT_NONE
+        if reg == 'zero':
+            return txt
+        if self.get(reg) == 0 and reg not in Registers.named_registers():
+            return FMT_GRAY + txt + FMT_NONE
+        return txt
+
+    def set(self, reg, val):
+        if reg == 'zero':
+            print("[Registers.set] trying to set read-only register: {}".format(reg))
+            return False
+        if reg not in Registers.all_registers():
+            print("[Registers.set] invalid register name: {}".format(reg))
+            return False
+        # replace fp register with s1, as these are the same register
+        if reg == 'fp':
+            reg = 's1'
+        self.last_mod = reg
+        setattr(self, reg, val)
+
+    def get(self, reg):
+        if not reg in Registers.all_registers():
+            print("[Registers.get] invalid register name: {}".format(reg))
+            return 0
+        if reg == 'fp':
+            reg = 's0'
+        return self.vals[reg]
+
+    @staticmethod
+    def all_registers():
+        return ['zero', 'ra', 'sp', 'gp', 'tp', 's0', 'fp',
+                't0', 't1', 't2', 't3', 't4', 't5', 't6',
+                's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
+                'a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7',
+                'ft0', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7',
+                'fs0', 'fs1', 'fs2', 'fs3', 'fs4', 'fs5', 'fs6', 'fs7', 'fs8', 'fs9', 'fs10', 'fs11',
+                'fa0', 'fa1', 'fa2', 'fa3', 'fa4', 'fa5', 'fa6', 'fa7']
+
+    @staticmethod
+    def named_registers():
+        return ['zero', 'ra', 'sp', 'gp', 'tp', 'fp']
+
+class CPU:
+    def instruction_lb(self, instruction):
+        pass
+
+    def instruction_lh(self, instruction):
+        pass
+
+    def instruction_lw(self, instruction):
+        pass
+
+    def instruction_lbu(self, instruction):
+        pass
+
+    def instruction_lhu(self, instruction):
+        pass
+
+    def instruction_sb(self, instruction):
+        pass
+
+    def instruction_sh(self, instruction):
+        pass
+
+    def instruction_sw(self, instruction):
+        pass
+
+    def instruction_sll(self, instruction):
+        pass
+
+    def instruction_slli(self, instruction):
+        pass
+
+    def instruction_srl(self, instruction):
+        pass
+
+    def instruction_srli(self, instruction):
+        pass
+
+    def instruction_sra(self, instruction):
+        pass
+
+    def instruction_srai(self, instruction):
+        pass
+
+    def instruction_add(self, instruction):
+        pass
+
+    def instruction_addi(self, instruction):
+        pass
+
+    def instruction_sub(self, instruction):
+        pass
+
+    def instruction_lui(self, instruction):
+        pass
+
+    def instruction_auipc(self, instruction):
+        pass
+
+    def instruction_xor(self, instruction):
+        pass
+
+    def instruction_xori(self, instruction):
+        pass
+
+    def instruction_or(self, instruction):
+        pass
+
+    def instruction_ori(self, instruction):
+        pass
+
+    def instruction_and(self, instruction):
+        pass
+
+    def instruction_andi(self, instruction):
+        pass
+
+    def instruction_slt(self, instruction):
+        pass
+
+    def instruction_slti(self, instruction):
+        pass
+
+    def instruction_sltu(self, instruction):
+        pass
+
+    def instruction_sltiu(self, instruction):
+        pass
+
+    def instruction_beq(self, instruction):
+        pass
+
+    def instruction_bne(self, instruction):
+        pass
+
+    def instruction_blt(self, instruction):
+        pass
+
+    def instruction_bge(self, instruction):
+        pass
+
+    def instruction_bltu(self, instruction):
+        pass
+
+    def instruction_bgeu(self, instruction):
+        pass
+
+    def instruction_j(self, instruction):
+        pass
+
+    def instruction_jr(self, instruction):
+        pass
+
+    def instruction_jal(self, instruction):
+        pass
+
+    def instruction_jalr(self, instruction):
+        pass
+
+    def instruction_ret(self, instruction):
+        pass
+
+    def instruction_scall(self, instruction):
+        pass
+
+    def instruction_break(self, instruction):
+        pass
+
+    def instruction_nop(self, instruction):
+        pass
+
+    @staticmethod
+    def all_instructions():
+        for method in vars(CPU):
+            if method.startswith('instruction_'):
+                yield method[12:]
+
+
+@dataclass(frozen=True)
+class Syscall:
+    """
+    Immutable (frozen) record pairing a syscall id with a Registers object.
+    """
+    # numeric identifier of the syscall
+    id: int
+    # NOTE(review): presumably the register file at the time of the call -- confirm
+    registers: Registers
+
+
+class SyscallInterface:
+    """
+    Receiver for Syscall objects; handle_syscall is an empty stub here.
+    """
+    def handle_syscall(self, scall: Syscall):
+        # no-op placeholder; NOTE(review): presumably meant to be overridden -- confirm
+        pass
+
+
+a = Registers(None)
+a.dump()
--- a/riscemu/__init__.py
+++ b/riscemu/__init__.py
@ -0,0 +1,2 @@
+from .CPU import CPU, Registers, Syscall, SyscallInterface
+from .tokenizer import RiscVToken, RiscVInput, RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
--- a/riscemu/main.py
+++ b/riscemu/main.py
@ -0,0 +1,3 @@
+from .CPU import *
+from .tokenizer import *
+
--- a/riscemu/tokenizer.py
+++ b/riscemu/tokenizer.py
@ -0,0 +1,299 @@
+import re
+from enum import IntEnum
+from typing import List
+
+from . import CPU, Registers
+
+REGISTERS = list(Registers.all_registers())
+
+INSTRUCTIONS = list(CPU.all_instructions())
+
+PSEUDO_OPS = [
+    '.asciiz',
+    '.double',
+    '.extern',
+    '.global',
+    '.align',
+    '.float',
+    '.kdata',
+    '.ktext',
+    '.space',
+    '.ascii',
+    '.byte',
+    '.data',
+    '.half',
+    '.text',
+    '.word'
+    '.set',
+]
+
+COMMENT_START = ["#", ";"]
+
+REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):')
+
+REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')
+
+REG_WHITESPACE = re.compile(r'^\s*')
+
+REG_NONWHITESPACE = re.compile(r'^[^\s]*')
+
+REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')
+
+REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')
+
+REG_VALID_ARGUMENT = re.compile(
+    r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?'
+)
+
+REG_ARG_SPLIT = re.compile(r'^,[ \t]*')
+
+
+def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
+    pos = 0
+    last_piece = 0
+    pieces = []
+    in_quotes = False
+    if string is None:
+        return pieces
+    while pos < len(string):
+        match = at.match(string[pos:])
+        if match is not None:
+            if not in_quotes:
+                pieces.append(string[last_piece:pos])
+                pos += len(match.group(0))
+                last_piece = pos
+            else:
+                pos += len(match.group(0))
+        elif string[pos] in quotes:
+            in_quotes = not in_quotes
+            pos += 1
+        elif string[pos] in COMMENT_START and not in_quotes:  # entering comment
+            break
+        else:
+            pos += 1
+    if in_quotes:
+        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))
+    pieces.append(string[last_piece:pos])
+    return pieces
+
+
+class RiscVInput:
+    def __init__(self, content: str):
+        self.content = content
+        self.pos = 0
+        self.len = len(content)
+
+    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
+        at = self.pos + offset
+
+        if regex:
+            if not isinstance(regex, re.Pattern):
+                print("uncompiled regex passed to peek!")
+                reges = re.compile(regex)
+            match = regex.match(self.content[at:])
+            if match is None:
+                return None
+
+            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
+                print("Cannot peek regex group that does not start at match start!")
+                return None
+            return match.group(regex_group)
+        if text:
+            if self.content[at:].startswith(text):
+                return self.content[at:at + len(text)]
+            return False
+        return self.content[at:at + size]
+
+    def peek_one_of(self, options: List[str]):
+        for text in options:
+            if self.peek(text=text):
+                return text
+        return False
+
+    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
+        at = self.pos
+
+        if regex:
+            if not isinstance(regex, re.Pattern):
+                print("uncompiled regex passed to peek!")
+                regex = re.compile(regex)
+            match = regex.match(self.content[at:])
+            if match is None:
+                print("Regex matched none at {}!".format(self.context()))
+                return None
+
+            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
+                print("Cannot consume regex group that does not start at match start!")
+                return None
+            self.pos += len(match.group(regex_group))
+            return match.group(regex_group)
+
+        if text:
+            if self.content[at:].startswith(text):
+                self.pos += len(text)
+                return text
+            return None
+
+        self.pos += size
+        return self.content[at:at + size]
+
+    def consume_one_of(self, options: List[str]):
+        for text in options:
+            if self.peek(text=text):
+                return self.consume(text=text)
+        return False
+
+    def seek_newline(self):
+        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)
+
+    def consume_whitespace(self, linebreak=True):
+        if linebreak:
+            return self.consume(regex=REG_WHITESPACE)
+        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)
+
+    def has_next(self):
+        return self.pos < self.len
+
+    def context(self, size: int = 5):
+        """
+        returns a context string:
+        <local input before pos>|<local input after pos>
+        """
+        start = max(self.pos - size, 0)
+        end = min(self.pos + size, self.len - 1)
+
+        return self.content[start:self.pos] + '|' + self.content[self.pos:end]
+
+
+class TokenType(IntEnum):
+    SYMBOL = 0
+    INSTRUCTION = 1
+    PSEUDO_OP = 2
+
+    def __repr__(self):
+        return self.name
+
+    def __str__(self):
+        return self.name
+
+
+class RiscVToken:
+    type: TokenType
+
+    def __init__(self, t_type: TokenType):
+        self.type = t_type
+
+    def __repr__(self):
+        return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text())
+
+    def text(self):
+        """
+        create text representation of instruction
+        """
+        return "unknown"
+
+
+class RiscVInstructionToken(RiscVToken):
+    def __init__(self, name, args):
+        super().__init__(TokenType.INSTRUCTION)
+        self.instruction = name
+        self.args = args
+
+    def text(self):
+        if len(self.args) == 0:
+            return self.instruction
+        if len(self.args) == 1:
+            return "{} {}".format(self.instruction, self.args[0])
+        if len(self.args) == 2:
+            return "{} {}, {}".format(self.instruction, *self.args)
+        return "{} {}, {}, {}".format(self.instruction, *self.args)
+
+
+class RiscVSymbolToken(RiscVToken):
+    def __init__(self, name):
+        super().__init__(TokenType.SYMBOL)
+        self.name = name
+
+    def text(self):
+        return self.name
+
+
+class RiscVPseudoOpToken(RiscVToken):
+    def __init__(self, name, args):
+        super().__init__(TokenType.PSEUDO_OP)
+        self.name = name
+        self.args = args
+
+    def text(self):
+        return "{} {}".format(self.name, self.args)
+
+
+class RiscVTokenizer:
+    def __init__(self, input: RiscVInput):
+        self.input = input
+        self.tokens = []
+
+    def tokenize(self):
+        while self.input.has_next():
+            # remove leading whitespaces, place cursor at text start
+            self.input.consume_whitespace()
+
+            # check if we have a pseudo op
+            if self.input.peek_one_of(PSEUDO_OPS):
+                self.parse_pseudo_op()
+
+            # check if we have a symbol (like main:)
+            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
+                self.parse_symbol()
+
+            # comment
+            elif self.input.peek() in COMMENT_START:
+                self.parse_comment()
+
+            # must be instruction
+            elif self.input.peek_one_of(INSTRUCTIONS):
+                self.parse_instruction()
+            else:
+                token = self.input.peek(size=5)
+                print("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))
+                self.input.consume_whitespace()
+                print("After whitespace at: {}".format(repr(self.input.context())))
+            self.input.consume_whitespace()
+
+    def parse_pseudo_op(self):
+        name = self.input.consume_one_of(PSEUDO_OPS)
+        self.input.consume_whitespace(linebreak=False)
+
+        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
+        if not arg_str:
+            args = []
+        else:
+            args = split_accepting_quotes(arg_str)
+
+        self.tokens.append(RiscVPseudoOpToken(name[1:], args))
+
+    def parse_symbol(self):
+        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
+        self.tokens.append(RiscVSymbolToken(name[:-1]))
+        if not self.input.consume_whitespace():
+            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
+                self.input.context()))
+
+    def parse_instruction(self):
+        ins = self.input.consume_one_of(INSTRUCTIONS)
+        args = []
+        self.input.consume_whitespace(linebreak=False)
+        while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3:
+            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
+            args.append(arg)
+            if self.input.peek(text=','):
+                self.input.consume(text=',')
+                self.input.consume_whitespace(linebreak=False)
+            else:
+                break
+        self.tokens.append(RiscVInstructionToken(ins, args))
+
+    def parse_comment(self):
+        # just consume the rest
+        self.input.consume(regex=REG_UNTIL_NEWLINE)
+
--- a/run.py
+++ b/run.py
@ -0,0 +1,30 @@
+from riscemu import *
+
+if __name__ == '__main__':
+    example_progr = """
+            .data 0x200
+fibs:   .space 56
+
+        .text
+main:
+        add s1, zero, 0     # storage index
+        add s2, zero, 56    # last storage index
+        add t0, zero, 1     # t0 = F_{i}
+        add t1, zero, 1     # t1 = F_{i+1}
+loop:
+        sw  t0, fibs(s1)    # save 
+        add t2, t1, t0      # t2 = F_{i+2}
+        add t0, t1, 0       # t0 = t1
+        add t1, t2, 0       # t1 = t2
+        add s1, s1, 4       # increment storage pointer
+        blt s1, s2, loop    # loop as long as we did not reach array length
+        # exit gracefully
+        add a0, zero, 0
+        add a7, zero, 93
+        scall               # exit with code 0
+    """
+    tk = RiscVTokenizer(RiscVInput(example_progr))
+    tk.tokenize()
+
+    for token in tk.tokens:
+        print(token)