From 0488a9d6bc3efdb23e51eeae9f67573a58164ee6 Mon Sep 17 00:00:00 2001 From: Anton Lydike Date: Tue, 18 Jan 2022 21:08:07 +0100 Subject: [PATCH] finished basic RISC-V parser --- .idea/riscemu.iml | 1 + riscemu/__init__.py | 4 +- riscemu/assembler.py | 71 +++++++++++++------ riscemu/base_types.py | 8 ++- riscemu/exceptions.py | 4 +- riscemu/helpers.py | 4 +- riscemu/instructions/InstructionSet.py | 17 ++--- riscemu/instructions/RV32A.py | 24 +++---- riscemu/instructions/RV32I.py | 96 +++++++++++++------------- riscemu/instructions/RV32M.py | 16 ++--- riscemu/parser.py | 27 +++++++- riscemu/tokenizer.py | 47 +++++++++++-- riscemu/types.py | 8 ++- test/__init__.py | 2 + test/test_helpers.py | 2 +- test/test_tokenizer.py | 49 ++++++++++++- 16 files changed, 260 insertions(+), 120 deletions(-) diff --git a/.idea/riscemu.iml b/.idea/riscemu.iml index 74d515a..8ed6672 100644 --- a/.idea/riscemu.iml +++ b/.idea/riscemu.iml @@ -2,6 +2,7 @@ + diff --git a/riscemu/__init__.py b/riscemu/__init__.py index d580e2b..6319b86 100644 --- a/riscemu/__init__.py +++ b/riscemu/__init__.py @@ -11,7 +11,7 @@ It contains everything needed to run assembly files, so you don't need any custo from .exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \ ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException -from .base_types import Executable, LoadedExecutable, LoadedMemorySection +#from .base_types import Executable, LoadedExecutable, LoadedMemorySection from .instructions import * @@ -22,6 +22,8 @@ from .CPU import CPU from .config import RunConfig +from .parser import tokenize, parse_tokens, parse_program_from_file + __author__ = "Anton Lydike " __copyright__ = "Copyright 2021 Anton Lydike" __version__ = '1.0.0' \ No newline at end of file diff --git a/riscemu/assembler.py b/riscemu/assembler.py index c8c7546..1ec9731 100644 --- a/riscemu/assembler.py +++ b/riscemu/assembler.py @@ -1,16 +1,14 @@ -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, List from enum import Enum, auto from typing import Optional, Tuple, Union -from helpers import parse_numeric_argument -from .base_types import Program, T_RelativeAddress, InstructionContext +from .helpers import parse_numeric_argument, align_addr, int_to_bytes +from .base_types import Program, T_RelativeAddress, InstructionContext, Instruction from .colors import FMT_PARSE, FMT_NONE -from .exceptions import ParseException -from .helpers import ASSERT_LEN +from .exceptions import ParseException, ASSERT_LEN, ASSERT_NOT_NULL from .tokenizer import Token from .types import BinaryDataMemorySection, InstructionMemorySection - INSTRUCTION_SECTION_NAMES = ('.text', '.init', '.fini') @@ -21,13 +19,25 @@ class MemorySectionType(Enum): class CurrentSection: name: str - data: Union[list, bytearray] + data: Union[List[Instruction], bytearray] type: MemorySectionType + base: int + + def __init__(self, name: str, type: MemorySectionType, base: int = 0): + self.name = name + self.type = type + self.base = base + if self.type == MemorySectionType.Data: + self.data = bytearray() + elif self.type == MemorySectionType.Instructions: + self.data = list() + else: + raise ParseException("Unknown section type: {}".format(type)) def current_address(self) -> T_RelativeAddress: if self.type == MemorySectionType.Data: - return len(self.data) - return len(self.data) * 4 + return len(self.data) + self.base + return len(self.data) * 4 + self.base def __repr__(self): return "{}(name={},data={},type={})".format( @@ -47,18 +57,27 @@ class ParseContext: self.section = None def finalize(self) -> Program: - self.finalize_section() + self._finalize_section() return self.program - def finalize_section(self): + def _finalize_section(self): if self.section is None: return if self.section.type == MemorySectionType.Data: - section = BinaryDataMemorySection(self.section.data, self.section.name, self.context) + section = BinaryDataMemorySection(self.section.data, self.section.name, self.context, self.program) self.program.add_section(section) elif self.section.type == MemorySectionType.Instructions: - section = InstructionMemorySection(self.section.data, self.section.name, self.context) + section = InstructionMemorySection(self.section.data, self.section.name, self.context, self.program) self.program.add_section(section) + self.section = None + + def new_section(self, name: str, type: MemorySectionType): + base = 0 + if self.section is not None: + base = align_addr(self.section.current_address(), 4) + print("base at {}".format(base)) + self._finalize_section() + self.section = CurrentSection(name, type, base) def __repr__(self): return "{}(\n\tsetion={},\n\tprogram={}\n)".format( @@ -100,21 +119,20 @@ class AssemblerDirectives: @classmethod def op_section(cls, token: Token, args: Tuple[str], context: ParseContext): ASSERT_LEN(args, 1) - context.finalize_section() - if get_section_base_name(args[0]) in INSTRUCTION_SECTION_NAMES: - context.section.type = MemorySectionType.Instructions - context.section.data = list() + context.new_section(args[0], MemorySectionType.Instructions) else: - context.section.type = MemorySectionType.Data - context.section.data = bytearray() - context.section.name = args[0] + context.new_section(args[0], MemorySectionType.Data) @classmethod def op_globl(cls, token: Token, args: Tuple[str], context: ParseContext): ASSERT_LEN(args, 1) context.program.global_labels.add(args[0]) + @classmethod + def op_global(cls, token: Token, args: Tuple[str], context: ParseContext): + cls.op_globl(token, args, context) + @classmethod def op_equ(cls, token: Token, args: Tuple[str], context: ParseContext): ASSERT_LEN(args, 2) @@ -122,6 +140,14 @@ class AssemblerDirectives: value = parse_numeric_argument(args[1]) context.context.labels[name] = value + @classmethod + def op_space(cls, token: Token, args: Tuple[str], context: ParseContext): + ASSERT_LEN(args, 1) + ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data) + + size = parse_numeric_argument(args[0]) + cls.add_bytes(size, None, context) + @classmethod def op_zero(cls, token: Token, args: Tuple[str], context: ParseContext): ASSERT_LEN(args, 1) @@ -130,11 +156,14 @@ class AssemblerDirectives: cls.add_bytes(size, bytearray(size), context) @classmethod - def add_bytes(cls, size: int, content: Union[None, int, bytearray], context: ParseContext): + def add_bytes(cls, size: int, content: Union[None, int, bytearray], context: ParseContext, unsigned=False): ASSERT_IN_SECTION_TYPE(context, MemorySectionType.Data) if content is None: content = bytearray(size) + if isinstance(context, int): + content = int_to_bytes(content, size, unsigned) + context.section.data += content @classmethod def add_text(cls, text: str, context: ParseContext, zero_terminate: bool = True): diff --git a/riscemu/base_types.py b/riscemu/base_types.py index 0bf92b7..43dfb73 100644 --- a/riscemu/base_types.py +++ b/riscemu/base_types.py @@ -7,12 +7,14 @@ This file contains base classes which represent loaded programs """ import re -from abc import ABC +from abc import ABC, abstractmethod +from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Set -from collections import defaultdict -from .helpers import * +from .colors import FMT_MEM, FMT_NONE, FMT_UNDERLINE, FMT_ORANGE +from .exceptions import ParseException +from .helpers import format_bytes T_RelativeAddress = int T_AbsoluteAddress = int diff --git a/riscemu/exceptions.py b/riscemu/exceptions.py index b75b358..fd6f130 100644 --- a/riscemu/exceptions.py +++ b/riscemu/exceptions.py @@ -4,8 +4,6 @@ RiscEmu (c) 2021 Anton Lydike SPDX-License-Identifier: MIT """ -import typing - from abc import abstractmethod from .base_types import Instruction from .colors import * @@ -21,7 +19,7 @@ class RiscemuBaseException(BaseException): class ParseException(RiscemuBaseException): def __init__(self, msg, data=None): - super().__init__() + super().__init__(msg, data) self.msg = msg self.data = data diff --git a/riscemu/helpers.py b/riscemu/helpers.py index ad8ac06..9946b5d 100644 --- a/riscemu/helpers.py +++ b/riscemu/helpers.py @@ -6,7 +6,7 @@ SPDX-License-Identifier: MIT from math import log10, ceil from .exceptions import * -from typing import Iterable, Iterator, TypeVar, Generic, List +from typing import Iterable, Iterator, TypeVar, Generic, List, Optional def align_addr(addr: int, to_bytes: int = 8) -> int: @@ -124,7 +124,7 @@ class Peekable(Generic[T], Iterator[T]): return self.cache.pop() return next(self.iterable) - def peek(self) -> T: + def peek(self) -> Optional[T]: try: if self.cache: return self.cache[0] diff --git a/riscemu/instructions/InstructionSet.py b/riscemu/instructions/InstructionSet.py index 6666dc9..b6a19b7 100644 --- a/riscemu/instructions/InstructionSet.py +++ b/riscemu/instructions/InstructionSet.py @@ -8,8 +8,9 @@ from typing import Tuple, Callable, Dict from abc import ABC from ..CPU import CPU -from ..helpers import ASSERT_LEN, ASSERT_IN, to_unsigned -from ..base_types import LoadedInstruction +from ..helpers import to_unsigned +from ..exceptions import ASSERT_LEN, ASSERT_IN +from ..base_types import Instruction class InstructionSet(ABC): @@ -30,7 +31,7 @@ class InstructionSet(ABC): self.name = self.__class__.__name__ self.cpu = cpu - def load(self) -> Dict[str, Callable[['LoadedInstruction'], None]]: + def load(self) -> Dict[str, Callable[['Instruction'], None]]: """ This is called by the CPU once it instantiates this instruction set @@ -51,7 +52,7 @@ class InstructionSet(ABC): if member.startswith('instruction_'): yield member[12:].replace('_', '.'), getattr(self, member) - def parse_mem_ins(self, ins: 'LoadedInstruction') -> Tuple[str, int]: + def parse_mem_ins(self, ins: 'Instruction') -> Tuple[str, int]: """ parses both rd, rs, imm and rd, imm(rs) argument format and returns (rd, imm+rs1) (so a register and address tuple for memory instructions) @@ -69,7 +70,7 @@ class InstructionSet(ABC): rd = ins.get_reg(0) return rd, rs + imm - def parse_rd_rs_rs(self, ins: 'LoadedInstruction', signed=True) -> Tuple[str, int, int]: + def parse_rd_rs_rs(self, ins: 'Instruction', signed=True) -> Tuple[str, int, int]: """ Assumes the command is in rd, rs1, rs2 format Returns the name of rd, and the values in rs1 and rs2 @@ -84,7 +85,7 @@ class InstructionSet(ABC): to_unsigned(self.get_reg_content(ins, 1)), \ to_unsigned(self.get_reg_content(ins, 2)) - def parse_rd_rs_imm(self, ins: 'LoadedInstruction', signed=True) -> Tuple[str, int, int]: + def parse_rd_rs_imm(self, ins: 'Instruction', signed=True) -> Tuple[str, int, int]: """ Assumes the command is in rd, rs, imm format Returns the name of rd, the value in rs and the immediate imm @@ -99,7 +100,7 @@ class InstructionSet(ABC): to_unsigned(self.get_reg_content(ins, 1)), \ to_unsigned(ins.get_imm(2)) - def parse_rs_rs_imm(self, ins: 'LoadedInstruction', signed=True) -> Tuple[int, int, int]: + def parse_rs_rs_imm(self, ins: 'Instruction', signed=True) -> Tuple[int, int, int]: """ Assumes the command is in rs1, rs2, imm format Returns the values in rs1, rs2 and the immediate imm @@ -113,7 +114,7 @@ class InstructionSet(ABC): to_unsigned(self.get_reg_content(ins, 1)), \ to_unsigned(ins.get_imm(2)) - def get_reg_content(self, ins: 'LoadedInstruction', ind: int) -> int: + def get_reg_content(self, ins: 'Instruction', ind: int) -> int: """ get the register name from ins and then return the register contents """ diff --git a/riscemu/instructions/RV32A.py b/riscemu/instructions/RV32A.py index 3de2383..ba6a8a6 100644 --- a/riscemu/instructions/RV32A.py +++ b/riscemu/instructions/RV32A.py @@ -1,4 +1,4 @@ -from .InstructionSet import InstructionSet, LoadedInstruction +from .InstructionSet import InstructionSet, Instruction from ..exceptions import INS_NOT_IMPLEMENTED from ..helpers import int_from_bytes, int_to_bytes, to_unsigned, to_signed @@ -10,13 +10,13 @@ class RV32A(InstructionSet): for this? """ - def instruction_lr_w(self, ins: 'LoadedInstruction'): + def instruction_lr_w(self, ins: 'Instruction'): INS_NOT_IMPLEMENTED(ins) - def instruction_sc_w(self, ins: 'LoadedInstruction'): + def instruction_sc_w(self, ins: 'Instruction'): INS_NOT_IMPLEMENTED(ins) - def instruction_amoswap_w(self, ins: 'LoadedInstruction'): + def instruction_amoswap_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) if dest == 'zero': self.mmu.write(addr, int_to_bytes(addr, 4)) @@ -25,37 +25,37 @@ class RV32A(InstructionSet): self.mmu.write(addr, int_to_bytes(val, 4)) self.regs.set(dest, old) - def instruction_amoadd_w(self, ins: 'LoadedInstruction'): + def instruction_amoadd_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(old + val, 4)) self.regs.set(dest, old) - def instruction_amoand_w(self, ins: 'LoadedInstruction'): + def instruction_amoand_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(old & val, 4)) self.regs.set(dest, old) - def instruction_amoor_w(self, ins: 'LoadedInstruction'): + def instruction_amoor_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(old | val, 4)) self.regs.set(dest, old) - def instruction_amoxor_w(self, ins: 'LoadedInstruction'): + def instruction_amoxor_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(old ^ val, 4)) self.regs.set(dest, old) - def instruction_amomax_w(self, ins: 'LoadedInstruction'): + def instruction_amomax_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(max(old, val), 4)) self.regs.set(dest, old) - def instruction_amomaxu_w(self, ins: 'LoadedInstruction'): + def instruction_amomaxu_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) val = to_unsigned(val) old = int_from_bytes(self.mmu.read(addr, 4), unsigned=True) @@ -63,13 +63,13 @@ class RV32A(InstructionSet): self.mmu.write(addr, int_to_bytes(to_signed(max(old, val)), 4)) self.regs.set(dest, old) - def instruction_amomin_w(self, ins: 'LoadedInstruction'): + def instruction_amomin_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) old = int_from_bytes(self.mmu.read(addr, 4)) self.mmu.write(addr, int_to_bytes(min(old, val), 4)) self.regs.set(dest, old) - def instruction_amominu_w(self, ins: 'LoadedInstruction'): + def instruction_amominu_w(self, ins: 'Instruction'): dest, addr, val = self.parse_rd_rs_rs(ins) val = to_unsigned(val) old = int_from_bytes(self.mmu.read(addr, 4), unsigned=True) diff --git a/riscemu/instructions/RV32I.py b/riscemu/instructions/RV32I.py index cb19af6..5a30b5f 100644 --- a/riscemu/instructions/RV32I.py +++ b/riscemu/instructions/RV32I.py @@ -11,7 +11,7 @@ from ..colors import FMT_DEBUG, FMT_NONE from ..debug import launch_debug_session from ..exceptions import LaunchDebuggerException from ..syscall import Syscall -from ..base_types import LoadedInstruction +from ..base_types import Instruction class RV32I(InstructionSet): @@ -23,39 +23,39 @@ class RV32I(InstructionSet): See https://maxvytech.com/images/RV32I-11-2018.pdf for a more detailed overview """ - def instruction_lb(self, ins: 'LoadedInstruction'): + def instruction_lb(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.regs.set(rd, int_from_bytes(self.mmu.read(addr, 1))) - def instruction_lh(self, ins: 'LoadedInstruction'): + def instruction_lh(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.regs.set(rd, int_from_bytes(self.mmu.read(addr, 2))) - def instruction_lw(self, ins: 'LoadedInstruction'): + def instruction_lw(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.regs.set(rd, int_from_bytes(self.mmu.read(addr, 4))) - def instruction_lbu(self, ins: 'LoadedInstruction'): + def instruction_lbu(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.regs.set(rd, int_from_bytes(self.mmu.read(addr, 1), unsigned=True)) - def instruction_lhu(self, ins: 'LoadedInstruction'): + def instruction_lhu(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.regs.set(rd, int_from_bytes(self.mmu.read(addr, 2), unsigned=True)) - def instruction_sb(self, ins: 'LoadedInstruction'): + def instruction_sb(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.mmu.write(addr, 1, int_to_bytes(self.regs.get(rd), 1)) - def instruction_sh(self, ins: 'LoadedInstruction'): + def instruction_sh(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.mmu.write(addr, 2, int_to_bytes(self.regs.get(rd), 2)) - def instruction_sw(self, ins: 'LoadedInstruction'): + def instruction_sw(self, ins: 'Instruction'): rd, addr = self.parse_mem_ins(ins) self.mmu.write(addr, 4, int_to_bytes(self.regs.get(rd), 4)) - def instruction_sll(self, ins: 'LoadedInstruction'): + def instruction_sll(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -65,7 +65,7 @@ class RV32I(InstructionSet): to_signed(to_unsigned(self.regs.get(src1)) << (self.regs.get(src2) & 0b11111)) ) - def instruction_slli(self, ins: 'LoadedInstruction'): + def instruction_slli(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -75,7 +75,7 @@ class RV32I(InstructionSet): to_signed(to_unsigned(self.regs.get(src1)) << (imm & 0b11111)) ) - def instruction_srl(self, ins: 'LoadedInstruction'): + def instruction_srl(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -85,7 +85,7 @@ class RV32I(InstructionSet): to_signed(to_unsigned(self.regs.get(src1)) >> (self.regs.get(src2) & 0b11111)) ) - def instruction_srli(self, ins: 'LoadedInstruction'): + def instruction_srli(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -95,7 +95,7 @@ class RV32I(InstructionSet): to_signed(to_unsigned(self.regs.get(src1)) >> (imm & 0b11111)) ) - def instruction_sra(self, ins: 'LoadedInstruction'): + def instruction_sra(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -105,7 +105,7 @@ class RV32I(InstructionSet): self.regs.get(src1) >> (self.regs.get(src2) & 0b11111) ) - def instruction_srai(self, ins: 'LoadedInstruction'): + def instruction_srai(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 3) dst = ins.get_reg(0) src1 = ins.get_reg(1) @@ -115,7 +115,7 @@ class RV32I(InstructionSet): self.regs.get(src1) >> (imm & 0b11111) ) - def instruction_add(self, ins: 'LoadedInstruction'): + def instruction_add(self, ins: 'Instruction'): dst = "" if self.cpu.conf.add_accept_imm: try: @@ -130,139 +130,139 @@ class RV32I(InstructionSet): rs1 + rs2 ) - def instruction_addi(self, ins: 'LoadedInstruction'): + def instruction_addi(self, ins: 'Instruction'): dst, rs1, imm = self.parse_rd_rs_imm(ins) self.regs.set( dst, rs1 + imm ) - def instruction_sub(self, ins: 'LoadedInstruction'): + def instruction_sub(self, ins: 'Instruction'): dst, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( dst, rs1 - rs2 ) - def instruction_lui(self, ins: 'LoadedInstruction'): + def instruction_lui(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) reg = ins.get_reg(0) imm = ins.get_imm(1) self.regs.set(reg, imm << 12) - def instruction_auipc(self, ins: 'LoadedInstruction'): + def instruction_auipc(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) reg = ins.get_reg(0) imm = to_unsigned(ins.get_imm(1)) self.regs.set(reg, self.pc + (imm << 12)) - def instruction_xor(self, ins: 'LoadedInstruction'): + def instruction_xor(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 ^ rs2 ) - def instruction_xori(self, ins: 'LoadedInstruction'): + def instruction_xori(self, ins: 'Instruction'): rd, rs1, imm = self.parse_rd_rs_imm(ins) self.regs.set( rd, rs1 ^ imm ) - def instruction_or(self, ins: 'LoadedInstruction'): + def instruction_or(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 | rs2 ) - def instruction_ori(self, ins: 'LoadedInstruction'): + def instruction_ori(self, ins: 'Instruction'): rd, rs1, imm = self.parse_rd_rs_imm(ins) self.regs.set( rd, rs1 | imm ) - def instruction_and(self, ins: 'LoadedInstruction'): + def instruction_and(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 & rs2 ) - def instruction_andi(self, ins: 'LoadedInstruction'): + def instruction_andi(self, ins: 'Instruction'): rd, rs1, imm = self.parse_rd_rs_imm(ins) self.regs.set( rd, rs1 & imm ) - def instruction_slt(self, ins: 'LoadedInstruction'): + def instruction_slt(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, int(rs1 < rs2) ) - def instruction_slti(self, ins: 'LoadedInstruction'): + def instruction_slti(self, ins: 'Instruction'): rd, rs1, imm = self.parse_rd_rs_imm(ins) self.regs.set( rd, int(rs1 < imm) ) - def instruction_sltu(self, ins: 'LoadedInstruction'): + def instruction_sltu(self, ins: 'Instruction'): dst, rs1, rs2 = self.parse_rd_rs_rs(ins, signed=False) self.regs.set( dst, int(rs1 < rs2) ) - def instruction_sltiu(self, ins: 'LoadedInstruction'): + def instruction_sltiu(self, ins: 'Instruction'): dst, rs1, imm = self.parse_rd_rs_imm(ins, signed=False) self.regs.set( dst, int(rs1 < imm) ) - def instruction_beq(self, ins: 'LoadedInstruction'): + def instruction_beq(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins) if rs1 == rs2: self.pc = dst - def instruction_bne(self, ins: 'LoadedInstruction'): + def instruction_bne(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins) if rs1 != rs2: self.pc = dst - def instruction_blt(self, ins: 'LoadedInstruction'): + def instruction_blt(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins) if rs1 < rs2: self.pc = dst - def instruction_bge(self, ins: 'LoadedInstruction'): + def instruction_bge(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins) if rs1 >= rs2: self.pc = dst - def instruction_bltu(self, ins: 'LoadedInstruction'): + def instruction_bltu(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins, signed=False) if rs1 < rs2: self.pc = dst - def instruction_bgeu(self, ins: 'LoadedInstruction'): + def instruction_bgeu(self, ins: 'Instruction'): rs1, rs2, dst = self.parse_rs_rs_imm(ins, signed=False) if rs1 >= rs2: self.pc = dst # technically deprecated - def instruction_j(self, ins: 'LoadedInstruction'): + def instruction_j(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 1) addr = ins.get_imm(0) self.pc = addr - def instruction_jal(self, ins: 'LoadedInstruction'): + def instruction_jal(self, ins: 'Instruction'): reg = 'ra' # default register is ra if len(ins.args) == 1: addr = ins.get_imm(0) @@ -273,29 +273,29 @@ class RV32I(InstructionSet): self.regs.set(reg, self.pc) self.pc = addr - def instruction_jalr(self, ins: 'LoadedInstruction'): + def instruction_jalr(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) reg = ins.get_reg(0) addr = ins.get_imm(1) self.regs.set(reg, self.pc) self.pc = addr - def instruction_ret(self, ins: 'LoadedInstruction'): + def instruction_ret(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 0) self.pc = self.regs.get('ra') - def instruction_ecall(self, ins: 'LoadedInstruction'): + def instruction_ecall(self, ins: 'Instruction'): self.instruction_scall(ins) - def instruction_ebreak(self, ins: 'LoadedInstruction'): + def instruction_ebreak(self, ins: 'Instruction'): self.instruction_sbreak(ins) - def instruction_scall(self, ins: 'LoadedInstruction'): + def instruction_scall(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 0) syscall = Syscall(self.regs.get('a7'), self.cpu) self.cpu.syscall_int.handle_syscall(syscall) - def instruction_sbreak(self, ins: 'LoadedInstruction'): + def instruction_sbreak(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 0) if self.cpu.active_debug: print(FMT_DEBUG + "Debug instruction encountered at 0x{:08X}".format(self.pc - 1) + FMT_NONE) @@ -307,23 +307,23 @@ class RV32I(InstructionSet): "Debug instruction encountered at 0x{:08X}".format(self.pc - 1) ) - def instruction_nop(self, ins: 'LoadedInstruction'): + def instruction_nop(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 0) pass - def instruction_li(self, ins: 'LoadedInstruction'): + def instruction_li(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) reg = ins.get_reg(0) immediate = ins.get_imm(1) self.regs.set(reg, immediate) - def instruction_la(self, ins: 'LoadedInstruction'): + def instruction_la(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) reg = ins.get_reg(0) immediate = ins.get_imm(1) self.regs.set(reg, immediate) - def instruction_mv(self, ins: 'LoadedInstruction'): + def instruction_mv(self, ins: 'Instruction'): ASSERT_LEN(ins.args, 2) rd, rs = ins.get_reg(0), ins.get_reg(1) self.regs.set(rd, self.regs.get(rs)) diff --git a/riscemu/instructions/RV32M.py b/riscemu/instructions/RV32M.py index 5b1412f..31b9341 100644 --- a/riscemu/instructions/RV32M.py +++ b/riscemu/instructions/RV32M.py @@ -12,48 +12,48 @@ class RV32M(InstructionSet): """ The RV32M Instruction set, containing multiplication and division instructions """ - def instruction_mul(self, ins: 'LoadedInstruction'): + def instruction_mul(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 * rs2 ) - def instruction_mulh(self, ins: 'LoadedInstruction'): + def instruction_mulh(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, (rs1 * rs2) >> 32 ) - def instruction_mulhsu(self, ins: 'LoadedInstruction'): + def instruction_mulhsu(self, ins: 'Instruction'): INS_NOT_IMPLEMENTED(ins) - def instruction_mulhu(self, ins: 'LoadedInstruction'): + def instruction_mulhu(self, ins: 'Instruction'): INS_NOT_IMPLEMENTED(ins) - def instruction_div(self, ins: 'LoadedInstruction'): + def instruction_div(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 // rs2 ) - def instruction_divu(self, ins: 'LoadedInstruction'): + def instruction_divu(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins, signed=False) self.regs.set( rd, rs1 // rs2 ) - def instruction_rem(self, ins: 'LoadedInstruction'): + def instruction_rem(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins) self.regs.set( rd, rs1 % rs2 ) - def instruction_remu(self, ins: 'LoadedInstruction'): + def instruction_remu(self, ins: 'Instruction'): rd, rs1, rs2 = self.parse_rd_rs_rs(ins, signed=False) self.regs.set( rd, diff --git a/riscemu/parser.py b/riscemu/parser.py index 64cefe8..7186d99 100644 --- a/riscemu/parser.py +++ b/riscemu/parser.py @@ -3,15 +3,16 @@ RiscEmu (c) 2021 Anton Lydike SPDX-License-Identifier: MIT """ +import os import re from typing import Dict, Tuple, Iterable, Callable -from helpers import Peekable +from .helpers import Peekable from .assembler import MemorySectionType, ParseContext, AssemblerDirectives from .base_types import Program from .colors import FMT_PARSE from .exceptions import ParseException -from .tokenizer import Token, TokenType +from .tokenizer import Token, TokenType, tokenize from .types import SimpleInstruction @@ -41,17 +42,32 @@ PARSERS: Dict[TokenType, Callable[[Token, Tuple[str], ParseContext], None]] = { def parse_tokens(name: str, tokens_iter: Iterable[Token]) -> Program: + """ + Convert a token stream into a parsed program + :param name: the programs name + :param tokens_iter: the programs content, tokenized + :return: a parsed program + """ context = ParseContext(name) for token, args in composite_tokenizer(Peekable[Token](tokens_iter)): if token.type not in PARSERS: raise ParseException("Unexpected token type: {}, {}".format(token, args)) + print("{} {}".format(token, args)) PARSERS[token.type](token, args, context) return context.finalize() def composite_tokenizer(tokens_iter: Iterable[Token]) -> Iterable[Tuple[Token, Tuple[str]]]: + """ + Convert an iterator over tokens into an iterator over tuples: (token, list(token)) + + The first token ist either a pseudo_op, label, or instruction name. The token list are all remaining tokens before + a newline is encountered + :param tokens_iter: An iterator over tokens + :return: An iterator over a slightly more structured representation of the tokens + """ tokens: Peekable[Token] = Peekable[Token](tokens_iter) while not tokens.is_empty(): @@ -75,5 +91,10 @@ def take_arguments(tokens: Peekable[Token]) -> Iterable[str]: elif tokens.peek().type == TokenType.NEWLINE: next(tokens) break - raise ParseException("Expected newline, instead got {}".format(tokens.peek())) + break + #raise ParseException("Expected newline, instead got {}".format(tokens.peek())) + +def parse_program_from_file(path: str) -> Program: + with open(path, 'r') as f: + return parse_tokens(os.path.split(path)[-1], tokenize(f)) diff --git a/riscemu/tokenizer.py b/riscemu/tokenizer.py index 6e29dd7..2820a09 100644 --- a/riscemu/tokenizer.py +++ b/riscemu/tokenizer.py @@ -7,7 +7,7 @@ SPDX-License-Identifier: MIT import re from dataclasses import dataclass from enum import Enum, auto -from typing import List, Iterable +from typing import List, Iterable, Optional from riscemu.decoder import RISCV_REGS from .exceptions import ParseException @@ -17,8 +17,6 @@ WHITESPACE_PATTERN = re.compile(r'\s+') MEMORY_ADDRESS_PATTERN = re.compile(r'^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$') REGISTER_NAMES = RISCV_REGS -I = lambda x: x - class TokenType(Enum): COMMA = auto() @@ -39,7 +37,7 @@ class Token: return '\\n' if self.type == TokenType.COMMA: return ', ' - return '{}({}) '.format(self.type.name[0:3], self.value) + return '{}({})'.format(self.type.name[0:3], self.value) NEWLINE = Token(TokenType.NEWLINE, '\n') @@ -55,7 +53,7 @@ def tokenize(input: Iterable[str]) -> Iterable[Token]: if not line: continue - parts = list(part for part in re.split(WHITESPACE_PATTERN, line) if part) + parts = list(part for part in split_whitespace_respecting_quotes(line) if part) yield from parse_line(parts) yield NEWLINE @@ -70,6 +68,8 @@ def parse_line(parts: List[str]) -> Iterable[Token]: yield Token(TokenType.PSEUDO_OP, first_token) elif first_token[-1] == ':': yield Token(TokenType.LABEL, first_token) + yield from parse_line(parts[1:]) + return else: yield Token(TokenType.INSTRUCTION_NAME, first_token) @@ -100,3 +100,40 @@ def print_tokens(tokens: Iterable[Token]): for token in tokens: print(token, end='\n' if token == NEWLINE else '') print("", flush=True, end="") + + +def split_whitespace_respecting_quotes(line: str) -> Iterable[str]: + quote = "" + part = "" + for c in line: + if c == quote: + yield part + part = "" + quote = "" + continue + + if quote != "": + part += c + continue + + if c in "\"'": + if part: + yield part + quote = c + part = "" + continue + + if c in ' \t\n': + if part: + yield part + part = "" + continue + + part += c + + if part: + yield part + + + + diff --git a/riscemu/types.py b/riscemu/types.py index 49791b2..998eba1 100644 --- a/riscemu/types.py +++ b/riscemu/types.py @@ -2,7 +2,7 @@ from typing import List, Tuple from .exceptions import MemoryAccessException from .helpers import parse_numeric_argument from .base_types import Instruction, MemorySection, MemoryFlags, InstructionContext, T_RelativeAddress, \ - T_AbsoluteAddress + T_AbsoluteAddress, Program class SimpleInstruction(Instruction): @@ -26,13 +26,14 @@ class SimpleInstruction(Instruction): class InstructionMemorySection(MemorySection): - def __init__(self, instructions: List[Instruction], name: str, context: InstructionContext, base: int = 0): + def __init__(self, instructions: List[Instruction], name: str, context: InstructionContext, owner: Program, base: int = 0): self.name = name self.base = base self.context = context self.size = len(instructions) * 4 self.flags = MemoryFlags(True, True) self.instructions = instructions + self.owner = owner.name def read(self, offset: T_RelativeAddress, size: int) -> bytearray: raise MemoryAccessException("Cannot read raw bytes from instruction section", self.base + offset, size, 'read') @@ -47,13 +48,14 @@ class InstructionMemorySection(MemorySection): class BinaryDataMemorySection(MemorySection): - def __init__(self, data: bytearray, name: str, context: InstructionContext, base: int = 0): + def __init__(self, data: bytearray, name: str, context: InstructionContext, owner: Program, base: int = 0): self.name = name self.base = base self.context = context self.size = len(data) self.flags = MemoryFlags(False, False) self.data = data + self.owner = owner.name def read(self, offset: T_RelativeAddress, size: int) -> bytearray: if offset + size > self.size: diff --git a/test/__init__.py b/test/__init__.py index e69de29..8030002 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -0,0 +1,2 @@ +from .test_tokenizer import * +from .test_helpers import * \ No newline at end of file diff --git a/test/test_helpers.py b/test/test_helpers.py index bc8ef0d..60d93b0 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -3,7 +3,7 @@ from unittest import TestCase from riscemu.helpers import * -class Test(TestCase): +class TestHelpers(TestCase): def test_int_to_bytes(self): self.assertEqual(int_to_bytes(-1), bytearray([0xff] * 4), "-1") self.assertEqual(int_to_bytes(1), bytearray([0, 0, 0, 1]), "1") diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py index dc6c410..9eed365 100644 --- a/test/test_tokenizer.py +++ b/test/test_tokenizer.py @@ -1,6 +1,7 @@ from unittest import TestCase -from riscemu.tokenizer import tokenize, print_tokens, Token, TokenType, NEWLINE, COMMA +from riscemu.tokenizer import tokenize, print_tokens, Token, TokenType, NEWLINE, COMMA, \ + split_whitespace_respecting_quotes def ins(name: str) -> Token: @@ -19,7 +20,7 @@ def lbl(name: str) -> Token: return Token(TokenType.LABEL, name) -class Test(TestCase): +class TestTokenizer(TestCase): def test_instructions(self): program = [ @@ -79,3 +80,47 @@ section: self.assertEqual(list(tokenize(program.splitlines())), tokens) + def test_split_whitespace_respecting_quotes_single(self): + self.assertEqual( + list(split_whitespace_respecting_quotes("test")), ["test"] + ) + + def test_split_whitespace_respecting_quotes_empty(self): + self.assertEqual( + list(split_whitespace_respecting_quotes("")), [] + ) + + def test_split_whitespace_respecting_quotes_two_parts(self): + self.assertEqual( + list(split_whitespace_respecting_quotes("test 123")), ["test", "123"] + ) + + def test_split_whitespace_respecting_quotes_whole_quoted(self): + self.assertEqual( + list(split_whitespace_respecting_quotes("'test 123'")), ["test 123"] + ) + + def test_split_whitespace_respecting_quotes_double_quotes(self): + self.assertEqual( + list(split_whitespace_respecting_quotes('"test 123"')), ["test 123"] + ) + + def test_split_whitespace_respecting_quotes_quoted_then_normal(self): + self.assertEqual( + list(split_whitespace_respecting_quotes('"test 123" abc')), ["test 123", "abc"] + ) + + def test_split_whitespace_respecting_quotes_quoted_sorrounded(self): + self.assertEqual( + list(split_whitespace_respecting_quotes('hello "test 123" abc')), ["hello", "test 123", "abc"] + ) + + def test_split_whitespace_respecting_quotes_weird_spaces(self): + self.assertEqual( + list(split_whitespace_respecting_quotes('hello "test 123"\tabc')), ["hello", "test 123", "abc"] + ) + + def test_split_whitespace_respecting_quotes_quotes_no_spaces(self): + self.assertEqual( + list(split_whitespace_respecting_quotes('hello"test 123"abc')), ["hello", "test 123", "abc"] + )