diff --git a/riscemu/CPU.py b/riscemu/CPU.py index 962ef85..3df7d29 100644 --- a/riscemu/CPU.py +++ b/riscemu/CPU.py @@ -9,7 +9,6 @@ on them. import sys from typing import Tuple, List, Dict, Callable, Type -from .Tokenizer import RiscVTokenizer from .Executable import MemoryFlags from .Syscall import SyscallInterface, get_syscall_symbols from .Exceptions import RiscemuBaseException, LaunchDebuggerException diff --git a/riscemu/ExecutableParser.py b/riscemu/ExecutableParser.py index c3a9fd7..3e18c3d 100644 --- a/riscemu/ExecutableParser.py +++ b/riscemu/ExecutableParser.py @@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags from .Exceptions import * -from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken +from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE from typing import Dict, Tuple, List, Optional @@ -22,7 +22,7 @@ class ExecutableParser: tokenizer: 'RiscVTokenizer' def __init__(self, tokenizer: 'RiscVTokenizer'): - self.instructions: List[RiscVInstructionToken] = list() + self.instructions: List['RiscVInstructionToken'] = list() self.symbols: Dict[str, Tuple[str, int]] = dict() self.sections: Dict[str, MemorySection] = dict() self.tokenizer = tokenizer @@ -37,11 +37,11 @@ class ExecutableParser: :raise ParseException: Raises a ParseException when invalid input is read """ for token in self.tokenizer.tokens: - if isinstance(token, RiscVInstructionToken): + if isinstance(token, 'RiscVInstructionToken'): self.parse_instruction(token) - elif isinstance(token, RiscVSymbolToken): + elif isinstance(token, 'RiscVSymbolToken'): self.handle_symbol(token) - elif isinstance(token, RiscVPseudoOpToken): + elif isinstance(token, 'RiscVPseudoOpToken'): self.handle_pseudo_op(token) return self._get_execuable() diff --git a/riscemu/Tokenizer.py b/riscemu/Tokenizer.py index 68be1ac..db9d330 100644 --- a/riscemu/Tokenizer.py +++ b/riscemu/Tokenizer.py @@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT """ import re -from enum import IntEnum -from typing import List +from dataclasses import dataclass +from enum import Enum, auto +from typing import List, Iterable +from riscemu.decoder import RISCV_REGS from .Exceptions import ParseException -PSEUDO_OPS = [ - '.asciiz', - '.double', - '.extern', - '.global', - '.align', - '.float', - '.kdata', - '.ktext', - '.space', - '.ascii', - '.byte', - '.data', - '.half', - '.text', - '.word', - '.set', -] +LINE_COMMENT_STARTERS = ('#', ';', '//') +WHITESPACE_PATTERN = re.compile(r'\s+') +MEMORY_ADDRESS_PATTERN = re.compile('^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$') +REGISTER_NAMES = RISCV_REGS -COMMENT_START = ["#", ";"] +I = lambda x: x -REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):') +class TokenType(Enum): + COMMA = auto() + ARGUMENT = auto() + PSEUDO_OP = auto() + INSTRUCTION_NAME = auto() + NEWLINE = auto() + LABEL = auto() -REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n') -REG_WHITESPACE = re.compile(r'^\s*') - -REG_NONWHITESPACE = re.compile(r'^[^\s]*') - -REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*') - -REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*') - -REG_VALID_ARGUMENT = re.compile( - r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?' -) - -REG_ARG_SPLIT = re.compile(r'^,[ \t]*') - - -def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")): - pos = 0 - last_piece = 0 - pieces = [] - in_quotes = False - if string is None: - return pieces - while pos < len(string): - match = at.match(string[pos:]) - if match is not None: - if not in_quotes: - pieces.append(string[last_piece:pos]) - pos += len(match.group(0)) - last_piece = pos - else: - pos += len(match.group(0)) - elif string[pos] in quotes: - in_quotes = not in_quotes - pos += 1 - elif string[pos] in COMMENT_START and not in_quotes: # entering comment - break - else: - pos += 1 - if in_quotes: - print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string)) - pieces.append(string[last_piece:pos]) - return pieces - - -class RiscVInput: - """ - Represents an Assembly file - """ - def __init__(self, content: str, name: str): - self.content = content - self.pos = 0 - self.len = len(content) - self.name = name - - @staticmethod - def from_file(src: str): - with open(src, 'r') as f: - return RiscVInput(f.read(), src) - - def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0): - at = self.pos + offset - - if regex: - if not isinstance(regex, re.Pattern): - print("uncompiled regex passed to peek!") - regex = re.compile(regex) - match = regex.match(self.content[at:]) - if match is None: - return None - - if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)): - print("Cannot peek regex group that does not start at match start!") - return None - return match.group(regex_group) - if text: - if self.content[at:].startswith(text): - return self.content[at:at + len(text)] - return False - return self.content[at:at + size] - - def peek_one_of(self, options: List[str]): - longest_peek = 0 - ret = False - for text in options: - if self.peek(text=text): - if len(text) > longest_peek: - longest_peek = len(text) - ret = text - return ret - - def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0): - at = self.pos - - if regex: - if not isinstance(regex, re.Pattern): - print("uncompiled regex passed to peek!") - regex = re.compile(regex) - match = regex.match(self.content[at:]) - if match is None: - return None - - if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)): - print("Cannot consume regex group that does not start at match start!") - return None - self.pos += len(match.group(regex_group)) - return match.group(regex_group) - - if text: - if self.content[at:].startswith(text): - self.pos += len(text) - return text - return None - - self.pos += size - return self.content[at:at + size] - - def consume_one_of(self, options: List[str]): - longest_peek = 0 - ret = False - for text in options: - if self.peek(text=text): - if len(text) > longest_peek: - longest_peek = len(text) - ret = text - self.consume(text=ret) - return ret - - def seek_newline(self): - return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1) - - def consume_whitespace(self, linebreak=True): - if linebreak: - return self.consume(regex=REG_WHITESPACE) - return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK) - - def has_next(self): - return self.pos < self.len - - def context(self, size: int = 5): - """ - returns a context string: - | - """ - start = max(self.pos - size, 0) - end = min(self.pos + size, self.len - 1) - - return self.content[start:self.pos] + '|' + self.content[self.pos:end] - - -class TokenType(IntEnum): - SYMBOL = 0 - INSTRUCTION = 1 - PSEUDO_OP = 2 - - def __repr__(self): - return self.name - - def __str__(self): - return self.name - - -class RiscVToken: +@dataclass(frozen=True) +class Token: type: TokenType + value: str - def __init__(self, t_type: TokenType): - self.type = t_type - - def __repr__(self): - return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text()) - - def text(self): - """ - create text representation of instruction - """ - return "unknown" - - -class RiscVInstructionToken(RiscVToken): - def __init__(self, name, args): - super().__init__(TokenType.INSTRUCTION) - self.instruction = name - self.args = args - - def text(self): - if len(self.args) == 0: - return self.instruction - if len(self.args) == 1: - return "{} {}".format(self.instruction, self.args[0]) - if len(self.args) == 2: - return "{} {}, {}".format(self.instruction, *self.args) - return "{} {}, {}, {}".format(self.instruction, *self.args) - - -class RiscVSymbolToken(RiscVToken): - def __init__(self, name): - super().__init__(TokenType.SYMBOL) - self.name = name - - def text(self): - return self.name - - -class RiscVPseudoOpToken(RiscVToken): - def __init__(self, name, args): - super().__init__(TokenType.PSEUDO_OP) - self.name = name - self.args = args - - def text(self): - return "{} {}".format(self.name, self.args) - - -class RiscVTokenizer: - """ - A tokenizer for the RISC-V syntax of a given CPU - """ - def __init__(self, input_assembly: RiscVInput, instructions: List[str]): - self.input = input_assembly - self.tokens: List[RiscVToken] = [] - self.name = input_assembly.name - self.instructions = instructions - - def tokenize(self): - while self.input.has_next(): - # remove leading whitespaces, place cursor at text start - self.input.consume_whitespace() - - # check if we have a pseudo op - if self.input.peek_one_of(PSEUDO_OPS): - self.parse_pseudo_op() - - # check if we have a symbol (like main:) - elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL): - self.parse_symbol() - - # comment - elif self.input.peek() in COMMENT_START: - self.parse_comment() - - # must be instruction - elif self.input.peek_one_of(self.instructions): - self.parse_instruction() - else: - token = self.input.peek(size=5) - raise ParseException("Unknown token around {} at: {}".format(repr(token), repr(self.input.context()))) - self.input.consume_whitespace() - - def parse_pseudo_op(self): - name = self.input.consume_one_of(PSEUDO_OPS) - self.input.consume_whitespace(linebreak=False) - - arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE) - if not arg_str: - args = [] - else: - args = split_accepting_quotes(arg_str) - - self.tokens.append(RiscVPseudoOpToken(name[1:], args)) - - def parse_symbol(self): - name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL) - self.tokens.append(RiscVSymbolToken(name[:-1])) - if not self.input.consume_whitespace(): - print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format( - self.input.context())) - - def parse_instruction(self): - ins = self.input.consume_one_of(self.instructions) - args = [] - self.input.consume_whitespace(linebreak=False) - while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3: - arg = self.input.consume(regex=REG_VALID_ARGUMENT) - args.append(arg) - if self.input.peek(text=','): - self.input.consume(text=',') - self.input.consume_whitespace(linebreak=False) - else: - break - self.tokens.append(RiscVInstructionToken(ins, args)) - - def parse_comment(self): - # just consume the rest - self.input.consume(regex=REG_UNTIL_NEWLINE) + def __str__(self): + if self.type == TokenType.NEWLINE: + return '\\n' + if self.type == TokenType.COMMA: + return ', ' + return '{}({}) '.format(self.type.name[0:3], self.value) + +NEWLINE = Token(TokenType.NEWLINE, '\n') +COMMA = Token(TokenType.COMMA, ',') + + +def tokenize(input: Iterable[str]) -> Iterable[Token]: + for line in input: + for line_comment_start in LINE_COMMENT_STARTERS: + if line_comment_start in line: + line = line[:line.index(line_comment_start)] + line.strip(' \t\n') + if not line: + continue + + parts = list(part for part in re.split(WHITESPACE_PATTERN, line) if part) + + yield from parse_line(parts) + yield NEWLINE + + +def parse_line(parts: List[str]) -> Iterable[Token]: + if len(parts) == 0: + return () + first_token = parts[0] + + if first_token[0] == '.': + yield Token(TokenType.PSEUDO_OP, first_token) + elif first_token[-1] == ':': + yield Token(TokenType.LABEL, first_token) + else: + yield Token(TokenType.INSTRUCTION_NAME, first_token) + + for part in parts[1:]: + if part == ',': + yield COMMA + continue + yield from parse_arg(part) + + +def parse_arg(arg: str) -> Iterable[Token]: + comma = arg[-1] == ',' + arg = arg[:-1] if comma else arg + mem_match_resul = re.match(MEMORY_ADDRESS_PATTERN, arg) + if mem_match_resul: + register = mem_match_resul.group(2).lower() + if register not in RISCV_REGS: + raise ParseException(f'"{register}" is not a valid register!') + yield Token(TokenType.ARGUMENT, register) + yield Token(TokenType.ARGUMENT, mem_match_resul.group(1)) + else: + yield Token(TokenType.ARGUMENT, arg) + if comma: + yield COMMA + + +def print_tokens(tokens: Iterable[Token]): + for token in tokens: + print(token, end='\n' if token == NEWLINE else '') + print("", flush=True, end="") diff --git a/riscemu/__init__.py b/riscemu/__init__.py index e006fb3..fc080cf 100644 --- a/riscemu/__init__.py +++ b/riscemu/__init__.py @@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \ ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException -from .Tokenizer import RiscVInput, RiscVTokenizer - from .Executable import Executable, LoadedExecutable, LoadedMemorySection from .ExecutableParser import ExecutableParser diff --git a/test/test_helpers.py b/test/test_helpers.py index 1166e50..bc8ef0d 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -28,3 +28,15 @@ class Test(TestCase): self.assertEqual(to_signed(0xffed36e4), -1231132) self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF) + def test_bind_twos_complement(self): + minval = -(1 << 31) + maxval = ((1 << 31)-1) + + self.assertEqual(bind_twos_complement(minval), minval, "minval preserves") + self.assertEqual(bind_twos_complement(minval), minval, ) + self.assertEqual(bind_twos_complement(maxval), maxval, "maxval preserves") + self.assertEqual(bind_twos_complement(minval - 1), maxval, "minval-1 wraps") + self.assertEqual(bind_twos_complement(maxval + 1), minval, "maxval+1 wraps") + self.assertEqual(bind_twos_complement(0), 0, "0 is 0") + self.assertEqual(bind_twos_complement(1), 1, "1 is 1") + self.assertEqual(bind_twos_complement(-1), -1, "-1 is -1") \ No newline at end of file