tokenizer reimplemented
parent 52e189c226
commit d5a4acef67
@@ -9,7 +9,6 @@ on them.
 import sys
 from typing import Tuple, List, Dict, Callable, Type

-from .Tokenizer import RiscVTokenizer
 from .Executable import MemoryFlags
 from .Syscall import SyscallInterface, get_syscall_symbols
 from .Exceptions import RiscemuBaseException, LaunchDebuggerException

@@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes
 from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
 from .Exceptions import *

-from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
+from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE

 from typing import Dict, Tuple, List, Optional

@@ -22,7 +22,7 @@ class ExecutableParser:
     tokenizer: 'RiscVTokenizer'

     def __init__(self, tokenizer: 'RiscVTokenizer'):
-        self.instructions: List[RiscVInstructionToken] = list()
+        self.instructions: List['RiscVInstructionToken'] = list()
         self.symbols: Dict[str, Tuple[str, int]] = dict()
         self.sections: Dict[str, MemorySection] = dict()
         self.tokenizer = tokenizer
@@ -37,11 +37,11 @@ class ExecutableParser:
         :raise ParseException: Raises a ParseException when invalid input is read
         """
         for token in self.tokenizer.tokens:
-            if isinstance(token, RiscVInstructionToken):
+            if isinstance(token, 'RiscVInstructionToken'):
                 self.parse_instruction(token)
-            elif isinstance(token, RiscVSymbolToken):
+            elif isinstance(token, 'RiscVSymbolToken'):
                 self.handle_symbol(token)
-            elif isinstance(token, RiscVPseudoOpToken):
+            elif isinstance(token, 'RiscVPseudoOpToken'):
                 self.handle_pseudo_op(token)
         return self._get_execuable()

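Editor's note, not part of the commit: quoted names like 'RiscVInstructionToken' work as forward references in type annotations, but isinstance() needs a real class object, so the quoted isinstance() checks above raise TypeError when this branch runs. A minimal illustration with a hypothetical class:

# Editor's illustration (Foo is hypothetical, not from the commit):
class Foo:
    pass

x: 'Foo' = Foo()           # fine: string annotations are resolved lazily
print(isinstance(x, Foo))  # True
# isinstance(x, 'Foo')     # TypeError: isinstance() arg 2 must be a type
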
@@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
 """

 import re
-from enum import IntEnum
-from typing import List
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import List, Iterable
+from riscemu.decoder import RISCV_REGS

 from .Exceptions import ParseException

-PSEUDO_OPS = [
-    '.asciiz',
-    '.double',
-    '.extern',
-    '.global',
-    '.align',
-    '.float',
-    '.kdata',
-    '.ktext',
-    '.space',
-    '.ascii',
-    '.byte',
-    '.data',
-    '.half',
-    '.text',
-    '.word',
-    '.set',
-]
+LINE_COMMENT_STARTERS = ('#', ';', '//')
+WHITESPACE_PATTERN = re.compile(r'\s+')
+MEMORY_ADDRESS_PATTERN = re.compile('^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$')
+REGISTER_NAMES = RISCV_REGS

-COMMENT_START = ["#", ";"]
-I = lambda x: x
-
-REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):')
-
-REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')
-
-REG_WHITESPACE = re.compile(r'^\s*')
-
-REG_NONWHITESPACE = re.compile(r'^[^\s]*')
-
-REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')
-
-REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')
-
-REG_VALID_ARGUMENT = re.compile(
-    r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?'
-)
-
-REG_ARG_SPLIT = re.compile(r'^,[ \t]*')
+class TokenType(Enum):
+    COMMA = auto()
+    ARGUMENT = auto()
+    PSEUDO_OP = auto()
+    INSTRUCTION_NAME = auto()
+    NEWLINE = auto()
+    LABEL = auto()


-def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
-    pos = 0
-    last_piece = 0
-    pieces = []
-    in_quotes = False
-    if string is None:
-        return pieces
-    while pos < len(string):
-        match = at.match(string[pos:])
-        if match is not None:
-            if not in_quotes:
-                pieces.append(string[last_piece:pos])
-                pos += len(match.group(0))
-                last_piece = pos
-            else:
-                pos += len(match.group(0))
-        elif string[pos] in quotes:
-            in_quotes = not in_quotes
-            pos += 1
-        elif string[pos] in COMMENT_START and not in_quotes:  # entering comment
-            break
-        else:
-            pos += 1
-    if in_quotes:
-        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))
-    pieces.append(string[last_piece:pos])
-    return pieces
-
-
-class RiscVInput:
-    """
-    Represents an Assembly file
-    """
-    def __init__(self, content: str, name: str):
-        self.content = content
-        self.pos = 0
-        self.len = len(content)
-        self.name = name
-
-    @staticmethod
-    def from_file(src: str):
-        with open(src, 'r') as f:
-            return RiscVInput(f.read(), src)
-
-    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
-        at = self.pos + offset
-
-        if regex:
-            if not isinstance(regex, re.Pattern):
-                print("uncompiled regex passed to peek!")
-                regex = re.compile(regex)
-            match = regex.match(self.content[at:])
-            if match is None:
-                return None
-
-            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
-                print("Cannot peek regex group that does not start at match start!")
-                return None
-            return match.group(regex_group)
-        if text:
-            if self.content[at:].startswith(text):
-                return self.content[at:at + len(text)]
-            return False
-        return self.content[at:at + size]
-
-    def peek_one_of(self, options: List[str]):
-        longest_peek = 0
-        ret = False
-        for text in options:
-            if self.peek(text=text):
-                if len(text) > longest_peek:
-                    longest_peek = len(text)
-                    ret = text
-        return ret
-
-    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
-        at = self.pos
-
-        if regex:
-            if not isinstance(regex, re.Pattern):
-                print("uncompiled regex passed to peek!")
-                regex = re.compile(regex)
-            match = regex.match(self.content[at:])
-            if match is None:
-                return None
-
-            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
-                print("Cannot consume regex group that does not start at match start!")
-                return None
-            self.pos += len(match.group(regex_group))
-            return match.group(regex_group)
-
-        if text:
-            if self.content[at:].startswith(text):
-                self.pos += len(text)
-                return text
-            return None
-
-        self.pos += size
-        return self.content[at:at + size]
-
-    def consume_one_of(self, options: List[str]):
-        longest_peek = 0
-        ret = False
-        for text in options:
-            if self.peek(text=text):
-                if len(text) > longest_peek:
-                    longest_peek = len(text)
-                    ret = text
-        self.consume(text=ret)
-        return ret
-
-    def seek_newline(self):
-        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)
-
-    def consume_whitespace(self, linebreak=True):
-        if linebreak:
-            return self.consume(regex=REG_WHITESPACE)
-        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)
-
-    def has_next(self):
-        return self.pos < self.len
-
-    def context(self, size: int = 5):
-        """
-        returns a context string:
-        <local input before pos>|<local input after pos>
-        """
-        start = max(self.pos - size, 0)
-        end = min(self.pos + size, self.len - 1)
-
-        return self.content[start:self.pos] + '|' + self.content[self.pos:end]
-
-
-class TokenType(IntEnum):
-    SYMBOL = 0
-    INSTRUCTION = 1
-    PSEUDO_OP = 2
-
-    def __repr__(self):
-        return self.name
-
-    def __str__(self):
-        return self.name
+@dataclass(frozen=True)
+class Token:
+    type: TokenType
+    value: str
+
+    def __str__(self):
+        if self.type == TokenType.NEWLINE:
+            return '\\n'
+        if self.type == TokenType.COMMA:
+            return ', '
+        return '{}({}) '.format(self.type.name[0:3], self.value)
+
+
+NEWLINE = Token(TokenType.NEWLINE, '\n')
+COMMA = Token(TokenType.COMMA, ',')


-class RiscVToken:
-    type: TokenType
-
-    def __init__(self, t_type: TokenType):
-        self.type = t_type
-
-    def __repr__(self):
-        return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text())
-
-    def text(self):
-        """
-        create text representation of instruction
-        """
-        return "unknown"
+def tokenize(input: Iterable[str]) -> Iterable[Token]:
+    for line in input:
+        for line_comment_start in LINE_COMMENT_STARTERS:
+            if line_comment_start in line:
+                line = line[:line.index(line_comment_start)]
+        line.strip(' \t\n')
+        if not line:
+            continue
+
+        parts = list(part for part in re.split(WHITESPACE_PATTERN, line) if part)
+
+        yield from parse_line(parts)
+        yield NEWLINE


-class RiscVInstructionToken(RiscVToken):
-    def __init__(self, name, args):
-        super().__init__(TokenType.INSTRUCTION)
-        self.instruction = name
-        self.args = args
-
-    def text(self):
-        if len(self.args) == 0:
-            return self.instruction
-        if len(self.args) == 1:
-            return "{} {}".format(self.instruction, self.args[0])
-        if len(self.args) == 2:
-            return "{} {}, {}".format(self.instruction, *self.args)
-        return "{} {}, {}, {}".format(self.instruction, *self.args)
+def parse_line(parts: List[str]) -> Iterable[Token]:
+    if len(parts) == 0:
+        return ()
+    first_token = parts[0]
+
+    if first_token[0] == '.':
+        yield Token(TokenType.PSEUDO_OP, first_token)
+    elif first_token[-1] == ':':
+        yield Token(TokenType.LABEL, first_token)
+    else:
+        yield Token(TokenType.INSTRUCTION_NAME, first_token)
+
+    for part in parts[1:]:
+        if part == ',':
+            yield COMMA
+            continue
+        yield from parse_arg(part)


-class RiscVSymbolToken(RiscVToken):
-    def __init__(self, name):
-        super().__init__(TokenType.SYMBOL)
-        self.name = name
-
-    def text(self):
-        return self.name
+def parse_arg(arg: str) -> Iterable[Token]:
+    comma = arg[-1] == ','
+    arg = arg[:-1] if comma else arg
+    mem_match_resul = re.match(MEMORY_ADDRESS_PATTERN, arg)
+    if mem_match_resul:
+        register = mem_match_resul.group(2).lower()
+        if register not in RISCV_REGS:
+            raise ParseException(f'"{register}" is not a valid register!')
+        yield Token(TokenType.ARGUMENT, register)
+        yield Token(TokenType.ARGUMENT, mem_match_resul.group(1))
+    else:
+        yield Token(TokenType.ARGUMENT, arg)
+    if comma:
+        yield COMMA


-class RiscVPseudoOpToken(RiscVToken):
-    def __init__(self, name, args):
-        super().__init__(TokenType.PSEUDO_OP)
-        self.name = name
-        self.args = args
-
-    def text(self):
-        return "{} {}".format(self.name, self.args)
-
-
-class RiscVTokenizer:
-    """
-    A tokenizer for the RISC-V syntax of a given CPU
-    """
-    def __init__(self, input_assembly: RiscVInput, instructions: List[str]):
-        self.input = input_assembly
-        self.tokens: List[RiscVToken] = []
-        self.name = input_assembly.name
-        self.instructions = instructions
-
-    def tokenize(self):
-        while self.input.has_next():
-            # remove leading whitespaces, place cursor at text start
-            self.input.consume_whitespace()
-
-            # check if we have a pseudo op
-            if self.input.peek_one_of(PSEUDO_OPS):
-                self.parse_pseudo_op()
-
-            # check if we have a symbol (like main:)
-            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
-                self.parse_symbol()
-
-            # comment
-            elif self.input.peek() in COMMENT_START:
-                self.parse_comment()
-
-            # must be instruction
-            elif self.input.peek_one_of(self.instructions):
-                self.parse_instruction()
-            else:
-                token = self.input.peek(size=5)
-                raise ParseException("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))
-            self.input.consume_whitespace()
-
-    def parse_pseudo_op(self):
-        name = self.input.consume_one_of(PSEUDO_OPS)
-        self.input.consume_whitespace(linebreak=False)
-
-        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
-        if not arg_str:
-            args = []
-        else:
-            args = split_accepting_quotes(arg_str)
-
-        self.tokens.append(RiscVPseudoOpToken(name[1:], args))
-
-    def parse_symbol(self):
-        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
-        self.tokens.append(RiscVSymbolToken(name[:-1]))
-        if not self.input.consume_whitespace():
-            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
-                self.input.context()))
-
-    def parse_instruction(self):
-        ins = self.input.consume_one_of(self.instructions)
-        args = []
-        self.input.consume_whitespace(linebreak=False)
-        while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3:
-            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
-            args.append(arg)
-            if self.input.peek(text=','):
-                self.input.consume(text=',')
-                self.input.consume_whitespace(linebreak=False)
-            else:
-                break
-        self.tokens.append(RiscVInstructionToken(ins, args))
-
-    def parse_comment(self):
-        # just consume the rest
-        self.input.consume(regex=REG_UNTIL_NEWLINE)
+def print_tokens(tokens: Iterable[Token]):
+    for token in tokens:
+        print(token, end='\n' if token == NEWLINE else '')
+    print("", flush=True, end="")
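Editor's note: a short usage sketch of the reimplemented generator API above (the import path riscemu.Tokenizer is inferred from the hunk context, not spelled out on this page):

# Editor's illustration of the new API shown in this hunk.
from riscemu.Tokenizer import tokenize, print_tokens

source = [
    "main:                 # a label line",
    "    addi  a0, zero, 42",
    "    sw    a0, 0(sp)   ; memory-address argument",
]

# tokenize() consumes an iterable of lines and yields a flat token stream,
# stripping comments and terminating each source line with the NEWLINE token.
tokens = list(tokenize(source))

# The memory operand '0(sp)' is decomposed by parse_arg() into a register
# token ARG(sp) followed by an offset token ARG(0).
print_tokens(tokens)
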
@@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo
 from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
     ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException

-from .Tokenizer import RiscVInput, RiscVTokenizer
-
 from .Executable import Executable, LoadedExecutable, LoadedMemorySection

 from .ExecutableParser import ExecutableParser
|
@ -28,3 +28,15 @@ class Test(TestCase):
|
||||
self.assertEqual(to_signed(0xffed36e4), -1231132)
|
||||
self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF)
|
||||
|
||||
def test_bind_twos_complement(self):
|
||||
minval = -(1 << 31)
|
||||
maxval = ((1 << 31)-1)
|
||||
|
||||
self.assertEqual(bind_twos_complement(minval), minval, "minval preserves")
|
||||
self.assertEqual(bind_twos_complement(minval), minval, )
|
||||
self.assertEqual(bind_twos_complement(maxval), maxval, "maxval preserves")
|
||||
self.assertEqual(bind_twos_complement(minval - 1), maxval, "minval-1 wraps")
|
||||
self.assertEqual(bind_twos_complement(maxval + 1), minval, "maxval+1 wraps")
|
||||
self.assertEqual(bind_twos_complement(0), 0, "0 is 0")
|
||||
self.assertEqual(bind_twos_complement(1), 1, "1 is 1")
|
||||
self.assertEqual(bind_twos_complement(-1), -1, "-1 is -1")
|
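Editor's note: for readers without riscemu's helpers module at hand, a minimal sketch of the behaviour these assertions pin down (assuming the 32-bit width used above; the real helper's signature may differ):

# Editor's sketch, not the committed implementation.
def bind_twos_complement(value: int, bits: int = 32) -> int:
    """Wrap an arbitrary int into the signed two's-complement range of `bits` bits."""
    mask = (1 << bits) - 1         # 0xFFFFFFFF for 32 bits
    value &= mask                  # keep only the low `bits` bits
    if value & (1 << (bits - 1)):  # sign bit set: interpret as negative
        value -= 1 << bits
    return value

# Mirrors two of the assertions above:
assert bind_twos_complement(-(1 << 31) - 1) == (1 << 31) - 1  # minval-1 wraps
assert bind_twos_complement(1 << 31) == -(1 << 31)            # maxval+1 wraps
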