tokenizer reimplemented

This commit is contained in:
Anton Lydike 2021-12-13 23:23:55 +01:00
parent 52e189c226
commit d5a4acef67
5 changed files with 90 additions and 301 deletions

View File

@ -9,7 +9,6 @@ on them.
import sys
from typing import Tuple, List, Dict, Callable, Type
from .Tokenizer import RiscVTokenizer
from .Executable import MemoryFlags
from .Syscall import SyscallInterface, get_syscall_symbols
from .Exceptions import RiscemuBaseException, LaunchDebuggerException

View File

@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import *
from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE
from typing import Dict, Tuple, List, Optional
@ -22,7 +22,7 @@ class ExecutableParser:
tokenizer: 'RiscVTokenizer'
def __init__(self, tokenizer: 'RiscVTokenizer'):
self.instructions: List[RiscVInstructionToken] = list()
self.instructions: List['RiscVInstructionToken'] = list()
self.symbols: Dict[str, Tuple[str, int]] = dict()
self.sections: Dict[str, MemorySection] = dict()
self.tokenizer = tokenizer
@ -37,11 +37,11 @@ class ExecutableParser:
:raise ParseException: Raises a ParseException when invalid input is read
"""
for token in self.tokenizer.tokens:
if isinstance(token, RiscVInstructionToken):
if isinstance(token, 'RiscVInstructionToken'):
self.parse_instruction(token)
elif isinstance(token, RiscVSymbolToken):
elif isinstance(token, 'RiscVSymbolToken'):
self.handle_symbol(token)
elif isinstance(token, RiscVPseudoOpToken):
elif isinstance(token, 'RiscVPseudoOpToken'):
self.handle_pseudo_op(token)
return self._get_execuable()

View File

@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
"""
import re
from enum import IntEnum
from typing import List
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Iterable
from riscemu.decoder import RISCV_REGS
from .Exceptions import ParseException
# Assembler directives (pseudo ops) known to the tokenizer, each including
# its leading dot.
PSEUDO_OPS = [
    '.asciiz', '.double', '.extern', '.global',
    '.align', '.float', '.kdata', '.ktext', '.space', '.ascii',
    '.byte', '.data', '.half', '.text', '.word',
    '.set',
]
# Markers that start a comment running until the end of the line.
LINE_COMMENT_STARTERS = ('#', ';', '//')

# One or more whitespace characters; used to split a source line into parts.
WHITESPACE_PATTERN = re.compile(r'\s+')

# Memory operand of the form ``<offset>(<register>)``, e.g. ``8(sp)``,
# ``-4(sp)``, ``0x10(a0)`` or ``0b101(t1)``.
#   group(1): the offset, including an optional sign
#   group(2): the register name (letters followed by at most two digits)
# The character classes are spelled out explicitly because ranges such as
# ``A-f`` / ``A-z`` also contain punctuation and non-hex letters.
MEMORY_ADDRESS_PATTERN = re.compile(
    r'^([+-]?(?:0[xX][A-Fa-f0-9]+|\d+|0b[0-1]+))\(([A-Za-z]+[0-9]{0,2})\)$'
)
# Alias for the register list imported from riscemu.decoder.
REGISTER_NAMES = RISCV_REGS
# Characters that start a comment.
COMMENT_START = ["#", ";"]
# Identity function.
I = lambda x: x
# A label declaration: a symbol name directly followed by a colon.
# NOTE(review): the range `A-z` also contains the characters [ \ ] ^ _ and `
# -- `A-Za-z` was probably intended; confirm before tightening.
REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):')
# Leading whitespace up to and including the next newline; group 1 is the
# whitespace without the newline.
REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')
# Any amount of leading whitespace, line breaks included (may match nothing).
REG_WHITESPACE = re.compile(r'^\s*')
# Leading run of non-whitespace characters (may match nothing).
REG_NONWHITESPACE = re.compile(r'^[^\s]*')
# Everything up to, but not including, the next newline.
REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')
# Leading spaces and tabs only, i.e. whitespace that stays on the same line.
REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')
# An instruction argument: an optionally signed hex/decimal number or a symbol
# name, optionally followed by a parenthesised symbol name such as `(sp)`.
# NOTE(review): `0-9A-f` also admits the letters G-Z and some punctuation,
# so it accepts more than hexadecimal digits -- confirm whether intended.
REG_VALID_ARGUMENT = re.compile(
r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?'
)
# Argument separator: a comma followed by optional spaces/tabs.
REG_ARG_SPLIT = re.compile(r'^,[ \t]*')
class TokenType(Enum):
    """Categories a token emitted by the tokenizer can belong to."""

    # a ',' separating two arguments
    COMMA = auto()
    # an operand of an instruction or pseudo op
    ARGUMENT = auto()
    # first word of a line starting with '.'
    PSEUDO_OP = auto()
    # first word of a line that is neither a pseudo op nor a label
    INSTRUCTION_NAME = auto()
    # end of a source line
    NEWLINE = auto()
    # first word of a line ending with ':'
    LABEL = auto()
def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
    """
    Split ``string`` at every match of ``at`` that is not inside quotes.

    Splitting stops at the first comment character (see COMMENT_START) found
    outside of quotes; the comment itself is not part of the result.

    :param string: the text to split, ``None`` yields an empty list
    :param at: compiled regex describing a separator; it is matched against
               the remaining text at every position
    :param quotes: characters that open a quoted section. A quoted section is
                   closed only by the same character that opened it, so
                   ``"it's"`` is a single, balanced string.
    :return: list of the pieces between separators
    """
    pieces = []
    if string is None:
        return pieces

    pos = 0
    piece_start = 0
    open_quote = None  # the quote character we are currently inside of, if any

    while pos < len(string):
        char = string[pos]
        # the separator patterns are anchored with '^', so they must be
        # applied to the remaining text rather than with a start offset
        match = at.match(string[pos:])
        # zero-length matches are ignored, they would never advance `pos`
        if match is not None and match.end() > 0:
            if open_quote is None:
                pieces.append(string[piece_start:pos])
                piece_start = pos + match.end()
            pos += match.end()
        elif open_quote is None and char in quotes:
            open_quote = char
            pos += 1
        elif char == open_quote:
            open_quote = None
            pos += 1
        elif open_quote is None and char in COMMENT_START:  # entering comment
            break
        else:
            pos += 1

    if open_quote is not None:
        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))

    pieces.append(string[piece_start:pos])
    return pieces
class RiscVInput:
    """
    Represents an Assembly file

    The text is held in ``content`` and read through a cursor ``pos``:
    the ``peek*`` methods inspect the text at the cursor without moving it,
    the ``consume*`` methods return the same text and advance the cursor.
    """

    def __init__(self, content: str, name: str):
        """
        :param content: the complete assembly source text
        :param name: a name for this input (e.g. the file path)
        """
        self.content = content
        self.pos = 0
        self.len = len(content)
        self.name = name

    @staticmethod
    def from_file(src: str):
        """Create an input from the file at path ``src``; the path is used as name."""
        with open(src, 'r') as f:
            return RiscVInput(f.read(), src)

    def _match_regex(self, regex, at: int, regex_group: int, action: str):
        """
        Match ``regex`` against the content starting at index ``at``.

        :param action: 'peek' or 'consume', only used in diagnostic output
        :return: the text of ``regex_group``, or None if the regex does not
                 match or the group does not start where the match starts
        """
        if not isinstance(regex, re.Pattern):
            print("uncompiled regex passed to {}!".format(action))
            regex = re.compile(regex)
        # patterns are anchored with '^', which only matches at the real
        # start of a string, hence the slice instead of a start offset
        match = regex.match(self.content[at:])
        if match is None:
            return None
        if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
            print("Cannot {} regex group that does not start at match start!".format(action))
            return None
        return match.group(regex_group)

    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Look at the input without moving the cursor.

        Exactly one mode is used, checked in this order:

        * ``regex``: return the text of ``regex_group`` or None
        * ``text``: return ``text`` if the input continues with it, else False
        * otherwise: return the next ``size`` characters

        :param offset: distance from the cursor at which to look
        """
        at = self.pos + offset
        if regex:
            return self._match_regex(regex, at, regex_group, 'peek')
        if text:
            if self.content.startswith(text, at):
                return text
            return False
        return self.content[at:at + size]

    def peek_one_of(self, options: List[str]):
        """
        Return the longest of ``options`` the input continues with, or False
        if none of them matches. The first option wins among equally long ones.
        """
        longest_peek = 0
        ret = False
        for text in options:
            if self.peek(text=text) and len(text) > longest_peek:
                longest_peek = len(text)
                ret = text
        return ret

    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Read from the input and move the cursor behind what was read.

        Modes are the same as for :meth:`peek`. A regex or text that does not
        match returns None and leaves the cursor untouched.
        """
        at = self.pos
        if regex:
            matched = self._match_regex(regex, at, regex_group, 'consume')
            if matched is not None:
                self.pos += len(matched)
            return matched
        if text:
            if self.content.startswith(text, at):
                self.pos += len(text)
                return text
            return None
        self.pos += size
        return self.content[at:at + size]

    def consume_one_of(self, options: List[str]):
        """
        Consume the longest of ``options`` the input continues with.

        :return: the consumed option, or False if none matched; in that case
                 the cursor is not moved
        """
        ret = self.peek_one_of(options)
        if ret:
            self.consume(text=ret)
        return ret

    def seek_newline(self):
        """Consume the whitespace in front of the next newline (not the newline itself)."""
        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)

    def consume_whitespace(self, linebreak=True):
        """
        Consume leading whitespace.

        :param linebreak: if False, stop at the end of the current line
        :return: the consumed whitespace (may be empty)
        """
        if linebreak:
            return self.consume(regex=REG_WHITESPACE)
        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)

    def has_next(self):
        """Return True while the cursor has not reached the end of the input."""
        return self.pos < self.len

    def context(self, size: int = 5):
        """
        returns a context string:
        <local input before pos>|<local input after pos>
        """
        start = max(self.pos - size, 0)
        end = min(self.pos + size, self.len)
        return self.content[start:self.pos] + '|' + self.content[self.pos:end]
class TokenType(IntEnum):
    """Integer-valued token kinds used by the RiscVToken class hierarchy."""

    SYMBOL = 0
    INSTRUCTION = 1
    PSEUDO_OP = 2

    def __repr__(self):
        # show just the member name, e.g. ``SYMBOL``
        member_name = self.name
        return member_name
@dataclass(frozen=True)
class Token:
    """
    A single immutable token.

    ``type`` is the category of the token, ``value`` the source text it
    was created from.
    """
    type: TokenType
    value: str

    def __str__(self):
        """
        Compact text form of the token: an escaped newline for NEWLINE, a
        comma followed by a space for COMMA, otherwise the first three
        letters of the type name followed by the value in parentheses.
        """
        if self.type == TokenType.NEWLINE:
            return '\\n'
        if self.type == TokenType.COMMA:
            return ', '
        return '{}({}) '.format(self.type.name[0:3], self.value)
# Shared token instances for the two token kinds whose value never varies.
NEWLINE = Token(TokenType.NEWLINE, '\n')
COMMA = Token(TokenType.COMMA, ',')
class RiscVToken:
    """Base class of the class-based tokens; stores the token type."""
    type: TokenType

    def __init__(self, t_type: TokenType):
        self.type = t_type

    def __repr__(self):
        return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text())

    def text(self):
        """
        create text representation of instruction
        """
        return "unknown"


def tokenize(input: Iterable[str]) -> Iterable[Token]:
    """
    Turn lines of assembly source into a stream of tokens.

    For every line, anything following a line comment marker is dropped and
    the rest is split at whitespace. Lines that contain nothing but
    whitespace and/or a comment produce no tokens at all; every other line
    produces its tokens followed by a NEWLINE token.

    NOTE(review): comment markers are searched in the whole line, so a
    marker inside a quoted string also cuts the line.

    :param input: an iterable of source lines
    :return: a generator yielding the tokens
    """
    for line in input:
        for line_comment_start in LINE_COMMENT_STARTERS:
            if line_comment_start in line:
                line = line[:line.index(line_comment_start)]

        # str.strip returns a new string, the result has to be kept
        line = line.strip(' \t\n')
        if not line:
            continue

        parts = [part for part in WHITESPACE_PATTERN.split(line) if part]
        # other whitespace (e.g. '\r') survives the strip above but yields
        # no parts
        if not parts:
            continue

        yield from parse_line(parts)
        yield NEWLINE
class RiscVInstructionToken(RiscVToken):
    """Token for an instruction together with its arguments."""

    def __init__(self, name, args):
        super().__init__(TokenType.INSTRUCTION)
        self.instruction = name
        self.args = args

    def text(self):
        """Render the instruction followed by (at most the first three of) its arguments."""
        if len(self.args) == 0:
            return self.instruction
        # one template per argument count, three or more arguments share the last
        templates = {
            1: "{} {}",
            2: "{} {}, {}",
        }
        template = templates.get(len(self.args), "{} {}, {}, {}")
        return template.format(self.instruction, *self.args)


def parse_line(parts: List[str]) -> Iterable[Token]:
    """
    Convert the whitespace separated parts of one source line into tokens.

    The first part decides the kind of line: a leading '.' makes it a pseudo
    op, a trailing ':' a label, anything else an instruction name. Every
    following part is either a lone comma or handed to parse_arg.

    :param parts: the non-empty parts of a single line
    :return: a generator yielding the tokens of the line
    """
    if not parts:
        return ()

    head, *operands = parts

    if head[0] == '.':
        head_type = TokenType.PSEUDO_OP
    elif head[-1] == ':':
        head_type = TokenType.LABEL
    else:
        head_type = TokenType.INSTRUCTION_NAME
    yield Token(head_type, head)

    for operand in operands:
        if operand == ',':
            yield COMMA
        else:
            yield from parse_arg(operand)
class RiscVSymbolToken(RiscVToken):
    """Token for a symbol; carries only the symbol's name."""

    def __init__(self, name):
        super().__init__(TokenType.SYMBOL)
        self.name = name

    def text(self):
        """The text of a symbol token is its name."""
        symbol_name = self.name
        return symbol_name
def parse_arg(arg: str) -> Iterable[Token]:
    """
    Convert a single argument part into tokens.

    A memory operand ``offset(register)`` is split into two ARGUMENT tokens,
    the (lower-cased) register first and the offset second. Anything else
    becomes one ARGUMENT token. A comma attached to the end of the part is
    emitted as a separate COMMA token after the argument.

    :param arg: a non-empty part of a source line
    :raise ParseException: if a memory operand names an unknown register
    """
    has_trailing_comma = arg[-1] == ','
    if has_trailing_comma:
        arg = arg[:-1]

    memory_operand = MEMORY_ADDRESS_PATTERN.match(arg)
    if memory_operand is None:
        yield Token(TokenType.ARGUMENT, arg)
    else:
        register = memory_operand.group(2).lower()
        if register not in RISCV_REGS:
            raise ParseException(f'"{register}" is not a valid register!')
        yield Token(TokenType.ARGUMENT, register)
        yield Token(TokenType.ARGUMENT, memory_operand.group(1))

    if has_trailing_comma:
        yield COMMA
class RiscVPseudoOpToken(RiscVToken):
    """Token for a pseudo op together with its arguments."""

    def __init__(self, name, args):
        super().__init__(TokenType.PSEUDO_OP)
        self.name = name
        self.args = args

    def text(self):
        """Render the pseudo op name followed by its arguments."""
        return f"{self.name} {self.args}"
class RiscVTokenizer:
    """
    A tokenizer for the RISC-V syntax of a given CPU

    The produced tokens are collected in ``tokens``.
    """

    def __init__(self, input_assembly: 'RiscVInput', instructions: List[str]):
        """
        :param input_assembly: the input to read from
        :param instructions: names of all instructions that are accepted
        """
        self.input = input_assembly
        self.tokens: List['RiscVToken'] = []
        self.name = input_assembly.name
        self.instructions = instructions

    def tokenize(self):
        """
        Read the whole input and append the resulting tokens to ``tokens``.

        :raise ParseException: if the input continues with something that is
                               neither a pseudo op, a label, a comment nor a
                               known instruction
        """
        while self.input.has_next():
            # remove leading whitespaces, place cursor at text start
            self.input.consume_whitespace()

            # only whitespace was left (e.g. blank input): nothing to parse
            if not self.input.has_next():
                break

            # check if we have a pseudo op
            if self.input.peek_one_of(PSEUDO_OPS):
                self.parse_pseudo_op()
            # check if we have a symbol (like main:)
            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
                self.parse_symbol()
            # comment
            elif self.input.peek() in COMMENT_START:
                self.parse_comment()
            # must be instruction
            elif self.input.peek_one_of(self.instructions):
                self.parse_instruction()
            else:
                token = self.input.peek(size=5)
                raise ParseException("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))

            self.input.consume_whitespace()

    def parse_pseudo_op(self):
        """Consume a pseudo op and the rest of its line, which holds the arguments."""
        name = self.input.consume_one_of(PSEUDO_OPS)
        self.input.consume_whitespace(linebreak=False)

        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
        if not arg_str:
            args = []
        else:
            args = split_accepting_quotes(arg_str)

        # the token stores the name without the leading dot
        self.tokens.append(RiscVPseudoOpToken(name[1:], args))

    def parse_symbol(self):
        """Consume a label declaration and store it without the trailing colon."""
        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
        self.tokens.append(RiscVSymbolToken(name[:-1]))
        if not self.input.consume_whitespace():
            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
                self.input.context()))

    def parse_instruction(self):
        """Consume an instruction name and up to three comma separated arguments."""
        ins = self.input.consume_one_of(self.instructions)
        args = []
        self.input.consume_whitespace(linebreak=False)
        while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3:
            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
            args.append(arg)
            # another argument is only expected after a comma
            if self.input.peek(text=','):
                self.input.consume(text=',')
                self.input.consume_whitespace(linebreak=False)
            else:
                break
        self.tokens.append(RiscVInstructionToken(ins, args))

    def parse_comment(self):
        """Skip a comment."""
        # just consume the rest
        self.input.consume(regex=REG_UNTIL_NEWLINE)
def print_tokens(tokens: Iterable[Token]):
    """
    Print the text form of all tokens to stdout.

    Tokens of one source line end up on one output line: a line break is
    only written after a NEWLINE token. The output is flushed at the end.
    """
    for token in tokens:
        line_end = '\n' if token == NEWLINE else ''
        print(token, end=line_end)
    # writes nothing, only flushes what was printed above
    print("", flush=True, end="")

View File

@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo
from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException
from .Tokenizer import RiscVInput, RiscVTokenizer
from .Executable import Executable, LoadedExecutable, LoadedMemorySection
from .ExecutableParser import ExecutableParser

View File

@ -28,3 +28,15 @@ class Test(TestCase):
self.assertEqual(to_signed(0xffed36e4), -1231132)
self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF)
def test_bind_twos_complement(self):
    """bind_twos_complement keeps 32 bit signed values and wraps values outside that range."""
    minval = -(1 << 31)
    maxval = (1 << 31) - 1

    # (input, expected result, description)
    cases = [
        (minval, minval, "minval preserves"),
        (maxval, maxval, "maxval preserves"),
        (minval - 1, maxval, "minval-1 wraps"),
        (maxval + 1, minval, "maxval+1 wraps"),
        (0, 0, "0 is 0"),
        (1, 1, "1 is 1"),
        (-1, -1, "-1 is -1"),
    ]

    for value, expected, message in cases:
        # subTest reports every failing case instead of stopping at the first
        with self.subTest(message):
            self.assertEqual(bind_twos_complement(value), expected, message)