tokenizer reimplemented

assembly-parser-rework
Anton Lydike 3 years ago
parent 52e189c226
commit d5a4acef67

@ -9,7 +9,6 @@ on them.
import sys import sys
from typing import Tuple, List, Dict, Callable, Type from typing import Tuple, List, Dict, Callable, Type
from .Tokenizer import RiscVTokenizer
from .Executable import MemoryFlags from .Executable import MemoryFlags
from .Syscall import SyscallInterface, get_syscall_symbols from .Syscall import SyscallInterface, get_syscall_symbols
from .Exceptions import RiscemuBaseException, LaunchDebuggerException from .Exceptions import RiscemuBaseException, LaunchDebuggerException

@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import * from .Exceptions import *
from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE
from typing import Dict, Tuple, List, Optional from typing import Dict, Tuple, List, Optional
@ -22,7 +22,7 @@ class ExecutableParser:
tokenizer: 'RiscVTokenizer' tokenizer: 'RiscVTokenizer'
def __init__(self, tokenizer: 'RiscVTokenizer'):
    """Create a parser that will consume the given tokenizer's token stream."""
    self.instructions: List['RiscVInstructionToken'] = []
    self.symbols: Dict[str, Tuple[str, int]] = {}
    self.sections: Dict[str, MemorySection] = {}
    self.tokenizer = tokenizer
@ -37,11 +37,11 @@ class ExecutableParser:
:raise ParseException: Raises a ParseException when invalid input is read :raise ParseException: Raises a ParseException when invalid input is read
""" """
for token in self.tokenizer.tokens: for token in self.tokenizer.tokens:
if isinstance(token, RiscVInstructionToken): if isinstance(token, 'RiscVInstructionToken'):
self.parse_instruction(token) self.parse_instruction(token)
elif isinstance(token, RiscVSymbolToken): elif isinstance(token, 'RiscVSymbolToken'):
self.handle_symbol(token) self.handle_symbol(token)
elif isinstance(token, RiscVPseudoOpToken): elif isinstance(token, 'RiscVPseudoOpToken'):
self.handle_pseudo_op(token) self.handle_pseudo_op(token)
return self._get_execuable() return self._get_execuable()

@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
""" """
import re import re
from enum import IntEnum from dataclasses import dataclass
from typing import List from enum import Enum, auto
from typing import List, Iterable
from riscemu.decoder import RISCV_REGS
from .Exceptions import ParseException from .Exceptions import ParseException
# character sequences that introduce a comment running to end of line
LINE_COMMENT_STARTERS = ('#', ';', '//')

WHITESPACE_PATTERN = re.compile(r'\s+')

# Matches memory-address operands such as "0x10(sp)", "4(a0)" or "0b101(t2)":
# group 1 is the offset (hex, decimal or binary literal), group 2 the register.
# Bug fix: the old character classes were wrong — '[A-f0-9]' spans 'G'-'Z' and
# the punctuation between 'Z' and 'a' instead of the hex digits, and '[A-z]'
# also includes '[', '\', ']', '^', '_' and '`'. The pattern is now a raw
# string as well, so '\d' and '\(' are not fragile escape sequences.
MEMORY_ADDRESS_PATTERN = re.compile(r'^(0[xX][0-9a-fA-F]+|\d+|0b[01]+)\(([A-Za-z]+[0-9]{0,2})\)$')

REGISTER_NAMES = RISCV_REGS


def I(x):
    """Identity helper (kept for backwards compatibility; was a lambda)."""
    return x
class TokenType(Enum):
    """The categories of tokens the tokenizer can produce."""
    COMMA = auto()
    ARGUMENT = auto()
    PSEUDO_OP = auto()
    INSTRUCTION_NAME = auto()
    NEWLINE = auto()
    LABEL = auto()


@dataclass(frozen=True)
class Token:
    """An immutable lexer token: a category plus the raw text it was read from."""
    type: TokenType
    value: str

    def __str__(self):
        # separator tokens render as fixed glyphs rather than "TYPE(value)"
        if self.type == TokenType.NEWLINE:
            return '\\n'
        if self.type == TokenType.COMMA:
            return ', '
        # e.g. ARG(a0), INS(addi), LAB(main:)
        return f'{self.type.name[:3]}({self.value}) '


# shared singleton tokens for the two separator kinds
NEWLINE = Token(TokenType.NEWLINE, '\n')
COMMA = Token(TokenType.COMMA, ',')
"""
return "unknown"
def tokenize(input: Iterable[str]) -> Iterable[Token]:
    """Lazily convert lines of assembly source into a stream of Tokens.

    Comments (introduced by any of LINE_COMMENT_STARTERS) are cut off, blank
    lines are skipped, and every line that produced tokens is terminated by
    the NEWLINE singleton token.
    """
    for line in input:
        # cut the line at the first comment marker, if any
        for line_comment_start in LINE_COMMENT_STARTERS:
            if line_comment_start in line:
                line = line[:line.index(line_comment_start)]
        # Bug fix: str.strip returns a new string; the old code discarded the
        # result, so whitespace-only lines survived the emptiness check below
        # and emitted spurious NEWLINE tokens.
        line = line.strip(' \t\n')
        if not line:
            continue

        parts = list(part for part in re.split(WHITESPACE_PATTERN, line) if part)

        yield from parse_line(parts)
        yield NEWLINE
def parse_line(parts: List[str]) -> Iterable[Token]:
    """Turn the whitespace-split pieces of one source line into tokens.

    The first piece is classified as a pseudo-op directive, a label, or an
    instruction mnemonic; the remaining pieces are operands and commas.
    """
    if not parts:
        return ()

    head = parts[0]
    if head[0] == '.':
        yield Token(TokenType.PSEUDO_OP, head)
    elif head[-1] == ':':
        yield Token(TokenType.LABEL, head)
    else:
        yield Token(TokenType.INSTRUCTION_NAME, head)

    # everything after the first piece is an operand, possibly a lone comma
    for piece in parts[1:]:
        if piece == ',':
            yield COMMA
        else:
            yield from parse_arg(piece)
return "{} {}".format(self.name, self.args) def parse_arg(arg: str) -> Iterable[Token]:
comma = arg[-1] == ','
arg = arg[:-1] if comma else arg
class RiscVTokenizer: mem_match_resul = re.match(MEMORY_ADDRESS_PATTERN, arg)
""" if mem_match_resul:
A tokenizer for the RISC-V syntax of a given CPU register = mem_match_resul.group(2).lower()
""" if register not in RISCV_REGS:
def __init__(self, input_assembly: RiscVInput, instructions: List[str]): raise ParseException(f'"{register}" is not a valid register!')
self.input = input_assembly yield Token(TokenType.ARGUMENT, register)
self.tokens: List[RiscVToken] = [] yield Token(TokenType.ARGUMENT, mem_match_resul.group(1))
self.name = input_assembly.name else:
self.instructions = instructions yield Token(TokenType.ARGUMENT, arg)
if comma:
def tokenize(self): yield COMMA
while self.input.has_next():
# remove leading whitespaces, place cursor at text start
def print_tokens(tokens: Iterable[Token]):
    """Pretty-print a token stream; NEWLINE tokens terminate a printed line."""
    for tok in tokens:
        line_end = '\n' if tok == NEWLINE else ''
        print(tok, end=line_end)
    # force everything out to the terminal
    print("", flush=True, end="")

@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo
from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \ from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException
from .Tokenizer import RiscVInput, RiscVTokenizer
from .Executable import Executable, LoadedExecutable, LoadedMemorySection from .Executable import Executable, LoadedExecutable, LoadedMemorySection
from .ExecutableParser import ExecutableParser from .ExecutableParser import ExecutableParser

@ -28,3 +28,15 @@ class Test(TestCase):
self.assertEqual(to_signed(0xffed36e4), -1231132) self.assertEqual(to_signed(0xffed36e4), -1231132)
self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF) self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF)
def test_bind_twos_complement(self):
    """bind_twos_complement wraps arbitrary ints into the signed 32-bit range.

    Checks that the extrema are preserved, that values one past each
    extremum wrap around to the opposite one, and that small values
    pass through unchanged. (A verbatim duplicate of the minval
    assertion, carrying an empty message argument, was removed.)
    """
    minval = -(1 << 31)
    maxval = (1 << 31) - 1
    self.assertEqual(bind_twos_complement(minval), minval, "minval preserves")
    self.assertEqual(bind_twos_complement(maxval), maxval, "maxval preserves")
    self.assertEqual(bind_twos_complement(minval - 1), maxval, "minval-1 wraps")
    self.assertEqual(bind_twos_complement(maxval + 1), minval, "maxval+1 wraps")
    self.assertEqual(bind_twos_complement(0), 0, "0 is 0")
    self.assertEqual(bind_twos_complement(1), 1, "1 is 1")
    self.assertEqual(bind_twos_complement(-1), -1, "-1 is -1")
Loading…
Cancel
Save