tokenizer reimplemented

assembly-parser-rework
Anton Lydike 3 years ago
parent 52e189c226
commit d5a4acef67

@ -9,7 +9,6 @@ on them.
import sys import sys
from typing import Tuple, List, Dict, Callable, Type from typing import Tuple, List, Dict, Callable, Type
from .Tokenizer import RiscVTokenizer
from .Executable import MemoryFlags from .Executable import MemoryFlags
from .Syscall import SyscallInterface, get_syscall_symbols from .Syscall import SyscallInterface, get_syscall_symbols
from .Exceptions import RiscemuBaseException, LaunchDebuggerException from .Exceptions import RiscemuBaseException, LaunchDebuggerException

@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes
from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
from .Exceptions import * from .Exceptions import *
from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE
from typing import Dict, Tuple, List, Optional from typing import Dict, Tuple, List, Optional
@ -22,7 +22,7 @@ class ExecutableParser:
tokenizer: 'RiscVTokenizer' tokenizer: 'RiscVTokenizer'
def __init__(self, tokenizer: 'RiscVTokenizer'): def __init__(self, tokenizer: 'RiscVTokenizer'):
self.instructions: List[RiscVInstructionToken] = list() self.instructions: List['RiscVInstructionToken'] = list()
self.symbols: Dict[str, Tuple[str, int]] = dict() self.symbols: Dict[str, Tuple[str, int]] = dict()
self.sections: Dict[str, MemorySection] = dict() self.sections: Dict[str, MemorySection] = dict()
self.tokenizer = tokenizer self.tokenizer = tokenizer
@ -37,11 +37,11 @@ class ExecutableParser:
:raise ParseException: Raises a ParseException when invalid input is read :raise ParseException: Raises a ParseException when invalid input is read
""" """
for token in self.tokenizer.tokens: for token in self.tokenizer.tokens:
if isinstance(token, RiscVInstructionToken): if isinstance(token, 'RiscVInstructionToken'):
self.parse_instruction(token) self.parse_instruction(token)
elif isinstance(token, RiscVSymbolToken): elif isinstance(token, 'RiscVSymbolToken'):
self.handle_symbol(token) self.handle_symbol(token)
elif isinstance(token, RiscVPseudoOpToken): elif isinstance(token, 'RiscVPseudoOpToken'):
self.handle_pseudo_op(token) self.handle_pseudo_op(token)
return self._get_execuable() return self._get_execuable()

@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
""" """
import re import re
from enum import IntEnum from dataclasses import dataclass
from typing import List from enum import Enum, auto
from typing import List, Iterable
from riscemu.decoder import RISCV_REGS
from .Exceptions import ParseException from .Exceptions import ParseException
# Character sequences that start a line comment.
LINE_COMMENT_STARTERS = ('#', ';', '//')

# Any run of whitespace separates parts of a line.
WHITESPACE_PATTERN = re.compile(r'\s+')

# Memory-address operands such as "0x10(sp)", "42(a0)" or "0b101(t1)":
# group 1 = offset literal (hex, decimal or binary), group 2 = register name.
# BUG FIX: the original pattern was a non-raw string (invalid '\d' escape)
# and used the ranges [A-f]/[A-z], which also match '[', '\\', ']', '^',
# '_' and '`' — replaced with explicit hex/alpha classes.
MEMORY_ADDRESS_PATTERN = re.compile(r'^(0[xX][0-9a-fA-F]+|\d+|0b[01]+)\(([A-Za-z]+[0-9]{0,2})\)$')
# Valid register names come straight from the decoder module.
REGISTER_NAMES = RISCV_REGS


def I(x):
    """Identity helper — returns its argument unchanged (kept for compatibility)."""
    return x
class TokenType(Enum):
    """The categories of tokens the tokenizer can produce."""
    COMMA = auto()
    ARGUMENT = auto()
    PSEUDO_OP = auto()
    INSTRUCTION_NAME = auto()
    NEWLINE = auto()
    LABEL = auto()


@dataclass(frozen=True)
class Token:
    """A single lexical token: its category plus the raw source text."""
    type: TokenType
    value: str

    def __str__(self):
        # Newline and comma tokens get fixed printable forms; everything
        # else is shown as an abbreviated type name with the raw value.
        if self.type == TokenType.NEWLINE:
            return '\\n'
        if self.type == TokenType.COMMA:
            return ', '
        return f'{self.type.name[:3]}({self.value}) '
# Shared singleton tokens: the tokenizer yields these instead of building
# a fresh Token object for every separator it encounters.
NEWLINE = Token(TokenType.NEWLINE, '\n')
COMMA = Token(TokenType.COMMA, ',')
def tokenize(input: Iterable[str]) -> Iterable[Token]:
    """Lazily turn lines of assembly source into a stream of Tokens.

    :param input: an iterable of source lines (e.g. an open file)
    :return: a generator of Tokens; one NEWLINE token is emitted after
             each non-empty line
    """
    for line in input:
        # cut off line comments ('#', ';', '//') before splitting
        for line_comment_start in LINE_COMMENT_STARTERS:
            if line_comment_start in line:
                line = line[:line.index(line_comment_start)]
        # BUG FIX: str.strip() returns a new string rather than mutating —
        # the original discarded the result, so whitespace-only lines were
        # never skipped and produced spurious NEWLINE tokens.
        line = line.strip(' \t\n')
        if not line:
            continue

        parts = [part for part in re.split(WHITESPACE_PATTERN, line) if part]

        yield from parse_line(parts)
        yield NEWLINE
def parse_line(parts: List[str]) -> Iterable[Token]:
    """Convert the whitespace-split pieces of one source line into tokens.

    The first piece classifies the line (pseudo-op / label / instruction);
    the remaining pieces are arguments or comma separators.
    """
    if not parts:
        return
    head, *rest = parts
    if head[0] == '.':
        yield Token(TokenType.PSEUDO_OP, head)
    elif head[-1] == ':':
        yield Token(TokenType.LABEL, head)
    else:
        yield Token(TokenType.INSTRUCTION_NAME, head)
    for piece in rest:
        if piece == ',':
            yield COMMA
        else:
            yield from parse_arg(piece)
def parse_arg(arg: str) -> Iterable[Token]:
    """Tokenize a single instruction argument.

    Memory-address expressions such as "0x10(sp)" are split into two
    ARGUMENT tokens (register first, then offset); a trailing comma on
    the argument yields a COMMA token after it.

    :raise ParseException: when a memory expression names an unknown register
    """
    has_trailing_comma = arg.endswith(',')
    if has_trailing_comma:
        arg = arg[:-1]
    match_result = MEMORY_ADDRESS_PATTERN.match(arg)
    if match_result is None:
        yield Token(TokenType.ARGUMENT, arg)
    else:
        register = match_result.group(2).lower()
        if register not in RISCV_REGS:
            raise ParseException(f'"{register}" is not a valid register!')
        yield Token(TokenType.ARGUMENT, register)
        yield Token(TokenType.ARGUMENT, match_result.group(1))
    if has_trailing_comma:
        yield COMMA
def print_tokens(tokens: Iterable[Token]):
    """Print a token stream to stdout, one printed line per NEWLINE token."""
    for token in tokens:
        # NEWLINE tokens terminate the printed line; other tokens carry
        # their own trailing spacing in __str__.
        terminator = '\n' if token == NEWLINE else ''
        print(token, end=terminator)
    # flush without emitting any extra output
    print("", flush=True, end="")

@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo
from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \ from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException
from .Tokenizer import RiscVInput, RiscVTokenizer
from .Executable import Executable, LoadedExecutable, LoadedMemorySection from .Executable import Executable, LoadedExecutable, LoadedMemorySection
from .ExecutableParser import ExecutableParser from .ExecutableParser import ExecutableParser

@ -28,3 +28,15 @@ class Test(TestCase):
self.assertEqual(to_signed(0xffed36e4), -1231132) self.assertEqual(to_signed(0xffed36e4), -1231132)
self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF) self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF)
def test_bind_twos_complement(self):
minval = -(1 << 31)
maxval = ((1 << 31)-1)
self.assertEqual(bind_twos_complement(minval), minval, "minval preserves")
self.assertEqual(bind_twos_complement(minval), minval, )
self.assertEqual(bind_twos_complement(maxval), maxval, "maxval preserves")
self.assertEqual(bind_twos_complement(minval - 1), maxval, "minval-1 wraps")
self.assertEqual(bind_twos_complement(maxval + 1), minval, "maxval+1 wraps")
self.assertEqual(bind_twos_complement(0), 0, "0 is 0")
self.assertEqual(bind_twos_complement(1), 1, "1 is 1")
self.assertEqual(bind_twos_complement(-1), -1, "-1 is -1")
Loading…
Cancel
Save