tokenizer reimplemented
parent 52e189c226
commit d5a4acef67
@@ -9,7 +9,6 @@ on them.
 import sys
 from typing import Tuple, List, Dict, Callable, Type

-from .Tokenizer import RiscVTokenizer
 from .Executable import MemoryFlags
 from .Syscall import SyscallInterface, get_syscall_symbols
 from .Exceptions import RiscemuBaseException, LaunchDebuggerException

@@ -10,7 +10,7 @@ from .helpers import parse_numeric_argument, int_to_bytes
 from .Executable import Executable, InstructionMemorySection, MemorySection, MemoryFlags
 from .Exceptions import *

-from .Tokenizer import RiscVTokenizer, RiscVInstructionToken, RiscVSymbolToken, RiscVPseudoOpToken
+from .Tokenizer import tokenize, TokenType, Token, COMMA, NEWLINE

 from typing import Dict, Tuple, List, Optional

@@ -22,7 +22,7 @@ class ExecutableParser:
     tokenizer: 'RiscVTokenizer'

     def __init__(self, tokenizer: 'RiscVTokenizer'):
-        self.instructions: List[RiscVInstructionToken] = list()
+        self.instructions: List['RiscVInstructionToken'] = list()
         self.symbols: Dict[str, Tuple[str, int]] = dict()
         self.sections: Dict[str, MemorySection] = dict()
         self.tokenizer = tokenizer
@@ -37,11 +37,11 @@ class ExecutableParser:
         :raise ParseException: Raises a ParseException when invalid input is read
         """
         for token in self.tokenizer.tokens:
-            if isinstance(token, RiscVInstructionToken):
+            if isinstance(token, 'RiscVInstructionToken'):
                 self.parse_instruction(token)
-            elif isinstance(token, RiscVSymbolToken):
+            elif isinstance(token, 'RiscVSymbolToken'):
                 self.handle_symbol(token)
-            elif isinstance(token, RiscVPseudoOpToken):
+            elif isinstance(token, 'RiscVPseudoOpToken'):
                 self.handle_pseudo_op(token)
         return self._get_execuable()

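Editor's note, not part of the commit: quoted names like 'RiscVInstructionToken' work as forward references in type annotations, but isinstance() needs a real class object, so the quoted isinstance() checks above raise TypeError when this branch runs. A minimal illustration with a hypothetical class:

# Editor's illustration (Foo is hypothetical, not from the commit):
class Foo:
    pass

x: 'Foo' = Foo()           # fine: string annotations are resolved lazily
print(isinstance(x, Foo))  # True
# isinstance(x, 'Foo')     # TypeError: isinstance() arg 2 must be a type
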
@@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
 """

 import re
-from enum import IntEnum
-from typing import List
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import List, Iterable
+from riscemu.decoder import RISCV_REGS

 from .Exceptions import ParseException

-PSEUDO_OPS = [
-    '.asciiz',
-    '.double',
-    '.extern',
-    '.global',
-    '.align',
-    '.float',
-    '.kdata',
-    '.ktext',
-    '.space',
-    '.ascii',
-    '.byte',
-    '.data',
-    '.half',
-    '.text',
-    '.word',
-    '.set',
-]
+LINE_COMMENT_STARTERS = ('#', ';', '//')
+WHITESPACE_PATTERN = re.compile(r'\s+')
+MEMORY_ADDRESS_PATTERN = re.compile('^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$')
+REGISTER_NAMES = RISCV_REGS

-COMMENT_START = ["#", ";"]
-I = lambda x: x
-
-REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):')
-
-REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')
-
-REG_WHITESPACE = re.compile(r'^\s*')
-
-REG_NONWHITESPACE = re.compile(r'^[^\s]*')
-
-REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')
-
-REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')
-
-REG_VALID_ARGUMENT = re.compile(
-    r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?'
-)
-
-REG_ARG_SPLIT = re.compile(r'^,[ \t]*')
+class TokenType(Enum):
+    COMMA = auto()
+    ARGUMENT = auto()
+    PSEUDO_OP = auto()
+    INSTRUCTION_NAME = auto()
+    NEWLINE = auto()
+    LABEL = auto()


-def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
-    pos = 0
-    last_piece = 0
-    pieces = []
-    in_quotes = False
-    if string is None:
-        return pieces
-    while pos < len(string):
-        match = at.match(string[pos:])
-        if match is not None:
-            if not in_quotes:
-                pieces.append(string[last_piece:pos])
-                pos += len(match.group(0))
-                last_piece = pos
-            else:
-                pos += len(match.group(0))
-        elif string[pos] in quotes:
-            in_quotes = not in_quotes
-            pos += 1
-        elif string[pos] in COMMENT_START and not in_quotes:  # entering comment
-            break
-        else:
-            pos += 1
-    if in_quotes:
-        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))
-    pieces.append(string[last_piece:pos])
-    return pieces
-
-
-class RiscVInput:
-    """
-    Represents an Assembly file
-    """
-    def __init__(self, content: str, name: str):
-        self.content = content
-        self.pos = 0
-        self.len = len(content)
-        self.name = name
-
-    @staticmethod
-    def from_file(src: str):
-        with open(src, 'r') as f:
-            return RiscVInput(f.read(), src)
-
-    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
-        at = self.pos + offset
-
-        if regex:
-            if not isinstance(regex, re.Pattern):
-                print("uncompiled regex passed to peek!")
-                regex = re.compile(regex)
-            match = regex.match(self.content[at:])
-            if match is None:
-                return None
-
-            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
-                print("Cannot peek regex group that does not start at match start!")
-                return None
-            return match.group(regex_group)
-        if text:
-            if self.content[at:].startswith(text):
-                return self.content[at:at + len(text)]
-            return False
-        return self.content[at:at + size]
-
-    def peek_one_of(self, options: List[str]):
-        longest_peek = 0
-        ret = False
-        for text in options:
-            if self.peek(text=text):
-                if len(text) > longest_peek:
-                    longest_peek = len(text)
-                    ret = text
-        return ret
-
-    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
-        at = self.pos
-
-        if regex:
-            if not isinstance(regex, re.Pattern):
-                print("uncompiled regex passed to peek!")
-                regex = re.compile(regex)
-            match = regex.match(self.content[at:])
-            if match is None:
-                return None
-
-            if regex_group != 0 and not match.group(0).startswith(match.group(regex_group)):
-                print("Cannot consume regex group that does not start at match start!")
-                return None
-            self.pos += len(match.group(regex_group))
-            return match.group(regex_group)
-
-        if text:
-            if self.content[at:].startswith(text):
-                self.pos += len(text)
-                return text
-            return None
-
-        self.pos += size
-        return self.content[at:at + size]
-
-    def consume_one_of(self, options: List[str]):
-        longest_peek = 0
-        ret = False
-        for text in options:
-            if self.peek(text=text):
-                if len(text) > longest_peek:
-                    longest_peek = len(text)
-                    ret = text
-        self.consume(text=ret)
-        return ret
-
-    def seek_newline(self):
-        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)
-
-    def consume_whitespace(self, linebreak=True):
-        if linebreak:
-            return self.consume(regex=REG_WHITESPACE)
-        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)
-
-    def has_next(self):
-        return self.pos < self.len
-
-    def context(self, size: int = 5):
-        """
-        returns a context string:
-        <local input before pos>|<local input after pos>
-        """
-        start = max(self.pos - size, 0)
-        end = min(self.pos + size, self.len - 1)
-
-        return self.content[start:self.pos] + '|' + self.content[self.pos:end]
-
-
-class TokenType(IntEnum):
-    SYMBOL = 0
-    INSTRUCTION = 1
-    PSEUDO_OP = 2
-
-    def __repr__(self):
-        return self.name
-
-    def __str__(self):
-        return self.name
+@dataclass(frozen=True)
+class Token:
+    type: TokenType
+    value: str
+
+    def __str__(self):
+        if self.type == TokenType.NEWLINE:
+            return '\\n'
+        if self.type == TokenType.COMMA:
+            return ', '
+        return '{}({}) '.format(self.type.name[0:3], self.value)
+
+
+NEWLINE = Token(TokenType.NEWLINE, '\n')
+COMMA = Token(TokenType.COMMA, ',')


-class RiscVToken:
-    type: TokenType
-
-    def __init__(self, t_type: TokenType):
-        self.type = t_type
-
-    def __repr__(self):
-        return "{}[{}]({})".format(self.__class__.__name__, self.type, self.text())
-
-    def text(self):
-        """
-        create text representation of instruction
-        """
-        return "unknown"
+def tokenize(input: Iterable[str]) -> Iterable[Token]:
+    for line in input:
+        for line_comment_start in LINE_COMMENT_STARTERS:
+            if line_comment_start in line:
+                line = line[:line.index(line_comment_start)]
+        line.strip(' \t\n')
+        if not line:
+            continue
+
+        parts = list(part for part in re.split(WHITESPACE_PATTERN, line) if part)
+
+        yield from parse_line(parts)
+        yield NEWLINE


-class RiscVInstructionToken(RiscVToken):
-    def __init__(self, name, args):
-        super().__init__(TokenType.INSTRUCTION)
-        self.instruction = name
-        self.args = args
-
-    def text(self):
-        if len(self.args) == 0:
-            return self.instruction
-        if len(self.args) == 1:
-            return "{} {}".format(self.instruction, self.args[0])
-        if len(self.args) == 2:
-            return "{} {}, {}".format(self.instruction, *self.args)
-        return "{} {}, {}, {}".format(self.instruction, *self.args)
+def parse_line(parts: List[str]) -> Iterable[Token]:
+    if len(parts) == 0:
+        return ()
+    first_token = parts[0]
+
+    if first_token[0] == '.':
+        yield Token(TokenType.PSEUDO_OP, first_token)
+    elif first_token[-1] == ':':
+        yield Token(TokenType.LABEL, first_token)
+    else:
+        yield Token(TokenType.INSTRUCTION_NAME, first_token)
+
+    for part in parts[1:]:
+        if part == ',':
+            yield COMMA
+            continue
+        yield from parse_arg(part)


-class RiscVSymbolToken(RiscVToken):
-    def __init__(self, name):
-        super().__init__(TokenType.SYMBOL)
-        self.name = name
-
-    def text(self):
-        return self.name
+def parse_arg(arg: str) -> Iterable[Token]:
+    comma = arg[-1] == ','
+    arg = arg[:-1] if comma else arg
+    mem_match_resul = re.match(MEMORY_ADDRESS_PATTERN, arg)
+    if mem_match_resul:
+        register = mem_match_resul.group(2).lower()
+        if register not in RISCV_REGS:
+            raise ParseException(f'"{register}" is not a valid register!')
+        yield Token(TokenType.ARGUMENT, register)
+        yield Token(TokenType.ARGUMENT, mem_match_resul.group(1))
+    else:
+        yield Token(TokenType.ARGUMENT, arg)
+    if comma:
+        yield COMMA


-class RiscVPseudoOpToken(RiscVToken):
-    def __init__(self, name, args):
-        super().__init__(TokenType.PSEUDO_OP)
-        self.name = name
-        self.args = args
-
-    def text(self):
-        return "{} {}".format(self.name, self.args)
-
-
-class RiscVTokenizer:
-    """
-    A tokenizer for the RISC-V syntax of a given CPU
-    """
-    def __init__(self, input_assembly: RiscVInput, instructions: List[str]):
-        self.input = input_assembly
-        self.tokens: List[RiscVToken] = []
-        self.name = input_assembly.name
-        self.instructions = instructions
-
-    def tokenize(self):
-        while self.input.has_next():
-            # remove leading whitespaces, place cursor at text start
-            self.input.consume_whitespace()
-
-            # check if we have a pseudo op
-            if self.input.peek_one_of(PSEUDO_OPS):
-                self.parse_pseudo_op()
-
-            # check if we have a symbol (like main:)
-            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
-                self.parse_symbol()
-
-            # comment
-            elif self.input.peek() in COMMENT_START:
-                self.parse_comment()
-
-            # must be instruction
-            elif self.input.peek_one_of(self.instructions):
-                self.parse_instruction()
-            else:
-                token = self.input.peek(size=5)
-                raise ParseException("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))
-            self.input.consume_whitespace()
-
-    def parse_pseudo_op(self):
-        name = self.input.consume_one_of(PSEUDO_OPS)
-        self.input.consume_whitespace(linebreak=False)
-
-        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
-        if not arg_str:
-            args = []
-        else:
-            args = split_accepting_quotes(arg_str)
-
-        self.tokens.append(RiscVPseudoOpToken(name[1:], args))
-
-    def parse_symbol(self):
-        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
-        self.tokens.append(RiscVSymbolToken(name[:-1]))
-        if not self.input.consume_whitespace():
-            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
-                self.input.context()))
-
-    def parse_instruction(self):
-        ins = self.input.consume_one_of(self.instructions)
-        args = []
-        self.input.consume_whitespace(linebreak=False)
-        while self.input.peek(regex=REG_VALID_ARGUMENT) and len(args) < 3:
-            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
-            args.append(arg)
-            if self.input.peek(text=','):
-                self.input.consume(text=',')
-                self.input.consume_whitespace(linebreak=False)
-            else:
-                break
-        self.tokens.append(RiscVInstructionToken(ins, args))
-
-    def parse_comment(self):
-        # just consume the rest
-        self.input.consume(regex=REG_UNTIL_NEWLINE)
+def print_tokens(tokens: Iterable[Token]):
+    for token in tokens:
+        print(token, end='\n' if token == NEWLINE else '')
+    print("", flush=True, end="")
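Editor's note: a short usage sketch of the reimplemented generator API above (the import path riscemu.Tokenizer is inferred from the hunk context, not spelled out on this page):

# Editor's illustration of the new API shown in this hunk.
from riscemu.Tokenizer import tokenize, print_tokens

source = [
    "main:                 # a label line",
    "    addi  a0, zero, 42",
    "    sw    a0, 0(sp)   ; memory-address argument",
]

# tokenize() consumes an iterable of lines and yields a flat token stream,
# stripping comments and terminating each source line with the NEWLINE token.
tokens = list(tokenize(source))

# The memory operand '0(sp)' is decomposed by parse_arg() into a register
# token ARG(sp) followed by an offset token ARG(0).
print_tokens(tokens)
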
@@ -11,8 +11,6 @@ It contains everything needed to run assembly files, so you don't need any custo
 from .Exceptions import RiscemuBaseException, LaunchDebuggerException, InvalidSyscallException, LinkerException, \
     ParseException, NumberFormatException, InvalidRegisterException, MemoryAccessException, OutOfMemoryException

-from .Tokenizer import RiscVInput, RiscVTokenizer
-
 from .Executable import Executable, LoadedExecutable, LoadedMemorySection

 from .ExecutableParser import ExecutableParser
|
@ -28,3 +28,15 @@ class Test(TestCase):
|
||||
self.assertEqual(to_signed(0xffed36e4), -1231132)
|
||||
self.assertEqual(to_signed(0x0FFFFFFF), 0x0FFFFFFF)
|
||||
|
||||
def test_bind_twos_complement(self):
|
||||
minval = -(1 << 31)
|
||||
maxval = ((1 << 31)-1)
|
||||
|
||||
self.assertEqual(bind_twos_complement(minval), minval, "minval preserves")
|
||||
self.assertEqual(bind_twos_complement(minval), minval, )
|
||||
self.assertEqual(bind_twos_complement(maxval), maxval, "maxval preserves")
|
||||
self.assertEqual(bind_twos_complement(minval - 1), maxval, "minval-1 wraps")
|
||||
self.assertEqual(bind_twos_complement(maxval + 1), minval, "maxval+1 wraps")
|
||||
self.assertEqual(bind_twos_complement(0), 0, "0 is 0")
|
||||
self.assertEqual(bind_twos_complement(1), 1, "1 is 1")
|
||||
self.assertEqual(bind_twos_complement(-1), -1, "-1 is -1")
|
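Editor's note: for readers without riscemu's helpers module at hand, a minimal sketch of the behaviour these assertions pin down (assuming the 32-bit width used above; the real helper's signature may differ):

# Editor's sketch, not the committed implementation.
def bind_twos_complement(value: int, bits: int = 32) -> int:
    """Wrap an arbitrary int into the signed two's-complement range of `bits` bits."""
    mask = (1 << bits) - 1         # 0xFFFFFFFF for 32 bits
    value &= mask                  # keep only the low `bits` bits
    if value & (1 << (bits - 1)):  # sign bit set: interpret as negative
        value -= 1 << bits
    return value

# Mirrors two of the assertions above:
assert bind_twos_complement(-(1 << 31) - 1) == (1 << 31) - 1  # minval-1 wraps
assert bind_twos_complement(1 << 31) == -(1 << 31)            # maxval+1 wraps
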