|
|
|
@ -5,316 +5,96 @@ SPDX-License-Identifier: MIT
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
from enum import IntEnum
|
|
|
|
|
from typing import List
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from enum import Enum, auto
|
|
|
|
|
from typing import List, Iterable
|
|
|
|
|
from riscemu.decoder import RISCV_REGS
|
|
|
|
|
|
|
|
|
|
from .Exceptions import ParseException
|
|
|
|
|
|
|
|
|
|
PSEUDO_OPS = [
|
|
|
|
|
'.asciiz',
|
|
|
|
|
'.double',
|
|
|
|
|
'.extern',
|
|
|
|
|
'.global',
|
|
|
|
|
'.align',
|
|
|
|
|
'.float',
|
|
|
|
|
'.kdata',
|
|
|
|
|
'.ktext',
|
|
|
|
|
'.space',
|
|
|
|
|
'.ascii',
|
|
|
|
|
'.byte',
|
|
|
|
|
'.data',
|
|
|
|
|
'.half',
|
|
|
|
|
'.text',
|
|
|
|
|
'.word',
|
|
|
|
|
'.set',
|
|
|
|
|
]
|
|
|
|
|
LINE_COMMENT_STARTERS = ('#', ';', '//')
|
|
|
|
|
WHITESPACE_PATTERN = re.compile(r'\s+')
|
|
|
|
|
# Matches a memory operand written as "<offset>(<register>)", e.g. "8(sp)".
# Group 1 is the offset literal (0x/0X hex, decimal, or 0b binary), group 2 the register name.
# The pattern is a raw string so that "\d" and "\(" reach the regex engine unchanged instead
# of being treated as (invalid, deprecated) string escape sequences.
# NOTE(review): the ranges "A-f" and "A-z" also admit "G".."Z" and the punctuation between
# "Z" and "a"; they are left as they were because parse_arg validates group 2 against
# RISCV_REGS -- confirm before tightening them.
MEMORY_ADDRESS_PATTERN = re.compile(r'^(0[xX][A-f0-9]+|\d+|0b[0-1]+)\(([A-z]+[0-9]{0,2})\)$')
|
|
|
|
|
REGISTER_NAMES = RISCV_REGS
|
|
|
|
|
|
|
|
|
|
COMMENT_START = ["#", ";"]
|
|
|
|
|
I = lambda x: x
|
|
|
|
|
|
|
|
|
|
REG_VALID_SYMBOL_LABEL = re.compile(r'^([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_]):')
|
|
|
|
|
class TokenType(Enum):
    """
    Kinds of token emitted by the line-based tokenizer functions in this module.

    Values are assigned by ``auto()`` in declaration order.
    """
    COMMA = auto()             # the ',' between arguments
    ARGUMENT = auto()          # any word following the first word of a line
    PSEUDO_OP = auto()         # first word of a line starting with '.'
    INSTRUCTION_NAME = auto()  # first word of a line that is neither pseudo op nor label
    NEWLINE = auto()           # end-of-line marker
    LABEL = auto()             # first word of a line ending with ':'
|
|
|
|
|
|
|
|
|
|
REG_WHITESPACE_UNTIL_NEWLINE = re.compile(r'^(\s*)\n')
|
|
|
|
|
|
|
|
|
|
REG_WHITESPACE = re.compile(r'^\s*')
|
|
|
|
|
|
|
|
|
|
REG_NONWHITESPACE = re.compile(r'^[^\s]*')
|
|
|
|
|
|
|
|
|
|
REG_UNTIL_NEWLINE = re.compile(r'^[^\n]*')
|
|
|
|
|
|
|
|
|
|
REG_WHITESPACE_NO_LINEBREAK = re.compile(r'^[ \t]*')
|
|
|
|
|
|
|
|
|
|
REG_VALID_ARGUMENT = re.compile(
|
|
|
|
|
r'^([+-]?(0x[0-9A-f]+|[0-9]+)|[A-z_.][A-z0-9_.]*[A-z_0-9]|[A-z_])(\(([A-z_.][A-z_0-9.]*[A-z_0-9]|[A-z_])\))?'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
REG_ARG_SPLIT = re.compile(r'^,[ \t]*')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_accepting_quotes(string, at=REG_ARG_SPLIT, quotes=('"', "'")):
    """
    Split ``string`` at every match of ``at`` that is not inside quotes.

    Scanning stops at the first unquoted character found in ``COMMENT_START``; the text
    from there on is not part of any piece. Quotes are kept in the returned pieces.

    :param string: the text to split; ``None`` yields an empty list
    :param at: compiled pattern describing a separator; it is matched against the
               remaining text at each position
    :param quotes: characters that open a quoted run. A run is closed only by the same
                   character that opened it, so ``"it's, fine"`` stays one piece.
    :return: list of pieces in order of appearance (always at least one piece for a
             non-``None`` input, possibly the empty string)
    """
    pieces = []
    if string is None:
        return pieces

    pos = 0
    piece_start = 0
    open_quote = None  # the quote character of the run we are inside, if any

    while pos < len(string):
        char = string[pos]
        match = at.match(string[pos:])
        # a zero-length match can never advance the cursor, so treat it as "no separator"
        if match is not None and match.group(0):
            if open_quote is None:
                pieces.append(string[piece_start:pos])
                pos += len(match.group(0))
                piece_start = pos
            else:
                # separators inside quotes are ordinary text
                pos += len(match.group(0))
        elif open_quote is not None:
            # inside quotes only the matching quote character ends the run; the other
            # quote kind and comment starters are plain text here
            if char == open_quote:
                open_quote = None
            pos += 1
        elif char in quotes:
            open_quote = char
            pos += 1
        elif char in COMMENT_START:  # entering comment
            break
        else:
            pos += 1

    if open_quote is not None:
        print("[Tokenizer.split] unbalanced quotes in \"{}\"!".format(string))

    pieces.append(string[piece_start:pos])
    return pieces
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVInput:
    """
    Represents an Assembly file

    Holds the text of the file together with a read cursor (``pos``). The ``peek*``
    methods look at the text at the cursor without moving it; the ``consume*`` methods
    return the same text and move the cursor past it.
    """

    def __init__(self, content: str, name: str):
        """
        :param content: the complete text to read from
        :param name: a name for this input (``from_file`` uses the file path)
        """
        self.content = content
        self.pos = 0
        self.len = len(content)
        self.name = name

    @staticmethod
    def from_file(src: str):
        """Read the file at path ``src`` and return a RiscVInput named after that path."""
        with open(src, 'r') as f:
            return RiscVInput(f.read(), src)

    def _match_regex(self, at: int, regex, regex_group: int, action: str):
        """
        Match ``regex`` against the content starting at index ``at``.

        :param action: "peek" or "consume"; only used in the diagnostic messages
        :return: the text of ``regex_group``, or None if the pattern does not match, the
                 group did not take part in the match, or the group does not begin where
                 the whole match begins
        """
        if not isinstance(regex, re.Pattern):
            print("uncompiled regex passed to {}!".format(action))
            regex = re.compile(regex)
        match = regex.match(self.content[at:])
        if match is None:
            return None
        group_text = match.group(regex_group)
        if group_text is None:
            return None
        # a group that starts later than the match would leave unread text behind the cursor
        if regex_group != 0 and not match.group(0).startswith(group_text):
            print("Cannot {} regex group that does not start at match start!".format(action))
            return None
        return group_text

    def peek(self, offset: int = 0, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Look at the text ``offset`` characters after the cursor without moving the cursor.

        Exactly one mode is used, checked in this order:

        * ``regex``: return the text of ``regex_group`` of a match at that position,
          or None if there is none
        * ``text``: return ``text`` if the content continues with it, otherwise False
        * otherwise: return the next ``size`` characters (fewer at the end of the content)
        """
        at = self.pos + offset

        if regex:
            return self._match_regex(at, regex, regex_group, "peek")
        if text:
            if self.content.startswith(text, at):
                return text
            return False
        return self.content[at:at + size]

    def peek_one_of(self, options: List[str]):
        """
        Return the longest entry of ``options`` the content continues with at the cursor
        (the first one listed among equally long entries), or False if none matches.
        """
        matching = [text for text in options if text and self.peek(text=text)]
        if not matching:
            return False
        return max(matching, key=len)

    def consume(self, size: int = 1, regex: re.Pattern = None, text: str = None, regex_group: int = 0):
        """
        Like ``peek`` at the cursor itself, but also move the cursor past the returned text.

        * ``regex``: return the text of ``regex_group`` and advance by its length, or
          return None and leave the cursor alone if there is no match
        * ``text``: return ``text`` and advance past it, or return None without moving
        * otherwise: advance by ``size`` and return the characters that were skipped
        """
        at = self.pos

        if regex:
            matched = self._match_regex(at, regex, regex_group, "consume")
            if matched is not None:
                self.pos += len(matched)
            return matched

        if text:
            if self.content.startswith(text, at):
                self.pos += len(text)
                return text
            return None

        self.pos += size
        return self.content[at:at + size]

    def consume_one_of(self, options: List[str]):
        """
        Consume the longest entry of ``options`` found at the cursor and return it.

        Returns False and leaves the cursor untouched when no entry matches.
        """
        choice = self.peek_one_of(options)
        # without this guard consume() would fall back to its size mode and skip a character
        if choice:
            self.consume(text=choice)
        return choice

    def seek_newline(self):
        """
        Consume the whitespace in front of the next line break, if only whitespace
        separates the cursor from it. The line break itself is not consumed.
        """
        return self.consume(regex=REG_WHITESPACE_UNTIL_NEWLINE, regex_group=1)

    def consume_whitespace(self, linebreak=True):
        """
        Consume whitespace at the cursor and return it (possibly the empty string).

        :param linebreak: if False, use REG_WHITESPACE_NO_LINEBREAK (spaces and tabs
                          only) instead of REG_WHITESPACE
        """
        if linebreak:
            return self.consume(regex=REG_WHITESPACE)
        return self.consume(regex=REG_WHITESPACE_NO_LINEBREAK)

    def has_next(self):
        """Return True while the cursor is in front of the end of the content."""
        return self.pos < self.len

    def context(self, size: int = 5):
        """
        returns a context string:
        <local input before pos>|<local input after pos>

        Up to ``size`` characters are shown on either side of the cursor.
        """
        start = max(self.pos - size, 0)
        # slice ends are exclusive, so the limit is the length itself, not length - 1
        end = min(self.pos + size, self.len)

        return self.content[start:self.pos] + '|' + self.content[self.pos:end]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TokenType(IntEnum):
    """
    Integer-valued kind of a RiscVToken: symbol, instruction or pseudo op.

    Both ``str()`` and ``repr()`` of a member give just the member name.
    """
    SYMBOL = 0
    INSTRUCTION = 1
    PSEUDO_OP = 2

    def __str__(self):
        return self.name

    # repr prints the same bare member name as str
    __repr__ = __str__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVToken:
    """
    Common base of the token classes: stores the token type and builds the repr from
    the subclass' ``text()``.
    """

    def __init__(self, t_type: TokenType):
        self.type = t_type

    def __repr__(self):
        # rendered as ClassName[type](text)
        return "{}[{}]({})".format(type(self).__name__, self.type, self.text())

    def text(self):
        """
        Textual form of the token; subclasses override this, the base class only
        returns a placeholder.
        """
        return "unknown"


# NOTE(review): in this view the decorator, class header and fields of Token appear
# interleaved with RiscVToken; they are kept together here as a separate class -- confirm.
@dataclass(frozen=True)
class Token:
    type: TokenType  # kind of the token
    value: str       # text carried by the token
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVInstructionToken(RiscVToken):
    """
    Token for one instruction: its name plus the list of its arguments.
    """

    def __init__(self, name, args):
        """
        :param name: the instruction name
        :param args: sequence of the instruction's arguments
        """
        super().__init__(TokenType.INSTRUCTION)
        self.instruction = name
        self.args = args

    def text(self):
        """
        Return the instruction as ``name`` or ``name arg1, arg2, ...``.

        Every argument is included; nothing is dropped when there are more than three.
        """
        if not self.args:
            return self.instruction
        return "{} {}".format(self.instruction, ", ".join(str(arg) for arg in self.args))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVSymbolToken(RiscVToken):
    """
    Token for a symbol; carries only the symbol's name.
    """

    def __init__(self, name):
        self.name = name
        super().__init__(TokenType.SYMBOL)

    def text(self):
        """The symbol name is the whole textual form of this token."""
        return self.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVPseudoOpToken(RiscVToken):
    """
    Token for a pseudo op: its name and the arguments that followed it.
    """

    def __init__(self, name, args):
        self.name = name
        self.args = args
        super().__init__(TokenType.PSEUDO_OP)

    def text(self):
        """Return the name followed by the default string form of ``args``."""
        return f"{self.name} {self.args}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RiscVTokenizer:
    """
    A tokenizer for the RISC-V syntax of a given CPU

    Reads from a RiscVInput and collects the resulting tokens in ``self.tokens``.
    """

    def __init__(self, input_assembly: RiscVInput, instructions: List[str]):
        """
        :param input_assembly: the input to read from
        :param instructions: the instruction names to recognise
        """
        self.input = input_assembly
        self.tokens: List[RiscVToken] = []
        self.name = input_assembly.name
        self.instructions = instructions

    def tokenize(self):
        """
        Consume the whole input and append one token per pseudo op, symbol and
        instruction to ``self.tokens``; comments are skipped.

        :raises ParseException: if the text at the cursor is none of the above
        """
        while self.input.has_next():
            # remove leading whitespaces, place cursor at text start
            self.input.consume_whitespace()

            # nothing but whitespace was left: that is the end of the input, not an
            # unknown token
            if not self.input.has_next():
                break

            # check if we have a pseudo op
            if self.input.peek_one_of(PSEUDO_OPS):
                self.parse_pseudo_op()

            # check if we have a symbol (like main:)
            elif self.input.peek(regex=REG_VALID_SYMBOL_LABEL):
                self.parse_symbol()

            # comment
            elif self.input.peek() in COMMENT_START:
                self.parse_comment()

            # must be instruction
            elif self.input.peek_one_of(self.instructions):
                self.parse_instruction()
            else:
                token = self.input.peek(size=5)
                raise ParseException("Unknown token around {} at: {}".format(repr(token), repr(self.input.context())))
            self.input.consume_whitespace()

    def parse_pseudo_op(self):
        """
        Consume a pseudo op and the rest of its line, and append a RiscVPseudoOpToken.

        The token name is stored without the leading '.'; the rest of the line is split
        into arguments by ``split_accepting_quotes``.
        """
        name = self.input.consume_one_of(PSEUDO_OPS)
        self.input.consume_whitespace(linebreak=False)

        arg_str = self.input.consume(regex=REG_UNTIL_NEWLINE)
        args = split_accepting_quotes(arg_str) if arg_str else []

        self.tokens.append(RiscVPseudoOpToken(name[1:], args))

    def parse_symbol(self):
        """
        Consume a symbol declaration and append a RiscVSymbolToken named without the
        trailing ':'. Prints a warning when no whitespace follows the declaration.
        """
        name = self.input.consume(regex=REG_VALID_SYMBOL_LABEL)
        self.tokens.append(RiscVSymbolToken(name[:-1]))
        if not self.input.consume_whitespace():
            print("[Tokenizer] symbol declaration should always be followed by whitespace (at {})!".format(
                self.input.context()))

    def parse_instruction(self):
        """
        Consume an instruction name followed by up to three comma separated arguments
        on the same line, and append a RiscVInstructionToken.
        """
        ins = self.input.consume_one_of(self.instructions)
        args = []
        self.input.consume_whitespace(linebreak=False)
        while len(args) < 3:
            arg = self.input.consume(regex=REG_VALID_ARGUMENT)
            if not arg:
                break
            args.append(arg)
            # another argument can only follow after a comma
            if not self.input.consume(text=','):
                break
            self.input.consume_whitespace(linebreak=False)
        self.tokens.append(RiscVInstructionToken(ins, args))

    def parse_comment(self):
        """Skip a comment: consume everything up to (not including) the next line break."""
        # just consume the rest
        self.input.consume(regex=REG_UNTIL_NEWLINE)
|
|
|
|
|
# NOTE(review): this takes ``self`` and reads ``self.type`` / ``self.value``, so it looks
# like the string conversion belonging to Token; it is not attached to a class header in
# this view -- confirm its placement.
def __str__(self):
    """
    Short display form of a token.

    A NEWLINE token is shown as a backslash followed by ``n``, a COMMA token as a comma
    followed by a space, and anything else as the first three letters of the type name
    with the value in parentheses and a trailing space.
    """
    token_type = self.type
    if token_type == TokenType.NEWLINE:
        return '\\n'
    if token_type == TokenType.COMMA:
        return ', '
    return '{}({}) '.format(token_type.name[0:3], self.value)
|
|
|
|
|
|
|
|
|
|
NEWLINE = Token(TokenType.NEWLINE, '\n')
|
|
|
|
|
COMMA = Token(TokenType.COMMA, ',')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tokenize(input: Iterable[str]) -> Iterable[Token]:
    """
    Turn lines of assembly text into a stream of tokens.

    For every line, everything from the first occurrence of a LINE_COMMENT_STARTERS
    entry onwards is dropped and surrounding spaces, tabs and newlines are removed.
    Lines that are empty afterwards are skipped entirely. Every remaining line yields
    the tokens produced by ``parse_line`` for its whitespace separated words, followed
    by ``NEWLINE``.

    :param input: an iterable of lines
    :return: a generator of tokens
    """
    for line in input:
        for line_comment_start in LINE_COMMENT_STARTERS:
            if line_comment_start in line:
                line = line[:line.index(line_comment_start)]
        # str.strip returns a new string; the result has to be kept for the emptiness
        # check below to ever skip a blank or comment-only line
        line = line.strip(' \t\n')
        if not line:
            continue

        parts = [part for part in re.split(WHITESPACE_PATTERN, line) if part]

        yield from parse_line(parts)
        yield NEWLINE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_line(parts: List[str]) -> Iterable[Token]:
    """
    Yield the tokens for one line that has already been split into words.

    The first word becomes a PSEUDO_OP token if it starts with '.', a LABEL token if
    it ends with ':', and an INSTRUCTION_NAME token otherwise. Each following word is
    either a lone ',' (yielded as ``COMMA``) or handed to ``parse_arg``.

    :param parts: the non-empty words of the line; an empty list yields nothing
    """
    if not parts:
        return ()

    head, *operands = parts

    if head[0] == '.':
        kind = TokenType.PSEUDO_OP
    elif head[-1] == ':':
        kind = TokenType.LABEL
    else:
        kind = TokenType.INSTRUCTION_NAME
    yield Token(kind, head)

    for operand in operands:
        if operand == ',':
            yield COMMA
        else:
            yield from parse_arg(operand)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_arg(arg: str) -> Iterable[Token]:
    """
    Yield the tokens for a single argument word.

    A trailing ',' is split off and yielded last as ``COMMA``. If the rest matches
    MEMORY_ADDRESS_PATTERN (``offset(register)``), two ARGUMENT tokens are yielded:
    first the lower-cased register, then the offset. Anything else is yielded
    unchanged as one ARGUMENT token.

    :raises ParseException: if the register of a memory operand is not in RISCV_REGS
    """
    has_trailing_comma = arg[-1] == ','
    if has_trailing_comma:
        arg = arg[:-1]

    memory_operand = MEMORY_ADDRESS_PATTERN.match(arg)
    if memory_operand is None:
        yield Token(TokenType.ARGUMENT, arg)
    else:
        register = memory_operand.group(2).lower()
        if register not in RISCV_REGS:
            raise ParseException(f'"{register}" is not a valid register!')
        # register first, offset second
        yield Token(TokenType.ARGUMENT, register)
        yield Token(TokenType.ARGUMENT, memory_operand.group(1))

    if has_trailing_comma:
        yield COMMA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_tokens(tokens: Iterable[Token]):
    """
    Print every token using its string form, starting a new output line after each
    ``NEWLINE`` token, and flush the output at the end.
    """
    for token in tokens:
        terminator = '\n' if token == NEWLINE else ''
        print(token, end=terminator)
    # prints nothing; only there to flush what was written above
    print("", flush=True, end="")
|
|
|
|
|