riscemu/riscemu/parser.py

"""
RiscEmu (c) 2021 Anton Lydike

SPDX-License-Identifier: MIT
"""
import re
from typing import Dict, Tuple, Iterable, Callable, List

from .assembler import MemorySectionType, ParseContext, AssemblerDirectives
from .colors import FMT_PARSE
from .helpers import Peekable
from .tokenizer import Token, TokenType, tokenize
from .types import Program, T_ParserOpts, ProgramLoader, SimpleInstruction
from .types.exceptions import ParseException


def parse_instruction(token: Token, args: Tuple[str], context: ParseContext):
    """
    Append the instruction described by token and args to the current section.

    If no section has been opened yet, a ".text" section of type
    MemorySectionType.Instructions is created first.
    :param token: the token holding the instruction name in its value
    :param args: the argument values belonging to the instruction
    :param context: the parse context the instruction is added to
    :raises ParseException: if the current section does not hold instructions
    """
    # implicitly start a text section when instructions appear before any section
    if context.section is None:
        context.new_section(".text", MemorySectionType.Instructions)

    if context.section.type == MemorySectionType.Instructions:
        instruction = SimpleInstruction(
            token.value, args, context.context, context.current_address()
        )
        context.section.data.append(instruction)
        return

    message = "{} {} encountered in invalid context: {}".format(token, args, context)
    raise ParseException(message)


def parse_label(token: Token, args: Tuple[str], context: ParseContext):
    """
    Register the label held by token at the current address.

    The label name is the token value without its last character. Names made
    up only of digits are recorded in the numbered labels of the context, all
    other names are added via context.add_label. A warning is printed if such
    a name is already present in the context's labels.
    :param token: the label token
    :param args: unused, present to match the common parser signature
    :param context: the parse context the label is added to
    """
    label = token.value[:-1]
    is_numbered = re.match(r"^\d+$", label) is not None

    if is_numbered:
        # relative label: several definitions may exist, so addresses are collected
        context.context.numbered_labels[label].append(context.current_address())
        return

    if label in context.context.labels:
        print(FMT_PARSE + "Warn: Symbol {} defined twice!".format(label))
    context.add_label(label, context.current_address(), is_relative=True)


# Dispatch table: maps the type of the token that starts a statement to the
# callable handling it. Every handler is called with (token, args, context);
# parse_tokens rejects token types that have no entry here.
PARSERS: Dict[TokenType, Callable[[Token, Tuple[str], ParseContext], None]] = {
    TokenType.PSEUDO_OP: AssemblerDirectives.handle_instruction,
    TokenType.LABEL: parse_label,
    TokenType.INSTRUCTION_NAME: parse_instruction,
}


def parse_tokens(name: str, tokens_iter: Iterable[Token]) -> Program:
    """
    Build a program from a stream of tokens.

    The tokens are grouped into (leading token, arguments) statements, and each
    statement is passed to the handler registered in PARSERS for the type of
    its leading token.
    :param name: the name of the program
    :param tokens_iter: the tokenized content of the program
    :return: the result of finalizing the parse context
    :raises ParseException: if no handler is registered for a token type
    """
    context = ParseContext(name)
    stream = Peekable[Token](tokens_iter)

    for token, args in composite_tokenizer(stream):
        handler = PARSERS.get(token.type)
        if handler is None:
            raise ParseException("Unexpected token type: {}, {}".format(token, args))
        handler(token, args, context)

    return context.finalize()


def composite_tokenizer(
    tokens_iter: Iterable[Token],
) -> Iterable[Tuple[Token, Tuple[str]]]:
    """
    Group a flat token stream into statements of the form (token, arguments).

    The token of each statement is a pseudo op, a label or an instruction name.
    The arguments are the argument values collected by take_arguments directly
    after that token. Tokens of any other type that appear where a statement is
    expected are skipped.
    :param tokens_iter: An iterator over tokens
    :return: An iterator over (token, tuple of argument values) pairs
    """
    stream: Peekable[Token] = Peekable[Token](tokens_iter)
    statement_starts = (
        TokenType.PSEUDO_OP,
        TokenType.LABEL,
        TokenType.INSTRUCTION_NAME,
    )

    while True:
        if stream.is_empty():
            return
        head = next(stream)
        if head.type not in statement_starts:
            continue
        yield head, tuple(take_arguments(stream))


def take_arguments(tokens: Peekable[Token]) -> Iterable[str]:
    """
    Consume the arguments of one statement and yield the value of each argument.

    Argument and comma tokens are consumed until a newline, a token of any other
    type, or the end of the token stream is reached. Commas are skipped and are
    not required between arguments. A terminating newline is consumed; any other
    terminating token is left in the stream. Malformed input does not raise.
    :param tokens: A Peekable iterator over some Tokens
    :return: An iterator over the values of the consumed argument tokens
    """
    # Test for exhaustion before peeking, so that a stream ending without a
    # trailing newline simply ends the argument list instead of failing.
    while not tokens.is_empty():
        token_type = tokens.peek().type
        if token_type == TokenType.ARGUMENT:
            yield next(tokens).value
        elif token_type == TokenType.COMMA:
            next(tokens)
        else:
            if token_type == TokenType.NEWLINE:
                # the newline terminates this statement, so it is consumed here
                next(tokens)
            break


class AssemblyFileLoader(ProgramLoader):
    """
    This class loads assembly files written by hand. It understands some assembler directives and supports most
    pseudo instructions. It does very little verification of source correctness.

    It also supports numbered jump targets and properly supports local and global scope (.globl assembly directive)


    The AssemblyFileLoader loads .asm, .S and .s files by default, and acts as a weak fallback to all other filetypes.
    """

    def parse(self) -> Program:
        """
        Open the file at self.source_path, tokenize it and parse the tokens.

        :return: the parsed program, named self.filename
        """
        with open(self.source_path, "r") as f:
            return parse_tokens(self.filename, tokenize(f))

    def parse_io(self, io):
        """
        Tokenize and parse an already available source instead of opening a file.

        :param io: the source handed to tokenize (presumably an open text
                   stream, as in parse -- confirm against tokenize)
        :return: the parsed program, named self.filename
        """
        return parse_tokens(self.filename, tokenize(io))

    @classmethod
    def can_parse(cls, source_path: str) -> float:
        """
        Rate how well this loader is suited for the given file.

        It also acts as a weak fallback if no other loaders want to take the file.

        :param source_path: the path to the source file
        :return: 1 if the text after the last "." in the path is "asm", "S" or "s", otherwise 0.01
        """
        # gcc recognizes these file extensions as assembly. So we will do too.
        if source_path.split(".")[-1] in ("asm", "S", "s"):
            return 1
        return 0.01

    @classmethod
    def get_options(cls, argv: List[str]) -> Tuple[List[str], T_ParserOpts]:
        """
        Extract loader specific options from the command line arguments.

        This loader takes no options: the arguments are returned unchanged,
        together with an empty options dict.
        :param argv: the command line arguments
        :return: a tuple of (argv, empty options dict)
        """
        return argv, {}
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`"""`
			`RiscEmu (c) 2021 Anton Lydike`

			`SPDX-License-Identifier: MIT`
			`"""`
			`import re`
started with base type overhaul 3 years ago			`from typing import Dict, Tuple, Iterable, Callable, List`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
			`from .assembler import MemorySectionType, ParseContext, AssemblerDirectives`
			`from .colors import FMT_PARSE`
[restructured] moved all simple type definitions into riscemu.types 3 years ago			`from .helpers import Peekable`
finished basic RISC-V parser 3 years ago			`from .tokenizer import Token, TokenType, tokenize`
[restructured] moved all simple type definitions into riscemu.types 3 years ago			`from .types import Program, T_ParserOpts, ProgramLoader, SimpleInstruction`
[restructured] moved more types and exceptions to riscemu.types 3 years ago			`from .types.exceptions import ParseException`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago

			`def parse_instruction(token: Token, args: Tuple[str], context: ParseContext):`
Parser: fixes #13 - implicit start of text section when parsing assembly When an assembly file starts with instructions without explicitly declaring any section beforehand, a .text section will be created implicitly. 3 years ago			`if context.section is None:`
format black 2 years ago			`context.new_section(".text", MemorySectionType.Instructions)`
Parser: fixes #13 - implicit start of text section when parsing assembly When an assembly file starts with instructions without explicitly declaring any section beforehand, a .text section will be created implicitly. 3 years ago			`if context.section.type != MemorySectionType.Instructions:`
format black 2 years ago			`raise ParseException(`
			`"{} {} encountered in invalid context: {}".format(token, args, context)`
			`)`
			`ins = SimpleInstruction(`
			`token.value, args, context.context, context.current_address()`
			`)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`context.section.data.append(ins)`


			`def parse_label(token: Token, args: Tuple[str], context: ParseContext):`
			`name = token.value[:-1]`
format black 2 years ago			`if re.match(r"^\d+$", name):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`# relative label:`
Parser: fixed error when labels where used outside of sections 3 years ago			`context.context.numbered_labels[name].append(context.current_address())`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`else:`
			`if name in context.context.labels:`
format black 2 years ago			`print(FMT_PARSE + "Warn: Symbol {} defined twice!".format(name))`
Parser: fixed error when labels where used outside of sections 3 years ago			`context.add_label(name, context.current_address(), is_relative=True)`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago

			`PARSERS: Dict[TokenType, Callable[[Token, Tuple[str], ParseContext], None]] = {`
			`TokenType.PSEUDO_OP: AssemblerDirectives.handle_instruction,`
			`TokenType.LABEL: parse_label,`
format black 2 years ago			`TokenType.INSTRUCTION_NAME: parse_instruction,`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`}`


			`def parse_tokens(name: str, tokens_iter: Iterable[Token]) -> Program:`
finished basic RISC-V parser 3 years ago			`"""`
			`Convert a token stream into a parsed program`
			`:param name: the programs name`
			`:param tokens_iter: the programs content, tokenized`
			`:return: a parsed program`
			`"""`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`context = ParseContext(name)`

			`for token, args in composite_tokenizer(Peekable[Token](tokens_iter)):`
			`if token.type not in PARSERS:`
			`raise ParseException("Unexpected token type: {}, {}".format(token, args))`
			`PARSERS[token.type](token, args, context)`

			`return context.finalize()`


format black 2 years ago			`def composite_tokenizer(`
			`tokens_iter: Iterable[Token],`
			`) -> Iterable[Tuple[Token, Tuple[str]]]:`
finished basic RISC-V parser 3 years ago			`"""`
			`Convert an iterator over tokens into an iterator over tuples: (token, list(token))`

			`The first token ist either a pseudo_op, label, or instruction name. The token list are all remaining tokens before`
			`a newline is encountered`
			`:param tokens_iter: An iterator over tokens`
			`:return: An iterator over a slightly more structured representation of the tokens`
			`"""`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`tokens: Peekable[Token] = Peekable[Token](tokens_iter)`

			`while not tokens.is_empty():`
			`token = next(tokens)`
format black 2 years ago			`if token.type in (`
			`TokenType.PSEUDO_OP,`
			`TokenType.LABEL,`
			`TokenType.INSTRUCTION_NAME,`
			`):`
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago			`yield token, tuple(take_arguments(tokens))`


			`def take_arguments(tokens: Peekable[Token]) -> Iterable[str]:`
			`"""`
			`Consumes (argument comma)* and yields argument.value until newline is reached`
			`If an argument is not followed by either a newline or a comma, a parse exception is raised`
			`The newline at the end is consumed`
			`:param tokens: A Peekable iterator over some Tokens`
			`"""`
			`while True:`
			`if tokens.peek().type == TokenType.ARGUMENT:`
			`yield next(tokens).value`
			`elif tokens.peek().type == TokenType.NEWLINE:`
			`next(tokens)`
			`break`
user mode emulator finally working again 3 years ago			`elif tokens.peek().type == TokenType.COMMA:`
			`next(tokens)`
			`else:`
			`break`

started with base type overhaul 3 years ago			`# raise ParseException("Expected newline, instead got {}".format(tokens.peek()))`
finished basic RISC-V parser 3 years ago
[wip] almost done with the rework of the parser and internal data structure representation of programs 3 years ago
started with base type overhaul 3 years ago			`class AssemblyFileLoader(ProgramLoader):`
			`"""`
			`This class loads assembly files written by hand. It understands some assembler directives and supports most`
			`pseudo instructions. It does very little verification of source correctness.`

			`It also supports numbered jump targets and properly supports local and global scope (.globl assembly directive)`


			`The AssemblyFileLoader loads .asm, .S and .s files by default, and acts as a weak fallback to all other filetypes.`
			`"""`
format black 2 years ago
started with base type overhaul 3 years ago			`def parse(self) -> Program:`
format black 2 years ago			`with open(self.source_path, "r") as f:`
started with base type overhaul 3 years ago			`return parse_tokens(self.filename, tokenize(f))`

add an unlimited register mode 2 years ago			`def parse_io(self, io):`
			`return parse_tokens(self.filename, tokenize(io))`

started with base type overhaul 3 years ago			`@classmethod`
			`def can_parse(cls, source_path: str) -> float:`
			`"""`

			`It also acts as a weak fallback if no other loaders want to take the file.`

			`:param source_path: the path to the source file`
			`:return:`
			`"""`
			`# gcc recognizes these line endings as assembly. So we will do too.`
format black 2 years ago			`if source_path.split(".")[-1] in ("asm", "S", "s"):`
started with base type overhaul 3 years ago			`return 1`
			`return 0.01`

			`@classmethod`
fixed type annotations type in parsers 3 years ago			`def get_options(cls, argv: List[str]) -> [List[str], T_ParserOpts]:`
started with base type overhaul 3 years ago			`return argv, {}`