riscemu/test/test_tokenizer.py

from unittest import TestCase

from riscemu.tokenizer import (
    tokenize,
    print_tokens,
    Token,
    TokenType,
    NEWLINE,
    COMMA,
    split_whitespace_respecting_quotes,
)


def ins(name: str) -> Token:
    return Token(TokenType.INSTRUCTION_NAME, name)


def arg(name: str) -> Token:
    return Token(TokenType.ARGUMENT, name)


def op(name: str) -> Token:
    return Token(TokenType.PSEUDO_OP, name)


def lbl(name: str) -> Token:
    return Token(TokenType.LABEL, name)


class TestTokenizer(TestCase):
    def test_instructions(self):
        program = ["li     a0, 144", "divi   a0, a0, 12", "xori   a1, a0, 12"]
        tokens = [
            ins("li"),
            arg("a0"),
            COMMA,
            arg("144"),
            NEWLINE,
            ins("divi"),
            arg("a0"),
            COMMA,
            arg("a0"),
            COMMA,
            arg("12"),
            NEWLINE,
            ins("xori"),
            arg("a1"),
            COMMA,
            arg("a0"),
            COMMA,
            arg("12"),
            NEWLINE,
        ]
        self.assertEqual(list(tokenize(program)), tokens)

    def test_comments(self):
        parsed_res = [ins("li"), arg("a0"), COMMA, arg("144"), NEWLINE]
        for c in ("#", "//", ";"):
            lines = [c + " this is a comment", "li a0, 144"]
            self.assertEqual(list(tokenize(lines)), parsed_res)

    def test_pseudo_ins(self):
        parsed_res = [
            Token(TokenType.PSEUDO_OP, ".section"),
            Token(TokenType.ARGUMENT, ".text"),
            NEWLINE,
            Token(TokenType.PSEUDO_OP, ".type"),
            Token(TokenType.ARGUMENT, "init"),
            COMMA,
            Token(TokenType.ARGUMENT, "@function"),
            NEWLINE,
        ]
        input_program = [".section .text", ".type init, @function"]
        self.assertEqual(list(tokenize(input_program)), parsed_res)

    def test_full_program(self):
        program = """
# a hashtag comment

; semicolon comment followed by an empty line
.section .text
// double slash comment
    addi sp, sp, -32
    sw   s0, 0(ra)
section:
    sub  s0, s0, s0
"""
        tokens = [
            op(".section"),
            arg(".text"),
            NEWLINE,
            ins("addi"),
            arg("sp"),
            COMMA,
            arg("sp"),
            COMMA,
            arg("-32"),
            NEWLINE,
            ins("sw"),
            arg("s0"),
            COMMA,
            arg("ra"),
            arg("0"),
            NEWLINE,
            lbl("section:"),
            NEWLINE,
            ins("sub"),
            arg("s0"),
            COMMA,
            arg("s0"),
            COMMA,
            arg("s0"),
            NEWLINE,
        ]

        self.assertEqual(list(tokenize(program.splitlines())), tokens)

    def test_split_whitespace_respecting_quotes_single(self):
        self.assertEqual(list(split_whitespace_respecting_quotes("test")), ["test"])

    def test_split_whitespace_respecting_quotes_empty(self):
        self.assertEqual(list(split_whitespace_respecting_quotes("")), [])

    def test_split_whitespace_respecting_quotes_two_parts(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes("test 123")), ["test", "123"]
        )

    def test_split_whitespace_respecting_quotes_whole_quoted(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes("'test 123'")), ["test 123"]
        )

    def test_split_whitespace_respecting_quotes_double_quotes(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes('"test 123"')), ["test 123"]
        )

    def test_split_whitespace_respecting_quotes_quoted_then_normal(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes('"test 123" abc')),
            ["test 123", "abc"],
        )

    def test_split_whitespace_respecting_quotes_quoted_sorrounded(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes('hello "test 123" abc')),
            ["hello", "test 123", "abc"],
        )

    def test_split_whitespace_respecting_quotes_weird_spaces(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes('hello  "test 123"\tabc')),
            ["hello", "test 123", "abc"],
        )

    def test_split_whitespace_respecting_quotes_quotes_no_spaces(self):
        self.assertEqual(
            list(split_whitespace_respecting_quotes('hello"test 123"abc')),
            ["hello", "test 123", "abc"],
        )
added tests for tokenizer 3 years ago			`from unittest import TestCase`

format black 2 years ago			`from riscemu.tokenizer import (`
			`tokenize,`
			`print_tokens,`
			`Token,`
			`TokenType,`
			`NEWLINE,`
			`COMMA,`
			`split_whitespace_respecting_quotes,`
			`)`
added tests for tokenizer 3 years ago

			`def ins(name: str) -> Token:`
			`return Token(TokenType.INSTRUCTION_NAME, name)`


			`def arg(name: str) -> Token:`
			`return Token(TokenType.ARGUMENT, name)`


			`def op(name: str) -> Token:`
			`return Token(TokenType.PSEUDO_OP, name)`


			`def lbl(name: str) -> Token:`
			`return Token(TokenType.LABEL, name)`


finished basic RISC-V parser 3 years ago			`class TestTokenizer(TestCase):`
added tests for tokenizer 3 years ago			`def test_instructions(self):`
format black 2 years ago			`program = ["li a0, 144", "divi a0, a0, 12", "xori a1, a0, 12"]`
added tests for tokenizer 3 years ago			`tokens = [`
format black 2 years ago			`ins("li"),`
			`arg("a0"),`
			`COMMA,`
			`arg("144"),`
			`NEWLINE,`
			`ins("divi"),`
			`arg("a0"),`
			`COMMA,`
			`arg("a0"),`
			`COMMA,`
			`arg("12"),`
			`NEWLINE,`
			`ins("xori"),`
			`arg("a1"),`
			`COMMA,`
			`arg("a0"),`
			`COMMA,`
			`arg("12"),`
			`NEWLINE,`
added tests for tokenizer 3 years ago			`]`
			`self.assertEqual(list(tokenize(program)), tokens)`

			`def test_comments(self):`
format black 2 years ago			`parsed_res = [ins("li"), arg("a0"), COMMA, arg("144"), NEWLINE]`
			`for c in ("#", "//", ";"):`
			`lines = [c + " this is a comment", "li a0, 144"]`
added tests for tokenizer 3 years ago			`self.assertEqual(list(tokenize(lines)), parsed_res)`

			`def test_pseudo_ins(self):`
			`parsed_res = [`
format black 2 years ago			`Token(TokenType.PSEUDO_OP, ".section"),`
			`Token(TokenType.ARGUMENT, ".text"),`
			`NEWLINE,`
			`Token(TokenType.PSEUDO_OP, ".type"),`
			`Token(TokenType.ARGUMENT, "init"),`
			`COMMA,`
			`Token(TokenType.ARGUMENT, "@function"),`
			`NEWLINE,`
added tests for tokenizer 3 years ago			`]`
format black 2 years ago			`input_program = [".section .text", ".type init, @function"]`
added tests for tokenizer 3 years ago			`self.assertEqual(list(tokenize(input_program)), parsed_res)`

			`def test_full_program(self):`
			`program = """`
			`# a hashtag comment`

			`; semicolon comment followed by an empty line`
			`.section .text`
			`// double slash comment`
			`addi sp, sp, -32`
			`sw s0, 0(ra)`
			`section:`
			`sub s0, s0, s0`
			`"""`
			`tokens = [`
format black 2 years ago			`op(".section"),`
			`arg(".text"),`
			`NEWLINE,`
			`ins("addi"),`
			`arg("sp"),`
			`COMMA,`
			`arg("sp"),`
			`COMMA,`
			`arg("-32"),`
			`NEWLINE,`
			`ins("sw"),`
			`arg("s0"),`
			`COMMA,`
			`arg("ra"),`
			`arg("0"),`
			`NEWLINE,`
			`lbl("section:"),`
			`NEWLINE,`
			`ins("sub"),`
			`arg("s0"),`
			`COMMA,`
			`arg("s0"),`
			`COMMA,`
			`arg("s0"),`
			`NEWLINE,`
added tests for tokenizer 3 years ago			`]`

			`self.assertEqual(list(tokenize(program.splitlines())), tokens)`

finished basic RISC-V parser 3 years ago			`def test_split_whitespace_respecting_quotes_single(self):`
format black 2 years ago			`self.assertEqual(list(split_whitespace_respecting_quotes("test")), ["test"])`
finished basic RISC-V parser 3 years ago
			`def test_split_whitespace_respecting_quotes_empty(self):`
format black 2 years ago			`self.assertEqual(list(split_whitespace_respecting_quotes("")), [])`
finished basic RISC-V parser 3 years ago
			`def test_split_whitespace_respecting_quotes_two_parts(self):`
			`self.assertEqual(`
			`list(split_whitespace_respecting_quotes("test 123")), ["test", "123"]`
			`)`

			`def test_split_whitespace_respecting_quotes_whole_quoted(self):`
			`self.assertEqual(`
			`list(split_whitespace_respecting_quotes("'test 123'")), ["test 123"]`
			`)`

			`def test_split_whitespace_respecting_quotes_double_quotes(self):`
			`self.assertEqual(`
			`list(split_whitespace_respecting_quotes('"test 123"')), ["test 123"]`
			`)`

			`def test_split_whitespace_respecting_quotes_quoted_then_normal(self):`
			`self.assertEqual(`
format black 2 years ago			`list(split_whitespace_respecting_quotes('"test 123" abc')),`
			`["test 123", "abc"],`
finished basic RISC-V parser 3 years ago			`)`

			`def test_split_whitespace_respecting_quotes_quoted_sorrounded(self):`
			`self.assertEqual(`
format black 2 years ago			`list(split_whitespace_respecting_quotes('hello "test 123" abc')),`
			`["hello", "test 123", "abc"],`
finished basic RISC-V parser 3 years ago			`)`

			`def test_split_whitespace_respecting_quotes_weird_spaces(self):`
			`self.assertEqual(`
format black 2 years ago			`list(split_whitespace_respecting_quotes('hello "test 123"\tabc')),`
			`["hello", "test 123", "abc"],`
finished basic RISC-V parser 3 years ago			`)`

			`def test_split_whitespace_respecting_quotes_quotes_no_spaces(self):`
			`self.assertEqual(`
format black 2 years ago			`list(split_whitespace_respecting_quotes('hello"test 123"abc')),`
			`["hello", "test 123", "abc"],`
finished basic RISC-V parser 3 years ago			`)`