from typing import Iterable, Tuple

from .errors import EndOfInputError, ParseError
from .defs import Span, LexingContext, Token, TokenType, digits, keywords, operators, integer_type_suffixes, parens, identifier_terminating_chars

class Lexer:
    separators = ':-+*/#!"\'=?%&<>[]{}()\n \t'
    parens = '[]{}()<>'

    context: LexingContext
    content: str
    pos: int
    line: int
    fname: str
    size: int

    def __init__(self, fname: str, context: LexingContext):
        self.content = context.sources[fname]
        self.fname = fname
        self.pos = self.line = 0
        self.word = ""
        self.size = len(self.content)
        self.context = context

    def peek(self, offset: int = 0):
        if self.pos + offset >= self.size:
            return None
        return self.content[self.pos + offset]

    def startswith(self, *patterns: str, offset: int = 0):
        # match longest first
        for pattern in sorted(patterns, key=len, reverse=True):
            if self.content.startswith(pattern, self.pos + offset):
                return pattern
        return False
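    # Illustrative note: because candidates are tried longest-first, a call
    # such as startswith('<', '<=') on input starting with "<=" would return
    # '<=' rather than '<' (assuming both strings appear in the operator set;
    # the actual contents of `operators` live in .defs).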
    def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]:
        start = self.pos
        pos = self.pos
        while not self.content.startswith(pattern, pos) and pos < self.size:
            pos += 1
        if pos == self.size:
            raise EndOfInputError(Span(start, pos, self.fname, self.context), pattern)
        if inclusive:
            pos += len(pattern)
        self.pos = pos
        return self.content[start:pos], Span(start, pos, self.fname, self.context)
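    # Example of how read_until is used below: with the cursor just past a
    # '/*', read_until('*/') returns everything up to and including the
    # closing '*/', while read_until('\n', inclusive=False) stops in front of
    # the newline so the caller can still emit it as an EOL token.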
    def do_parse(self) -> Iterable[Token]:
        while True:
            c = self.peek()
            # reached end of input
            if c is None:
                yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI)
                break
            if c in '\n\r':
                start = self.pos
                if self.startswith('\r\n'):
                    self.pos += 1
                self.pos += 1
                yield self.Token(start, TokenType.EOL)
                continue
            # check for integer literals
            if self.startswith('0x', '0X', '0b', *'0123456789'):
                yield self.parse_integer()
                continue
            # check for parenthesis
            if c in parens:
                # left parens at position 0, 2, 4, 6
                left_paren = parens.index(c) % 2 == 0
                self.pos += 1
                yield self.Token(self.pos - 1, TokenType.LBracket if left_paren else TokenType.RBracket)
                continue
            if self.startswith('//'):
                start = self.pos
                self.read_until('\n', inclusive=False)  # read until newline, but don't consume it
                yield self.Token(start, TokenType.LineComment)
                continue
            if self.startswith('/*'):
                start = self.pos
                self.read_until('*/')
                yield self.Token(start, TokenType.MultiComment)
                continue
            starts_with_keyword = self.startswith(*keywords)
            if starts_with_keyword:
                start = self.pos
                self.pos += len(starts_with_keyword)
                yield self.Token(start, TokenType.Keyword)
                self.consume_expected_whitespace()
                continue
            starts_with_operator = self.startswith(*operators)
            if starts_with_operator:
                start = self.pos
                self.pos += len(starts_with_operator)
                yield self.Token(start, TokenType.Operator)
                continue
            if c in '"\'':
                yield self.parse_string()
                continue
            if c in ' \t':
                self.pos += 1
                continue
            # must be an identifier then
            start = self.pos
            while (ch := self.peek()) is not None and ch not in identifier_terminating_chars:
                self.pos += 1
            if start == self.pos:
                raise ParseError("Expected identifier!", Span(start, start + 1, self.fname, self.context))
            yield self.Token(start, TokenType.Identifier)
            continue
    def consume_expected_whitespace(self):
        c = self.peek()
        if c is None or c in '\r\n':
            return
        if c not in '\t ':
            raise ParseError("Expected whitespace here", Span(self.pos, self.pos + 1, self.fname, self.context))
        while (c := self.peek()) is not None and c in '\t ':
            self.pos += 1
    def parse_integer(self):
        start = self.pos
        if self.startswith('-'):
            self.pos += 1
        parse_type = 'dec'
        if self.startswith('0x', '0X'):
            parse_type = 'hex'
            self.pos += 2
        elif self.startswith('0b'):
            parse_type = 'bin'
            self.pos += 2
        while (c := self.peek()) is not None and c in digits[parse_type]:
            self.pos += 1
        suffix = self.startswith(*integer_type_suffixes)
        if suffix:
            self.pos += len(suffix)
        return self.Token(start, TokenType.Integer)
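    # Illustrative examples of literals this accepts: "42", "0xFF"/"0XFF" and
    # "0b1010", each optionally followed by one of the suffixes listed in
    # integer_type_suffixes (the concrete suffix set lives in .defs and is
    # not shown in this file).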
    def parse_string(self):
        start = self.pos
        terminator = self.peek()
        escaped = False
        self.pos += 1
        string = ""
        # keep reading while we are inside an escape sequence or have not yet
        # reached the closing quote
        while escaped or self.peek() != terminator:
            char = self.peek()
            if char is None:
                raise EndOfInputError(Span(start, self.pos, self.fname, self.context), terminator)
            if escaped:
                match char:
                    case 'r':
                        string += '\r'
                    case 'b':
                        string += '\b'
                    case 'n':
                        string += '\n'
                    case 't':
                        string += '\t'
                    case 'e':  # support terminal escape codes
                        string += '\033'
                    case other:
                        # unknown escape: keep the backslash and the character
                        string += '\\' + other
                escaped = False
            elif char == '\\':
                escaped = True
            else:
                string += char
            self.pos += 1
        # consume trailing terminator
        self.pos += 1
        return self.Token(start, TokenType.String, content=string)
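    # Illustrative behaviour: parsing "a\tb" (with a literal backslash-t in
    # the source text) yields a String token whose content holds a real tab
    # character, while an unrecognised escape such as \q is kept verbatim as
    # the two characters '\' and 'q'.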
    def Token(self, start: int, type: TokenType, end=None, content=None) -> Token:
        if end is None:
            end = self.pos
        if content is None:
            content = self.content[start:end]
        return Token(Span(start, end, self.fname, self.context), content, type)
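

# Usage sketch (illustrative, not part of the original module): the lexer is
# driven as a generator. This assumes only what __init__ relies on, namely
# that a LexingContext has been built elsewhere whose `sources` dict maps the
# file name to its source text; how LexingContext is constructed is defined
# in .defs and not shown here.
#
#     lexer = Lexer("example.src", context)
#     for token in lexer.do_parse():
#         print(token)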