You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
202 lines
6.6 KiB
Python
202 lines
6.6 KiB
Python
from ast import operator
|
|
from dataclasses import dataclass
|
|
from sre_parse import parse_template
|
|
from typing import Dict, List, Iterable, Generator, Tuple
|
|
from enum import Enum, auto
|
|
from unicodedata import digit
|
|
from .errors import EndOfInputError, ParseError
|
|
from .defs import Span, LexingContext, Token, TokenType, digits, keywords, operators, integer_type_suffixes, parens, identifier_terminating_chars
|
|
|
|
|
|
class Lexer:
    """Turn the text of one source file into a stream of ``Token`` objects.

    The lexer keeps a cursor (``pos``) into ``content`` and advances it as
    tokens are recognised.  ``do_parse`` is the entry point; the remaining
    methods are the helpers it relies on.
    """

    separators = ':-+*/#!"\'=?%&<>[]{}()\n \t'
    parens = '[]{}()<>'

    # Mapping from the character following a backslash to the character it
    # stands for inside a string literal.
    _ESCAPES = {
        'r': '\r',
        'b': '\b',
        'n': '\n',
        't': '\t',
        'e': '\033',  # support terminal escape codes
    }

    context: LexingContext
    content: str
    pos: int
    # NOTE: `line` is initialised to 0 but no method of this class advances it.
    line: int
    fname: str
    size: int

    def __init__(self, fname: str, context: LexingContext):
        """Prepare to lex the source registered under *fname* in *context*.

        Args:
            fname: key into ``context.sources`` identifying the text to lex.
            context: lexing context holding the sources; also attached to
                every ``Span`` this lexer creates.
        """
        self.content = context.sources[fname]
        self.fname = fname
        self.pos = self.line = 0
        self.word = ""
        self.size = len(self.content)
        self.context = context

    def peek(self, offset: int = 0):
        """Return the character *offset* places after the cursor.

        Returns ``None`` when that position lies outside the input, which is
        how callers detect end of input.  The cursor is not moved.
        """
        index = self.pos + offset
        # Guard the lower bound as well so that a negative index never wraps
        # around to the end of the string.
        if index < 0 or index >= self.size:
            return None
        return self.content[index]

    def startswith(self, *patterns: str, offset: int = 0):
        """Check whether the input at the cursor starts with any pattern.

        Args:
            *patterns: candidate prefixes.
            offset: distance from the cursor at which to test.

        Returns:
            The longest matching pattern, or ``False`` if none matches.
        """
        at = self.pos + offset
        # match longest first
        for pattern in sorted(patterns, key=len, reverse=True):
            if self.content.startswith(pattern, at):
                return pattern
        return False

    def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]:
        """Advance the cursor to the next occurrence of *pattern*.

        Args:
            pattern: text to search for, starting at the cursor.
            inclusive: when true the pattern itself is consumed as well;
                otherwise the cursor stops right in front of it.

        Returns:
            The consumed text together with the ``Span`` it covers.

        Raises:
            EndOfInputError: if *pattern* does not occur before the end of
                the input.  The cursor is left untouched in that case.
        """
        start = self.pos
        # str.find scans in a single pass; re-slicing the remaining input for
        # every candidate position would be quadratic.
        found = self.content.find(pattern, start)
        if found == -1:
            raise EndOfInputError(Span(start, self.size, self.fname, self.context), pattern)

        end = found + len(pattern) if inclusive else found
        self.pos = end

        return self.content[start:end], Span(start, end, self.fname, self.context)

    def _at_word_end(self, offset: int) -> bool:
        """Tell whether an identifier could not continue *offset* chars ahead.

        True at end of input or when the character there is one of
        ``identifier_terminating_chars``.
        """
        following = self.peek(offset)
        return following is None or following in identifier_terminating_chars

    def do_parse(self) -> Iterable[Token]:
        """Lex the whole input, yielding tokens in source order.

        Line breaks are reported as ``EOL`` tokens, blanks and tabs are
        skipped, and the final token is always an ``EOI`` token with empty
        content.

        Raises:
            ParseError: on a character no rule accepts, or when a keyword is
                followed by something other than whitespace or a line break.
            EndOfInputError: on an unterminated block comment or string.
        """
        while True:
            c = self.peek()
            # reached end of input
            if c is None:
                yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI)
                break

            if c in '\n\r':
                start = self.pos
                # "\r\n" counts as a single line break
                self.pos += 2 if self.startswith('\r\n') else 1
                yield self.Token(start, TokenType.EOL)
                continue

            # check for integer literals; the radix prefixes handled by
            # parse_integer all begin with the digit 0, so a digit test
            # covers them too
            if c in '0123456789':
                yield self.parse_integer()
                continue

            # check for parenthesis
            # NOTE: inside a method the bare name `parens` is the module-level
            # object imported from .defs, not the class attribute above.
            if c in parens:
                # presumably opening brackets sit at even indices (0, 2, 4, 6)
                # of the imported `parens` -- verify against .defs
                left_paren = parens.index(c) % 2 == 0
                self.pos += 1
                yield self.Token(self.pos - 1, TokenType.LBracket if left_paren else TokenType.RBracket)
                continue

            if self.startswith('//'):
                start = self.pos
                # The comment runs up to, but not including, the newline.  A
                # comment on the last line may lack a trailing newline; it
                # then simply extends to the end of the input.
                newline = self.content.find('\n', start)
                self.pos = self.size if newline == -1 else newline
                yield self.Token(start, TokenType.LineComment)
                continue

            if self.startswith('/*'):
                start = self.pos
                self.read_until('*/')
                yield self.Token(start, TokenType.MultiComment)
                continue

            # A keyword only counts when the word ends right after it;
            # otherwise it is merely the prefix of a longer identifier and
            # falls through to the identifier rule below.
            starts_with_keyword = self.startswith(*keywords)
            if starts_with_keyword and self._at_word_end(len(starts_with_keyword)):
                start = self.pos
                self.pos += len(starts_with_keyword)
                yield self.Token(start, TokenType.Keyword)
                self.consume_expected_whitespace()
                continue

            starts_with_operator = self.startswith(*operators)
            if starts_with_operator:
                start = self.pos
                self.pos += len(starts_with_operator)
                yield self.Token(start, TokenType.Operator)
                continue

            if c in '"\'':
                yield self.parse_string()
                continue

            if c in ' \t':
                self.pos += 1
                continue

            # must be an identifier then
            start = self.pos
            # stop at end of input as well as at a terminating character
            while not self._at_word_end(0):
                self.pos += 1
            if start == self.pos:
                raise ParseError("Expected identifier!", Span(start, start + 1, self.fname, self.context))
            yield self.Token(start, TokenType.Identifier)

    def consume_expected_whitespace(self):
        """Skip the blanks and tabs that must follow a keyword.

        A line break or the end of the input directly after the keyword is
        accepted as well and left unconsumed.

        Raises:
            ParseError: if the next character is neither whitespace nor a
                line break.
        """
        upcoming = self.peek()
        if upcoming is None or upcoming in '\r\n':
            return
        if upcoming not in '\t ':
            raise ParseError("Expected whitespace here", Span(self.pos, self.pos + 1, self.fname, self.context))
        # The loop must also stop at end of input, where peek() is None.
        while (upcoming := self.peek()) is not None and upcoming in '\t ':
            self.pos += 1

    def parse_integer(self):
        """Consume an integer literal at the cursor and return its token.

        Accepts an optional leading ``-``, an optional ``0x``/``0X`` or ``0b``
        radix prefix, the digits valid for that radix (looked up in
        ``digits``) and an optional suffix from ``integer_type_suffixes``.
        """
        start = self.pos

        if self.startswith('-'):
            self.pos += 1

        parse_type = 'dec'
        if self.startswith('0x', '0X'):
            parse_type = 'hex'
            self.pos += 2
        elif self.startswith('0b'):
            parse_type = 'bin'
            self.pos += 2

        allowed = digits[parse_type]
        # A literal may be the very last thing in the input, so stop at end
        # of input as well as at the first non-digit.
        while (char := self.peek()) is not None and char in allowed:
            self.pos += 1

        suffix = self.startswith(*integer_type_suffixes)
        if suffix:
            self.pos += len(suffix)

        return self.Token(start, TokenType.Integer)

    def parse_string(self):
        """Consume a quoted string literal and return its token.

        The character at the cursor is the opening quote and also the
        terminator.  Backslash escapes ``\\r \\b \\n \\t \\e`` are decoded;
        any other escaped character is kept verbatim together with its
        backslash and never ends the string.  The token's content is the
        decoded text without the surrounding quotes, while its span covers
        the whole literal including both quotes.

        Raises:
            EndOfInputError: if the input ends before the closing quote.
        """
        start = self.pos
        terminator = self.peek()
        self.pos += 1

        escaped = False
        pieces = []
        while True:
            char = self.peek()
            if char is None:
                raise EndOfInputError(Span(start, self.pos, self.fname, self.context), terminator)
            if escaped:
                # Checked before the terminator test so that an escaped quote
                # does not close the literal.
                pieces.append(self._ESCAPES.get(char, '\\' + char))
                escaped = False
            elif char == terminator:
                break
            elif char == '\\':
                escaped = True
            else:
                pieces.append(char)
            self.pos += 1

        # consume trailing terminator
        self.pos += 1

        return self.Token(start, TokenType.String, content="".join(pieces))

    def Token(self, start: int, type: TokenType, end=None, content=None) -> Token:
        """Build a ``Token`` for the input range ``start``..``end``.

        Args:
            start: index of the first character of the token.
            type: the token's type.
            end: index one past the last character; defaults to the cursor.
            content: token text; defaults to ``self.content[start:end]``.
        """
        if end is None:
            end = self.pos
        if content is None:
            content = self.content[start:end]
        return Token(Span(start, end, self.fname, self.context), content, type)
|
|
|