from typing import Iterable, Tuple

from .errors import EndOfInputError, ParseError
from .defs import (
    Span,
    LexingContext,
    Token,
    TokenType,
    digits,
    keywords,
    operators,
    integer_type_suffixes,
    parens,
    identifier_terminating_chars,
)


class Lexer:
    separators = ':-+*/#!"\'=?%&<>[]{}()\n \t'
    parens = '[]{}()<>'

    context: LexingContext
    content: str
    pos: int
    line: int
    fname: str
    size: int

    def __init__(self, fname: str, context: LexingContext):
        self.content = context.sources[fname]
        self.fname = fname
        self.pos = self.line = 0
        self.word = ""
        self.size = len(self.content)
        self.context = context

    def peek(self, offset: int = 0):
        """Return the character at the current position (+ offset), or None at end of input."""
        if self.pos + offset >= self.size:
            return None
        return self.content[self.pos + offset]

    def startswith(self, *patterns: str, offset: int = 0):
        """Return the longest pattern that matches at the current position, or False."""
        # match longest first
        for pattern in sorted(patterns, key=len, reverse=True):
            if self.content.startswith(pattern, self.pos + offset):
                return pattern
        return False

    def read_until(self, pattern: str, inclusive=True) -> Tuple[str, Span]:
        """Advance until `pattern` is found and return the consumed text and its span."""
        start = self.pos
        pos = self.pos
        while pos < self.size and not self.content.startswith(pattern, pos):
            pos += 1
        if pos == self.size:
            raise EndOfInputError(Span(start, pos, self.fname, self.context), pattern)
        if inclusive:
            pos += len(pattern)
        self.pos = pos
        return self.content[start:pos], Span(start, pos, self.fname, self.context)

    def do_parse(self) -> Iterable[Token]:
        while True:
            c = self.peek()

            # reached end of input
            if c is None:
                yield Token(Span(self.pos, self.pos, self.fname, self.context), "", TokenType.EOI)
                break

            # newlines: treat '\r\n' as a single EOL token
            if c in '\n\r':
                start = self.pos
                if self.startswith('\r\n'):
                    self.pos += 1
                self.pos += 1
                yield self.Token(start, TokenType.EOL)
                continue

            # check for integer literals
            if self.startswith('0x', '0X', '0b', *'0123456789'):
                yield self.parse_integer()
                continue

            # check for parenthesis
            if c in parens:
                # left parens at positions 0, 2, 4, 6
                left_paren = parens.index(c) % 2 == 0
                self.pos += 1
                yield self.Token(self.pos - 1, TokenType.LBracket if left_paren else TokenType.RBracket)
                continue

            # line comments
            if self.startswith('//'):
                start = self.pos
                # read until newline, but don't consume the newline itself
                self.read_until('\n', inclusive=False)
                yield self.Token(start, TokenType.LineComment)
                continue

            # multi-line comments
            if self.startswith('/*'):
                start = self.pos
                self.read_until('*/')
                yield self.Token(start, TokenType.MultiComment)
                continue

            # keywords must be followed by whitespace or a newline
            starts_with_keyword = self.startswith(*keywords)
            if starts_with_keyword:
                start = self.pos
                self.pos += len(starts_with_keyword)
                yield self.Token(start, TokenType.Keyword)
                self.consume_expected_whitespace()
                continue

            starts_with_operator = self.startswith(*operators)
            if starts_with_operator:
                start = self.pos
                self.pos += len(starts_with_operator)
                yield self.Token(start, TokenType.Operator)
                continue

            # string literals
            if c in '"\'':
                yield self.parse_string()
                continue

            # skip plain whitespace
            if c in ' \t':
                self.pos += 1
                continue

            # must be an identifier then
            start = self.pos
            while self.peek() is not None and self.peek() not in identifier_terminating_chars:
                self.pos += 1
            if start == self.pos:
                raise ParseError("Expected identifier!", Span(start, start + 1, self.fname, self.context))
            yield self.Token(start, TokenType.Identifier)

    def consume_expected_whitespace(self):
        # a newline (or end of input) also counts as valid separation
        if self.peek() is None or self.peek() in '\r\n':
            return
        if self.peek() not in '\t ':
            raise ParseError("Expected whitespace here", Span(self.pos, self.pos + 1, self.fname, self.context))
        while self.peek() is not None and self.peek() in '\t ':
            self.pos += 1

    def parse_integer(self):
        start = self.pos
        if self.startswith('-'):
            self.pos += 1
        # detect the base from the literal prefix
        parse_type = 'dec'
        if self.startswith('0x', '0X'):
            parse_type = 'hex'
            self.pos += 2
        elif self.startswith('0b'):
            parse_type = 'bin'
            self.pos += 2
        # consume all digits valid for that base
        while self.peek() is not None and self.peek() in digits[parse_type]:
            self.pos += 1
        # optional integer type suffix
        suffix = self.startswith(*integer_type_suffixes)
        if suffix:
            self.pos += len(suffix)
        return self.Token(start, TokenType.Integer)

    def parse_string(self):
        start = self.pos
        terminator = self.peek()
        escaped = False
        self.pos += 1
        string = ""
        # read until the closing quote, honouring backslash escapes
        while escaped or self.peek() != terminator:
            char = self.peek()
            if char is None:
                # the string was never closed
                raise EndOfInputError(Span(start, self.pos, self.fname, self.context), terminator)
            if escaped:
                match char:
                    case 'r':
                        string += '\r'
                    case 'b':
                        string += '\b'
                    case 'n':
                        string += '\n'
                    case 't':
                        string += '\t'
                    case 'e':
                        # support terminal escape codes
                        string += '\033'
                    case other:
                        string += '\\' + other
                escaped = False
            elif char == '\\':
                escaped = True
            else:
                string += char
            self.pos += 1
        # consume trailing terminator
        self.pos += 1
        return self.Token(start, TokenType.String, content=string)

    def Token(self, start: int, type: TokenType, end=None, content=None) -> Token:
        """Build a Token spanning start..end (end defaults to the current position)."""
        if end is None:
            end = self.pos
        if content is None:
            content = self.content[start:end]
        return Token(Span(start, end, self.fname, self.context), content, type)
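
# Usage sketch, for illustration only: one way the token stream produced by
# `do_parse` might be consumed. It assumes LexingContext can be constructed
# with a `sources` mapping of file name -> source text (the only part of its
# interface used above) and that Token exposes a `type` field; the actual
# constructor and field names in .defs may differ, and the file name and input
# string below are purely hypothetical.
#
#   context = LexingContext(sources={"example.src": "count = 0x2A // answer\n"})
#   for token in Lexer("example.src", context).do_parse():
#       print(token)
#       if token.type == TokenType.EOI:
#           break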