from dataclasses import dataclass
from typing import Iterator
from typing import Tuple


@dataclass(frozen=True)
class Token:
    """A single lexical token produced by :class:`TokenIter`."""

    type: str    # one of the string constants defined on ``Tokens``
    value: str   # exact source text covered by the token ("" for EOF)
    index: int   # 0-based offset of the token's first character in the text
    line: int    # 1-based line of the token's first character
    column: int  # 1-based column of the token's first character


# NOTE(review): a frozen dataclass rejects ordinary attribute assignment;
# code that assigns attributes on a caught exception instance would fail on
# this type. Kept frozen to preserve the existing equality/hash behaviour.
@dataclass(frozen=True)
class LexerError(Exception):
    """Raised by :class:`TokenIter` when a character cannot start any token."""

    message: str  # human readable description of the problem
    text: str     # the complete text that was being tokenized
    index: int    # 0-based offset of the offending character
    line: int     # 1-based line of the offending character
    column: int   # 1-based column of the offending character


class Tokens:
    """String constants identifying the token types."""

    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    # Was "lbrace", which made "[" indistinguishable from "{" by token type.
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"


class TokenIter:
    """
    Class that can iterate on the tokens

    Iterating over an instance yields :class:`Token` objects for the text
    given to the constructor, ending with a single EOF token.
    """

    KEYWORDS = ("def", "concept", "as", "pre", "post")

    # Characters that always form a one-character token on their own.
    # "-" is absent on purpose: it may also start a negative number.
    _SINGLE_CHAR_TOKENS = {
        "+": Tokens.PLUS,
        "/": Tokens.SLASH,
        "*": Tokens.STAR,
        "{": Tokens.LBRACE,
        "}": Tokens.RBRACE,
        "(": Tokens.LPAR,
        ")": Tokens.RPAR,
        "[": Tokens.LBRACKET,
        "]": Tokens.RBRACKET,
        ",": Tokens.COMMA,
        ".": Tokens.DOT,
        ";": Tokens.SEMICOLON,
        ":": Tokens.COLON,
        "?": Tokens.QMARK,
    }

    def __init__(self, text):
        self.text = text
        self.text_len = len(text)

    def __iter__(self) -> Iterator[Token]:
        """
        Yield the tokens of ``self.text`` in order, followed by an EOF token.

        Whitespace and newlines are emitted as tokens, not skipped.

        :raises LexerError: when a character does not start any known token
        """
        text = self.text
        i = 0
        line = 1
        column = 1

        while i < self.text_len:
            c = text[i]

            if c in self._SINGLE_CHAR_TOKENS:
                yield Token(self._SINGLE_CHAR_TOKENS[c], c, i, line, column)
                i += 1
                column += 1

            elif c.isdigit() or (
                c == "-" and i + 1 < self.text_len and text[i + 1].isdigit()
            ):
                # A "-" immediately followed by a digit is part of the number.
                number = self.eat_number(i)
                yield Token(Tokens.NUMBER, number, i, line, column)
                i += len(number)
                column += len(number)

            elif c == "-":
                yield Token(Tokens.MINUS, "-", i, line, column)
                i += 1
                column += 1

            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(i)
                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
                i += len(whitespace)
                column += len(whitespace)

            elif c == "\n" or c == "\r":
                newline = self.eat_newline(i)
                yield Token(Tokens.NEWLINE, newline, i, line, column)
                i += len(newline)
                column = 1
                line += 1

            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(i)
                if identifier in self.KEYWORDS:
                    token_type = Tokens.KEYWORD
                else:
                    token_type = Tokens.IDENTIFIER
                yield Token(token_type, identifier, i, line, column)
                i += len(identifier)
                column += len(identifier)

            elif c == "'" or c == '"':
                string, newlines = self.eat_string(i)
                yield Token(Tokens.STRING, string, i, line, column)
                i += len(string)
                if newlines > 0:
                    # The next token sits on the string's last line, right
                    # after whatever follows the final line break.
                    last_break = max(string.rfind("\n"), string.rfind("\r"))
                    column = len(string) - last_break
                else:
                    column += len(string)
                line += newlines

            else:
                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)

        yield Token(Tokens.EOF, "", i, line, column)

    def _scan(self, start, accepts) -> str:
        """Return text[start:end]: the char at ``start`` plus every following char ``accepts`` allows."""
        end = start + 1
        while end < self.text_len and accepts(self.text[end]):
            end += 1
        return self.text[start:end]

    def eat_whitespace(self, start) -> str:
        """Return the run of spaces and tabs that begins at ``start``."""
        return self._scan(start, lambda c: c == " " or c == "\t")

    def eat_newline(self, start) -> str:
        """
        Return the line break that begins at ``start``.

        A two-character pair made of one "\\n" and one "\\r" (in either order)
        is returned as a single line break; otherwise only the character at
        ``start`` is returned.
        """
        current = self.text[start]
        if start + 1 == self.text_len:
            return current

        following = self.text[start + 1]
        if (current, following) in (("\n", "\r"), ("\r", "\n")):
            return current + following
        return current

    def eat_identifier(self, start) -> str:
        """
        Return the identifier that begins at ``start``.

        After the first character, letters, digits, "_" and "-" are accepted.
        """
        return self._scan(
            start,
            lambda c: c.isalpha() or c == "_" or c == "-" or c.isdigit(),
        )

    def eat_number(self, start) -> str:
        """
        Return the number that begins at ``start``.

        The character at ``start`` is taken as-is (the caller passes a digit
        or a leading "-"); digits and "." are accepted afterwards.
        """
        return self._scan(start, lambda c: c.isdigit() or c == ".")

    def eat_string(self, start) -> Tuple[str, int]:
        """
        Return the quoted string that begins at ``start`` and its line count.

        The character at ``start`` is the quote; the string runs up to and
        including the next unescaped occurrence of that quote. A backslash
        escapes the character that follows it, including another backslash.
        If no closing quote is found, everything up to the end of the text is
        returned.

        :return: ``(string, lines_count)`` where ``string`` includes the
            quotes and ``lines_count`` is the number of line breaks inside it
        """
        text = self.text
        quote = text[start]
        lines_count = 0
        escape = False
        pending_newline = None  # line-break char seen but not yet counted

        i = start + 1
        while i < self.text_len:
            c = text[i]
            i += 1

            # A line break is counted once the character after it is seen, so
            # that a two-character break made of different chars counts once.
            if pending_newline:
                lines_count += 1
                pending_newline = c if c == pending_newline else None
            elif c == "\r" or c == "\n":
                pending_newline = c

            if escape:
                # This character is escaped, whatever it is. Resetting here
                # (rather than re-arming on every backslash) is what lets an
                # escaped backslash be followed by a real closing quote.
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                break

        if pending_newline:
            lines_count += 1

        return text[start:i], lines_count