from dataclasses import dataclass
from enum import Enum


class TokenKind(Enum):
    """Kinds of tokens produced by :class:`Tokenizer`.

    NOTE(review): values are lowercase token names except ``EQUALS`` ("=") and
    ``HASH`` ("HASH"); those are kept as-is because changing an enum value
    could affect anything that serializes tokens by value.
    """

    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    CONCEPT = "concept"
    STRING = "string"
    NUMBER = "number"
    # NOTE(review): TRUE/FALSE are never emitted by the tokenizer below;
    # "true"/"false" currently lex as IDENTIFIER — confirm whether intended.
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    # BUG FIX: was "lbrace", the same value as LBRACE below.  Duplicate Enum
    # values create *aliases*, so TokenKind.LBRACE was the very same member as
    # TokenKind.LBRACKET and '[' / '{' produced indistinguishable token kinds.
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
    EQUALS = "="
    AT = "at"
    BACK_QUOTE = "bquote"  # `
    BACK_SLASH = "bslash"  # \
    CARAT = "carat"  # ^
    DOLLAR = "dollar"  # $
    EMARK = "emark"  # !
    GREATER = "greater"  # >
    LESS = "less"  # <
    HASH = "HASH"  # #
    TILDE = "tilde"  # ~
    UNDERSCORE = "underscore"  # _
    DEGREE = "degree"  # °


@dataclass()
class Token:
    """A lexed token with its source position (1-based line/column)."""

    type: TokenKind
    value: object
    index: int
    line: int
    column: int

    def __repr__(self):
        # Whitespace/EOF render empty, newlines render as the literal "\n"
        # escape, so token streams stay readable when printed.
        if self.type == TokenKind.IDENTIFIER:
            value = str(self.value)
        elif self.type == TokenKind.WHITESPACE:
            value = ""
        elif self.type == TokenKind.NEWLINE:
            value = r"\n"
        elif self.type == TokenKind.EOF:
            value = ""
        else:
            value = self.value
        return f"Token({value})"


@dataclass()
class LexerError(Exception):
    """Raised on invalid input; carries the offending text and its position."""

    message: str
    text: str
    index: int
    line: int
    column: int


class Keywords(Enum):
    """Reserved words of the language; KEYWORD tokens carry these as value."""

    DEF = "def"
    CONCEPT = "concept"
    FROM = "from"
    BNF = "bnf"
    AS = "as"
    WHERE = "where"
    PRE = "pre"
    POST = "post"
    ISA = "isa"


class Tokenizer:
    """Iterate over the tokens of a text.

    Iterating yields every token (including WHITESPACE and NEWLINE tokens)
    and always ends with a single EOF token.  Raises :class:`LexerError` on
    unknown characters, malformed concept literals and unterminated strings.
    """

    KEYWORDS = set(x.value for x in Keywords)

    # Characters that always lex as a one-character token.  '+' and '-' are
    # also here, but the signed-number lookahead in __iter__ runs first.
    SINGLE_CHAR_TOKENS = {
        "+": TokenKind.PLUS,
        "-": TokenKind.MINUS,
        "/": TokenKind.SLASH,
        "*": TokenKind.STAR,
        "{": TokenKind.LBRACE,
        "}": TokenKind.RBRACE,
        "(": TokenKind.LPAR,
        ")": TokenKind.RPAR,
        "[": TokenKind.LBRACKET,
        "]": TokenKind.RBRACKET,
        "=": TokenKind.EQUALS,
        ",": TokenKind.COMMA,
        ".": TokenKind.DOT,
        ";": TokenKind.SEMICOLON,
        ":": TokenKind.COLON,
        "?": TokenKind.QMARK,
        "|": TokenKind.VBAR,
        "&": TokenKind.AMPER,
        "<": TokenKind.LESS,
        ">": TokenKind.GREATER,
    }

    def __init__(self, text):
        self.text = text
        self.text_len = len(text)
        self.column = 1
        self.line = 1
        self.i = 0

    def __iter__(self):
        while self.i < self.text_len:
            c = self.text[self.i]
            has_next = self.i + 1 < self.text_len
            if (c == "+" or c == "-") and has_next and self.text[self.i + 1].isdigit():
                # Signed number literal, e.g. "+3" or "-3.5".
                number = self.eat_number(self.i)
                yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                self.i += len(number)
                self.column += len(number)
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(self.i)
                yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column)
                self.i += len(whitespace)
                self.column += len(whitespace)
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(self.i)
                yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column)
                self.i += len(newline)
                self.column = 1
                self.line += 1
            elif c == "c" and has_next and self.text[self.i + 1] == ":":
                # Concept literal of the form "c:<name>:".
                concept_name = self.eat_concept_name(self.i + 2, self.line, self.column)
                yield Token(TokenKind.CONCEPT, concept_name, self.i, self.line, self.column)
                self.i += len(concept_name) + 3  # "c:" + name + ":"
                self.column += len(concept_name) + 3
            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(self.i)
                is_keyword = identifier in self.KEYWORDS
                token_type = TokenKind.KEYWORD if is_keyword else TokenKind.IDENTIFIER
                value = Keywords(identifier) if is_keyword else identifier
                yield Token(token_type, value, self.i, self.line, self.column)
                self.i += len(identifier)
                self.column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(self.i)
                yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                self.i += len(number)
                self.column += len(number)
            elif c == "'" or c == '"':
                string, newlines = self.eat_string(self.i, self.line, self.column)
                # Quotes are kept in the token value.
                yield Token(TokenKind.STRING, string, self.i, self.line, self.column)
                self.i += len(string)
                if newlines > 0:
                    # BUG FIX: column used to be reset to 1 after a multi-line
                    # string; it must point just past the closing quote on the
                    # string's last line.
                    last_nl = max(string.rfind("\n"), string.rfind("\r"))
                    self.column = len(string) - last_nl
                else:
                    self.column += len(string)
                self.line += newlines
            elif c in self.SINGLE_CHAR_TOKENS:
                yield Token(self.SINGLE_CHAR_TOKENS[c], c, self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column)
        yield Token(TokenKind.EOF, "", self.i, self.line, self.column)

    def eat_concept_name(self, start, line, column):
        """Read a concept name starting right after "c:"; stop at the closing colon.

        Raises LexerError when the name contains a newline, when the closing
        colon is missing, or when the name is empty.
        """
        result = ""
        i = start
        end_colon_found = False
        while i < self.text_len:
            c = self.text[i]
            if c == "\n":
                raise LexerError("New line is forbidden in concept name", result, i, line, column + 2 + len(result))
            if c == ":":
                end_colon_found = True
                break
            result += c
            i += 1
        if not end_colon_found:
            raise LexerError("Missing ending colon", result, i, line, column + 2 + len(result))
        if result == "":
            # NOTE(review): "Context" may be a typo for "Concept"; kept
            # byte-identical because it is a runtime error message.
            raise LexerError("Context name not found", result, start, line, column + 2 + len(result))
        return result

    def eat_whitespace(self, start):
        """Consume the run of spaces/tabs beginning at *start*."""
        i = start + 1
        while i < self.text_len and self.text[i] in (" ", "\t"):
            i += 1
        return self.text[start:i]

    def eat_newline(self, start):
        """Consume one newline: a single "\\n" or "\\r", or a mixed two-char pair.

        Both "\\r\\n" and "\\n\\r" count as one newline; "\\n\\n" is two.
        """
        first = self.text[start]
        if start + 1 < self.text_len:
            second = self.text[start + 1]
            if (first == "\n" and second == "\r") or (first == "\r" and second == "\n"):
                return first + second
        return first

    def eat_identifier(self, start):
        """Consume an identifier: letters, digits, '_' and '-' after the first char."""
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if not (c.isalpha() or c.isdigit() or c == "_" or c == "-"):
                break
            i += 1
        return self.text[start:i]

    def eat_number(self, start):
        """Consume a number literal: digits and dots (sign already at *start*).

        NOTE(review): multiple dots ("1.2.3") are accepted here and left for a
        later stage to reject — confirm intended.
        """
        i = start + 1
        while i < self.text_len and (self.text[i].isdigit() or self.text[i] == "."):
            i += 1
        return self.text[start:i]

    def eat_string(self, start_index, start_line, start_column):
        """Consume a (possibly multi-line) quoted string starting at *start_index*.

        Returns ``(lexeme_including_quotes, newline_count)`` where adjacent
        "\\r\\n" / "\\n\\r" pairs count as a single newline.  Raises
        LexerError when the closing quote is missing.

        BUG FIXES vs. the previous version: an escaped backslash no longer
        leaves the escape flag set (so ``'a\\\\'`` terminates correctly), a
        closing quote immediately after a newline is no longer ignored, and
        unterminated strings are detected with an explicit flag instead of
        inspecting the last character (which accepted a lone quote at EOF).
        """
        quote = self.text[start_index]
        result = quote
        lines_count = 0
        i = start_index + 1
        escape = False
        closed = False
        pending_nl = None  # newline char awaiting a possible CRLF/LFCR partner
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1
            if c == "\r" or c == "\n":
                if pending_nl is not None and c != pending_nl:
                    # Second half of a \r\n / \n\r pair: already counted.
                    pending_nl = None
                else:
                    lines_count += 1
                    pending_nl = c
                escape = False
                continue
            pending_nl = None
            if escape:
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                closed = True
                break
        if not closed:
            raise LexerError("Missing Trailing quote", result, i, start_line + lines_count, 1 if lines_count > 0 else start_column + len(result))
        return result, lines_count