from dataclasses import dataclass, field
from enum import Enum


class TokenKind(Enum):
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    CONCEPT = "concept"
    RULE = "rule"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
    EQUALS = "="
    AT = "at"
    BACK_QUOTE = "bquote"  # `
    BACK_SLASH = "bslash"  # \
    CARAT = "carat"  # ^
    DOLLAR = "dollar"  # $
    EURO = "euro"  # € (must not reuse "dollar", or EURO becomes an alias of DOLLAR)
    STERLING = "sterling"  # £
    EMARK = "emark"  # !
    GREATER = "greater"  # >
    LESS = "less"  # <
    HASH = "hash"  # #
    TILDE = "tilde"  # ~
    UNDERSCORE = "underscore"  # _
    DEGREE = "degree"  # °
    WORD = "word"
    EQUALSEQUALS = "=="
    VAR_DEF = "__var__"
    REGEX = "r'xxx' or r\"xxx\" or r|xxx| or r/xxx/"


@dataclass()
class Token:
    type: TokenKind
    value: object
    index: int
    line: int
    column: int

    # lazily computed caches, excluded from repr/compare
    _strip_quote: str = field(default=None, repr=False, compare=False, hash=None)
    _str_value: str = field(default=None, repr=False, compare=False, hash=None)
    _repr_value: str = field(default=None, repr=False, compare=False, hash=None)

    def __repr__(self):
        return f"Token({self.repr_value})"

    @property
    def strip_quote(self):
        if self._strip_quote is not None:
            return self._strip_quote

        self._strip_quote = self.value[1:-1] if self.type == TokenKind.STRING else self.value
        return self._strip_quote

    @property
    def str_value(self):
        if self._str_value is not None:
            return self._str_value

        self._str_value = self.to_str(False)
        return self._str_value

    @property
    def repr_value(self):
        if self._repr_value is not None:
            return self._repr_value

        if self.type == TokenKind.EOF:
            self._repr_value = "<EOF>"
        elif self.type == TokenKind.WHITESPACE:
            self._repr_value = "<ws!>" if self.value == "" else "<tab>" if self.value[0] == "\t" else "<ws>"
        elif self.type == TokenKind.NEWLINE:
            self._repr_value = "<nl>"
        elif self.type == TokenKind.CONCEPT:
            from core.utils import str_concept
            self._repr_value = str_concept(self.value)
        elif self.type == TokenKind.RULE:
            from core.utils import str_concept
            self._repr_value = str_concept(self.value, prefix="r:")
        else:
            self._repr_value = self.str_value
        return self._repr_value

    @staticmethod
    def is_whitespace(token):
        return token and token.type == TokenKind.WHITESPACE

    def to_str(self, strip_quote):
        if strip_quote and self.type == TokenKind.STRING:
            return self.value[1:-1]
        elif self.type == TokenKind.KEYWORD:
            return self.value.value
        elif self.type == TokenKind.CONCEPT:
            from core.utils import str_concept
            return str_concept(self.value)
        elif self.type == TokenKind.RULE:
            from core.utils import str_concept
            return str_concept(self.value, prefix="r:")
        else:
            return str(self.value)

    def clone(self):
        return Token(self.type, self.value, self.index, self.line, self.column)
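

# Illustrative example (not part of the original module): a STRING token keeps its
# surrounding quotes in `value`; `strip_quote` exposes the inner text.
#
#   t = Token(TokenKind.STRING, '"hello"', 0, 1, 1)
#   t.strip_quote   -> 'hello'
#   repr(t)         -> 'Token("hello")'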


@dataclass()
class LexerError(Exception):
    message: str
    text: str
    index: int
    line: int
    column: int


class Keywords(Enum):
    DEF = "def"
    CONCEPT = "concept"
    RULE = "rule"
    FROM = "from"
    BNF = "bnf"
    AS = "as"
    WHERE = "where"
    PRE = "pre"
    POST = "post"
    ISA = "isa"
    RET = "ret"
    WHEN = "when"
    PRINT = "print"
    THEN = "then"


class Tokenizer:
    """
    Iterator over the tokens of `text`; yields Token objects one by one.
    """

    def __init__(self, text, yield_eof=True, parse_word=False):
        self.text = text
        self.text_len = len(text)
        self.column = 1
        self.line = 1
        self.i = 0
        self.yield_eof = yield_eof
        self.parse_word = parse_word

    def __iter__(self):

        while self.i < self.text_len:
            c = self.text[self.i]
            if c == "+":
                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
                    number = self.eat_number(self.i)
                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                    self.i += len(number)
                    self.column += len(number)
                else:
                    yield Token(TokenKind.PLUS, "+", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "-":
                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
                    number = self.eat_number(self.i)
                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                    self.i += len(number)
                    self.column += len(number)
                else:
                    yield Token(TokenKind.MINUS, "-", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "_":
                from core.concept import VARIABLE_PREFIX
                prefix_len = len(VARIABLE_PREFIX)  # avoids hard-coding the prefix length
                if self.i + prefix_len < self.text_len and \
                        self.text[self.i: self.i + prefix_len] == VARIABLE_PREFIX and \
                        self.text[self.i + prefix_len].isdigit():
                    number = self.eat_number(self.i + prefix_len)
                    yield Token(TokenKind.VAR_DEF, VARIABLE_PREFIX + number, self.i, self.line, self.column)
                    self.i += prefix_len + len(number)
                    self.column += prefix_len + len(number)
                elif self.i + 1 < self.text_len and (self.text[self.i + 1].isalpha() or self.text[self.i + 1] == "_"):
                    identifier = self.eat_identifier(self.i)
                    yield Token(TokenKind.IDENTIFIER, identifier, self.i, self.line, self.column)
                    self.i += len(identifier)
                    self.column += len(identifier)
                else:
                    yield Token(TokenKind.UNDERSCORE, "_", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "/":
                yield Token(TokenKind.SLASH, "/", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "*":
                yield Token(TokenKind.STAR, "*", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "{":
                yield Token(TokenKind.LBRACE, "{", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "}":
                yield Token(TokenKind.RBRACE, "}", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "(":
                yield Token(TokenKind.LPAR, "(", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ")":
                yield Token(TokenKind.RPAR, ")", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "[":
                yield Token(TokenKind.LBRACKET, "[", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "]":
                yield Token(TokenKind.RBRACKET, "]", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "=":
                if self.i + 1 < self.text_len and self.text[self.i + 1] == "=":
                    yield Token(TokenKind.EQUALSEQUALS, "==", self.i, self.line, self.column)
                    self.i += 2
                    self.column += 2
                else:
                    yield Token(TokenKind.EQUALS, "=", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(self.i)
                yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column)
                self.i += len(whitespace)
                self.column += len(whitespace)
            elif c == ",":
                yield Token(TokenKind.COMMA, ",", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ".":
                yield Token(TokenKind.DOT, ".", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ";":
                yield Token(TokenKind.SEMICOLON, ";", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ":":
                yield Token(TokenKind.COLON, ":", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "?":
                yield Token(TokenKind.QMARK, "?", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "|":
                yield Token(TokenKind.VBAR, "|", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "&":
                yield Token(TokenKind.AMPER, "&", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "<":
                yield Token(TokenKind.LESS, "<", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ">":
                yield Token(TokenKind.GREATER, ">", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "!":
                yield Token(TokenKind.EMARK, "!", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "`":
                yield Token(TokenKind.BACK_QUOTE, "`", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "\\":
                yield Token(TokenKind.BACK_SLASH, "\\", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "^":
                yield Token(TokenKind.CARAT, "^", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "$":
                yield Token(TokenKind.DOLLAR, "$", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "€":
                yield Token(TokenKind.EURO, "€", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "£":
                yield Token(TokenKind.STERLING, "£", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "#":
                yield Token(TokenKind.HASH, "#", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "°":
                yield Token(TokenKind.DEGREE, "°", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "~":
                yield Token(TokenKind.TILDE, "~", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(self.i)
                yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column)
                self.i += len(newline)
                self.column = 1
                self.line += 1
            elif c == "c" and self.i + 1 < self.text_len and self.text[self.i + 1] == ":":
                name, id, length = self.eat_concept(self.i + 2, self.line, self.column + 2)
                yield Token(TokenKind.CONCEPT, (name, id), self.i, self.line, self.column)
                self.i += length + 2
                self.column += length + 2
            elif c == "r" and self.i + 1 < self.text_len and self.text[self.i + 1] == ":":
                name, id, length = self.eat_concept(self.i + 2, self.line, self.column + 2)
                yield Token(TokenKind.RULE, (name, id), self.i, self.line, self.column)
                self.i += length + 2
                self.column += length + 2
            elif c == "r" and self.i + 1 < self.text_len and self.text[self.i + 1] in "'\"|/":
                string, newlines, column_index = self.eat_string(self.i + 1, self.line, self.column)
                yield Token(TokenKind.REGEX, string, self.i, self.line, self.column)  # quotes are kept
                self.i += len(string) + 1
                self.column = column_index
                self.line += newlines
            elif self.parse_word and (c.isalpha() or c.isdigit()):
                word = self.eat_word(self.i)
                yield Token(TokenKind.WORD, word, self.i, self.line, self.column)
                self.i += len(word)
                self.column += len(word)
            elif c.isalpha():
                identifier = self.eat_identifier(self.i)
                yield Token(TokenKind.IDENTIFIER, identifier, self.i, self.line, self.column)
                self.i += len(identifier)
                self.column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(self.i)
                yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                self.i += len(number)
                self.column += len(number)
            elif c == "'" or c == '"':
                string, newlines, column_index = self.eat_string(self.i, self.line, self.column)
                yield Token(TokenKind.STRING, string, self.i, self.line, self.column)  # quotes are kept
                self.i += len(string)
                self.column = column_index
                self.line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column)

        if self.yield_eof:
            yield Token(TokenKind.EOF, "", self.i, self.line, self.column)

    def eat_concept(self, start, line, column):
        key, id, buffer = None, None, ""
        i = start
        processing_key = True

        while i < self.text_len:
            c = self.text[i]
            if c == "\n":
                raise LexerError("New line in concept name", self.text[start:i], i, line, column + i - start)

            if c == ":":
                if processing_key:
                    key = buffer if buffer else None
                else:
                    id = buffer if buffer else None
                i += 1  # eat the colon
                break

            if c == "|":
                key = buffer if buffer else None
                buffer = ""
                processing_key = False
                i += 1
                continue

            buffer += c
            i += 1
        else:
            raise LexerError("Missing ending colon", self.text[start:i], i, line, column + i - start)

        if (key, id) == (None, None):
            raise LexerError("Concept identifiers not found", "", start, line, column)

        return key, id, i - start

    def eat_whitespace(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c == " " or c == "\t":
                result += c
                i += 1
            else:
                break

        return result

    def eat_newline(self, start):
        if start + 1 == self.text_len:
            return self.text[start]

        current = self.text[start]
        next_char = self.text[start + 1]
        # a "\n\r" or "\r\n" pair counts as a single newline
        if (current == "\n" and next_char == "\r") or (current == "\r" and next_char == "\n"):
            return current + next_char

        return current

    def eat_identifier(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
                result += c
                i += 1
            else:
                break

        return result

    def eat_number(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isdigit() or c == ".":
                result += c
                i += 1
            else:
                break

        return result

    def eat_string(self, start_index, start_line, start_column):
        quote = self.text[start_index]
        result = self.text[start_index]
        lines_count = 0
        column_index = start_column + 1

        i = start_index + 1
        escape = False
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1
            column_index += 1

            if c == "\n":
                lines_count += 1
                column_index = 1

            if c == "\\" and not escape:
                # start of an escape sequence; an escaped backslash does not start a new one
                escape = True
            elif c == quote and not escape:
                break
            else:
                escape = False

        if result[-1] != quote:
            raise LexerError("Missing trailing quote", result, i, start_line + lines_count,
                             1 if lines_count > 0 else start_column + len(result))

        return result, lines_count, column_index
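
    # Illustrative example (not part of the original module): for a text starting with
    # '"abc"', eat_string(0, 1, 1) returns ('"abc"', 0, 6) - the quoted text with its
    # quotes, the number of newlines consumed (0), and the column just past the
    # closing quote.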

    def eat_word(self, start):
        """
        A word is a run of alphanumeric characters (no spaces).
        :param start:
        :return:
        """
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c.isdigit():
                result += c
                i += 1
            else:
                break

        return result
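

# Illustrative sketch (not part of the original module): tokenizing "a = 1" with
# yield_eof=False yields IDENTIFIER("a"), WHITESPACE, EQUALS, WHITESPACE, NUMBER("1"):
#
#   [t.repr_value for t in Tokenizer("a = 1", yield_eof=False)]
#   -> ['a', '<ws>', '=', '<ws>', '1']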


class IterParser:
    """
    Pull-style wrapper around Tokenizer: next_token() advances, the_token_after()
    peeks ahead (buffering what it reads so next_token() still sees it).
    """

    def __init__(self, source):
        self.source = source
        self.iterator = iter(Tokenizer(source))
        self.tokens_after = []
        self.token = None
        self.error_sink = None

    def next_token(self, skip_whitespace=True):
        try:
            if len(self.tokens_after) > 0:
                self.token = self.tokens_after.pop(0)
            else:
                self.token = next(self.iterator)
            if skip_whitespace:
                while self.token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    self.token = next(self.iterator)
            return self.token.type != TokenKind.EOF
        except StopIteration:
            return False

    def the_token_after(self, skip_whitespace=True):
        try:
            token_after = next(self.iterator)
            self.tokens_after.append(token_after)
            if skip_whitespace:
                while token_after.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    token_after = next(self.iterator)
                    self.tokens_after.append(token_after)

            return token_after
        except StopIteration:
            return Token(TokenKind.EOF, -1, -1, -1, -1)
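

# Minimal usage sketch (illustrative addition, not part of the original module):
# dump the token stream of a small input, then walk it again with IterParser.
if __name__ == "__main__":
    for tok in Tokenizer('x = "hi" + 2'):
        print(tok)

    parser = IterParser('x = "hi" + 2')
    while parser.next_token():  # skips whitespace and newlines by default
        print(parser.token.type, parser.token.str_value)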