# Sheerka-Old/src/core/tokenizer.py
from dataclasses import dataclass, field
from enum import Enum
class TokenKind(Enum):
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    CONCEPT = "concept"
    RULE = "rule"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
    EQUALS = "="
    AT = "at"
    BACK_QUOTE = "bquote"  # `
    BACK_SLASH = "bslash"  # \
    CARAT = "carat"  # ^
    DOLLAR = "dollar"  # $
    EURO = "euro"  # € (must differ from DOLLAR's value, or Enum would alias EURO to DOLLAR)
    STERLING = "sterling"  # £
    EMARK = "emark"  # !
    GREATER = "greater"  # >
    LESS = "less"  # <
    HASH = "hash"  # #
    TILDE = "tilde"  # ~
    UNDERSCORE = "underscore"  # _
    DEGREE = "degree"  # °
    WORD = "word"
    EQUALSEQUALS = "=="
    VAR_DEF = "__var__"
    REGEX = "regex"  # r'xxx' or r"xxx" or r|xxx| or r/xxx/

@dataclass()
class Token:
    type: TokenKind
    value: object
    index: int
    line: int
    column: int
    # Lazily computed caches, excluded from repr/compare.
    _strip_quote: str = field(default=None, repr=False, compare=False, hash=None)
    _str_value: str = field(default=None, repr=False, compare=False, hash=None)
    _repr_value: str = field(default=None, repr=False, compare=False, hash=None)

    def __repr__(self):
        return f"Token({self.repr_value})"

    @property
    def strip_quote(self):
        if self._strip_quote is not None:  # compare to None so an empty string stays cached
            return self._strip_quote
        self._strip_quote = self.value[1:-1] if self.type == TokenKind.STRING else self.value
        return self._strip_quote

    @property
    def str_value(self):
        if self._str_value is not None:
            return self._str_value
        self._str_value = self.to_str(False)
        return self._str_value

    @property
    def repr_value(self):
        if self._repr_value is not None:
            return self._repr_value
        if self.type == TokenKind.EOF:
            self._repr_value = "<EOF>"
        elif self.type == TokenKind.WHITESPACE:
            self._repr_value = "<ws!>" if self.value == "" else "<tab>" if self.value[0] == "\t" else "<ws>"
        elif self.type == TokenKind.NEWLINE:
            self._repr_value = "<nl>"
        elif self.type == TokenKind.CONCEPT:
            from core.utils import str_concept  # local import avoids a circular dependency
            self._repr_value = str_concept(self.value)
        elif self.type == TokenKind.RULE:
            from core.utils import str_concept
            self._repr_value = str_concept(self.value, prefix="r:")
        else:
            self._repr_value = self.str_value
        return self._repr_value

    @staticmethod
    def is_whitespace(token):
        return token and token.type == TokenKind.WHITESPACE

    def to_str(self, strip_quote):
        if strip_quote and self.type == TokenKind.STRING:
            return self.value[1:-1]
        elif self.type == TokenKind.KEYWORD:
            return self.value.value
        elif self.type == TokenKind.CONCEPT:
            from core.utils import str_concept
            return str_concept(self.value)
        elif self.type == TokenKind.RULE:
            from core.utils import str_concept
            return str_concept(self.value, prefix="r:")
        else:
            return str(self.value)

    def clone(self):
        return Token(self.type, self.value, self.index, self.line, self.column)

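# A minimal usage sketch (not part of the original module) showing how the
# cached display properties behave; the sample token is an assumption.
def _demo_token():
    t = Token(TokenKind.STRING, "'hi'", 0, 1, 1)
    assert t.strip_quote == "hi"              # quotes removed for STRING tokens
    assert t.to_str(strip_quote=False) == "'hi'"
    assert repr(t) == "Token('hi')"           # repr_value falls back to str_value
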
@dataclass()
class LexerError(Exception):
    message: str
    text: str
    index: int
    line: int
    column: int

class Keywords(Enum):
    DEF = "def"
    CONCEPT = "concept"
    RULE = "rule"
    FROM = "from"
    BNF = "bnf"
    AS = "as"
    WHERE = "where"
    PRE = "pre"
    POST = "post"
    ISA = "isa"
    RET = "ret"
    WHEN = "when"
    PRINT = "print"
    THEN = "then"

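# A minimal sketch (not part of the original module): the Tokenizer below only
# ever yields IDENTIFIER tokens, never KEYWORD tokens, so a caller wanting
# KEYWORD tokens has to promote identifiers itself. The helper name is
# hypothetical.
_KEYWORD_BY_VALUE = {kw.value: kw for kw in Keywords}

def _as_keyword(token):
    """Return a KEYWORD clone of an IDENTIFIER token that names a keyword."""
    kw = _KEYWORD_BY_VALUE.get(token.value)
    if token.type == TokenKind.IDENTIFIER and kw is not None:
        return Token(TokenKind.KEYWORD, kw, token.index, token.line, token.column)
    return token
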
class Tokenizer:
    """
    Iterates over the tokens of a source text.
    """

    def __init__(self, text, yield_eof=True, parse_word=False):
        self.text = text
        self.text_len = len(text)
        self.column = 1
        self.line = 1
        self.i = 0
        self.yield_eof = yield_eof
        self.parse_word = parse_word

    def __iter__(self):
        while self.i < self.text_len:
            c = self.text[self.i]
            if c == "+":
                # a sign directly followed by a digit starts a signed number
                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
                    number = self.eat_number(self.i)
                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                    self.i += len(number)
                    self.column += len(number)
                else:
                    yield Token(TokenKind.PLUS, "+", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "-":
                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
                    number = self.eat_number(self.i)
                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                    self.i += len(number)
                    self.column += len(number)
                else:
                    yield Token(TokenKind.MINUS, "-", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "_":
                from core.concept import VARIABLE_PREFIX  # local import avoids a circular dependency
                if self.i + 7 < self.text_len and \
                        self.text[self.i: self.i + 7] == VARIABLE_PREFIX and \
                        self.text[self.i + 7].isdigit():
                    number = self.eat_number(self.i + 7)
                    yield Token(TokenKind.VAR_DEF, VARIABLE_PREFIX + number, self.i, self.line, self.column)
                    self.i += 7 + len(number)
                    self.column += 7 + len(number)
                elif self.i + 1 < self.text_len and (self.text[self.i + 1].isalpha() or self.text[self.i + 1] == "_"):
                    identifier = self.eat_identifier(self.i)
                    yield Token(TokenKind.IDENTIFIER, identifier, self.i, self.line, self.column)
                    self.i += len(identifier)
                    self.column += len(identifier)
                else:
                    yield Token(TokenKind.UNDERSCORE, "_", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == "/":
                yield Token(TokenKind.SLASH, "/", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "*":
                yield Token(TokenKind.STAR, "*", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "{":
                yield Token(TokenKind.LBRACE, "{", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "}":
                yield Token(TokenKind.RBRACE, "}", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "(":
                yield Token(TokenKind.LPAR, "(", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ")":
                yield Token(TokenKind.RPAR, ")", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "[":
                yield Token(TokenKind.LBRACKET, "[", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "]":
                yield Token(TokenKind.RBRACKET, "]", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "=":
                if self.i + 1 < self.text_len and self.text[self.i + 1] == "=":
                    yield Token(TokenKind.EQUALSEQUALS, "==", self.i, self.line, self.column)
                    self.i += 2
                    self.column += 2
                else:
                    yield Token(TokenKind.EQUALS, "=", self.i, self.line, self.column)
                    self.i += 1
                    self.column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(self.i)
                yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column)
                self.i += len(whitespace)
                self.column += len(whitespace)
            elif c == ",":
                yield Token(TokenKind.COMMA, ",", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ".":
                yield Token(TokenKind.DOT, ".", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ";":
                yield Token(TokenKind.SEMICOLON, ";", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ":":
                yield Token(TokenKind.COLON, ":", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "?":
                yield Token(TokenKind.QMARK, "?", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "|":
                yield Token(TokenKind.VBAR, "|", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "&":
                yield Token(TokenKind.AMPER, "&", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "<":
                yield Token(TokenKind.LESS, "<", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == ">":
                yield Token(TokenKind.GREATER, ">", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "!":
                yield Token(TokenKind.EMARK, "!", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "`":
                yield Token(TokenKind.BACK_QUOTE, "`", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "\\":
                yield Token(TokenKind.BACK_SLASH, "\\", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "^":
                yield Token(TokenKind.CARAT, "^", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "$":
                yield Token(TokenKind.DOLLAR, "$", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "€":
                yield Token(TokenKind.EURO, "€", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "£":
                yield Token(TokenKind.STERLING, "£", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "#":
                yield Token(TokenKind.HASH, "#", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "°":
                yield Token(TokenKind.DEGREE, "°", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "~":
                yield Token(TokenKind.TILDE, "~", self.i, self.line, self.column)
                self.i += 1
                self.column += 1
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(self.i)
                yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column)
                self.i += len(newline)
                self.column = 1
                self.line += 1
            elif c == "c" and self.i + 1 < self.text_len and self.text[self.i + 1] == ":":
                name, id, length = self.eat_concept(self.i + 2, self.line, self.column + 2)
                yield Token(TokenKind.CONCEPT, (name, id), self.i, self.line, self.column)
                self.i += length + 2
                self.column += length + 2
            elif c == "r" and self.i + 1 < self.text_len and self.text[self.i + 1] == ":":
                name, id, length = self.eat_concept(self.i + 2, self.line, self.column + 2)
                yield Token(TokenKind.RULE, (name, id), self.i, self.line, self.column)
                self.i += length + 2
                self.column += length + 2
            elif c == "r" and self.i + 1 < self.text_len and self.text[self.i + 1] in "'\"|/":
                string, newlines, column_index = self.eat_string(self.i + 1, self.line, self.column)
                yield Token(TokenKind.REGEX, string, self.i, self.line, self.column)  # quotes are kept
                self.i += len(string) + 1  # +1 for the leading "r"
                self.column = column_index
                self.line += newlines
            elif self.parse_word and (c.isalpha() or c.isdigit()):
                word = self.eat_word(self.i)
                yield Token(TokenKind.WORD, word, self.i, self.line, self.column)
                self.i += len(word)
                self.column += len(word)
            elif c.isalpha():
                identifier = self.eat_identifier(self.i)
                yield Token(TokenKind.IDENTIFIER, identifier, self.i, self.line, self.column)
                self.i += len(identifier)
                self.column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(self.i)
                yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
                self.i += len(number)
                self.column += len(number)
            elif c == "'" or c == '"':
                string, newlines, column_index = self.eat_string(self.i, self.line, self.column)
                yield Token(TokenKind.STRING, string, self.i, self.line, self.column)  # quotes are kept
                self.i += len(string)
                self.column = column_index
                self.line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column)
        if self.yield_eof:
            yield Token(TokenKind.EOF, "", self.i, self.line, self.column)
    def eat_concept(self, start, line, column):
        """Parse the "key|id" body of a concept or rule literal, up to the closing colon."""
        key, id, buffer = None, None, ""
        i = start
        processing_key = True
        while i < self.text_len:
            c = self.text[i]
            if c == "\n":
                raise LexerError("New line in concept name", self.text[start:i], i, line, column + i - start)
            if c == ":":
                if processing_key:
                    key = buffer if buffer else None
                else:
                    id = buffer if buffer else None
                i += 1  # eat the colon
                break
            if c == "|":
                key = buffer if buffer else None
                buffer = ""
                processing_key = False
                i += 1
                continue
            buffer += c
            i += 1
        else:
            raise LexerError("Missing ending colon", self.text[start:i], i, line, column + i - start)
        if (key, id) == (None, None):
            raise LexerError("Concept identifiers not found", "", start, line, column)
        return key, id, i - start
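    # Illustrative examples (assumptions derived from the loop above):
    # "c:person|42:" yields key="person", id="42"; "c:person:" yields
    # key="person", id=None; "c:|42:" yields key=None, id="42".
    # At least one of the two parts must be present.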
    def eat_whitespace(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c == " " or c == "\t":
                result += c
                i += 1
            else:
                break
        return result
    def eat_newline(self, start):
        if start + 1 == self.text_len:
            return self.text[start]
        current = self.text[start]
        nxt = self.text[start + 1]
        if (current == "\n" and nxt == "\r") or (current == "\r" and nxt == "\n"):
            return current + nxt
        return current
    def eat_identifier(self, start):
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
                result += c
                i += 1
            else:
                break
        return result
    def eat_number(self, start):
        # Greedily consumes digits and dots; note it does not validate that
        # there is at most one decimal point.
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isdigit() or c == ".":
                result += c
                i += 1
            else:
                break
        return result
    def eat_string(self, start_index, start_line, start_column):
        quote = self.text[start_index]
        result = self.text[start_index]
        lines_count = 0
        column_index = start_column + 1
        i = start_index + 1
        escape = False
        terminated = False
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1
            column_index += 1
            if c == "\n":
                lines_count += 1
                column_index = 1
            if escape:
                # the escaped character (including an escaped backslash) is
                # consumed as-is, so "\\" does not leave escape mode stuck on
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                terminated = True
                break
        if not terminated:
            raise LexerError("Missing trailing quote", result, i, start_line + lines_count,
                             1 if lines_count > 0 else start_column + len(result))
        return result, lines_count, column_index
    def eat_word(self, start):
        """
        A word is a run of alphanumeric characters (no spaces).
        :param start: index of the first character
        :return: the word as a string
        """
        result = self.text[start]
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c.isdigit():
                result += c
                i += 1
            else:
                break
        return result

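# A minimal usage sketch (not part of the original module); the input string
# is an assumption.
def _demo_tokenizer():
    kinds = [t.type for t in Tokenizer("x = 1")]
    assert kinds == [TokenKind.IDENTIFIER, TokenKind.WHITESPACE, TokenKind.EQUALS,
                     TokenKind.WHITESPACE, TokenKind.NUMBER, TokenKind.EOF]
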
class IterParser:
    def __init__(self, source):
        self.source = source
        self.iterator = iter(Tokenizer(source))
        self.tokens_after = []  # lookahead buffer filled by the_token_after()
        self.token = None
        self.error_sink = None

    def next_token(self, skip_whitespace=True):
        try:
            while True:
                # Drain the lookahead buffer before pulling from the iterator,
                # so skipping whitespace never bypasses a buffered token.
                if len(self.tokens_after) > 0:
                    self.token = self.tokens_after.pop(0)
                else:
                    self.token = next(self.iterator)
                if skip_whitespace and self.token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    continue
                return self.token.type != TokenKind.EOF
        except StopIteration:
            return False

    def the_token_after(self, skip_whitespace=True):
        try:
            token_after = next(self.iterator)
            self.tokens_after.append(token_after)
            if skip_whitespace:
                while token_after.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    token_after = next(self.iterator)
                    self.tokens_after.append(token_after)
            return token_after
        except StopIteration:
            return Token(TokenKind.EOF, "", -1, -1, -1)

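# A minimal usage sketch (not part of the original module); the input string
# is an assumption.
def _demo_iter_parser():
    parser = IterParser("a == 1")
    values = []
    while parser.next_token():  # skips whitespace and newlines by default
        values.append(parser.token.str_value)
    assert values == ["a", "==", "1"]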