Refactored sheerka class: split it into sub handlers. Refactored unit tests to use classes.
This commit is contained in:
@@ -0,0 +1,411 @@
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TokenKind(Enum):
    """Kinds of lexical tokens.

    Every member must have a distinct value: ``Enum`` silently turns a member
    whose value duplicates an earlier one into an *alias* of that earlier
    member, which makes the two kinds indistinguishable.
    """

    # Structural / multi-character tokens
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    CONCEPT = "concept"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"

    # Grouping
    LPAR = "lpar"
    RPAR = "rpar"
    LBRACKET = "lbracket"  # was "lbrace", which made LBRACE an alias of LBRACKET
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"

    # Operators and punctuation
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
    EQUALS = "="
    AT = "at"
    BACK_QUOTE = "bquote"  # `
    BACK_SLASH = "bslash"  # \
    CARAT = "carat"  # ^
    DOLLAR = "dollar"  # $
    EURO = "euro"  # €  (was "dollar", which made EURO an alias of DOLLAR)
    # NOTE(review): "steling" is misspelled; left unchanged in case the value
    # is persisted or compared somewhere outside this file.
    STERLING = "steling"  # £
    EMARK = "emark"  # !
    GREATER = "greater"  # >
    LESS = "less"  # <
    HASH = "HASH"  # #
    TILDE = "tilde"  # ~
    UNDERSCORE = "underscore"  # _
    DEGREE = "degree"  # °
|
||||
|
||||
|
||||
@dataclass
class Token:
    """A single lexical token together with its position in the source text."""

    type: TokenKind  # kind of token
    value: object  # payload (e.g. the matched text)
    index: int  # offset of the token in the text
    line: int  # line on which the token starts
    column: int  # column at which the token starts

    def __repr__(self):
        """Return a compact ``Token(...)`` form, using placeholders for invisible tokens."""
        kind = self.type
        if kind == TokenKind.IDENTIFIER:
            shown = str(self.value)
        else:
            # Tokens whose value would be unreadable when printed verbatim.
            placeholders = (
                (TokenKind.WHITESPACE, "<ws>"),
                (TokenKind.NEWLINE, r"\n"),
                (TokenKind.EOF, "<EOF>"),
            )
            shown = next(
                (label for marker, label in placeholders if kind == marker),
                self.value,
            )

        return f"Token({shown})"
|
||||
|
||||
|
||||
@dataclass
class LexerError(Exception):
    """Error raised by the tokenizer, carrying the position where it occurred."""

    message: str  # human readable description of the problem
    text: str  # text associated with the error
    index: int  # offset in the tokenized text
    line: int  # line of the error
    column: int  # column of the error
|
||||
|
||||
|
||||
class Keywords(Enum):
    """Reserved words; each member's value is the keyword's spelling."""

    # Definition keywords
    DEF = "def"
    CONCEPT = "concept"
    FROM = "from"
    BNF = "bnf"
    AS = "as"

    # Clause keywords
    WHERE = "where"
    PRE = "pre"
    POST = "post"
    ISA = "isa"
|
||||
|
||||
|
||||
class Tokenizer:
    """
    Class that can iterate on the tokens

    Iterating a ``Tokenizer`` yields ``Token`` objects for ``text`` and ends
    with a single ``TokenKind.EOF`` token. The scanning position is kept on
    the instance (``i``, ``line``, ``column``) and is advanced only *after*
    each token has been yielded.
    """

    KEYWORDS = {x.value for x in Keywords}

    # Characters that always form a one-character token on their own.
    # "+", "-" and "_" are listed as fallbacks: they are only looked up here
    # when they do not start a signed number / an identifier.
    # NOTE(review): TokenKind also declares PERCENT and AT, but "%" and "@"
    # are not produced by this tokenizer (they raise "Unknown token") -
    # confirm whether that is intended.
    _SINGLE_CHAR_TOKENS = {
        "+": TokenKind.PLUS,
        "-": TokenKind.MINUS,
        "_": TokenKind.UNDERSCORE,
        "/": TokenKind.SLASH,
        "*": TokenKind.STAR,
        "{": TokenKind.LBRACE,
        "}": TokenKind.RBRACE,
        "(": TokenKind.LPAR,
        ")": TokenKind.RPAR,
        "[": TokenKind.LBRACKET,
        "]": TokenKind.RBRACKET,
        "=": TokenKind.EQUALS,
        ",": TokenKind.COMMA,
        ".": TokenKind.DOT,
        ";": TokenKind.SEMICOLON,
        ":": TokenKind.COLON,
        "?": TokenKind.QMARK,
        "|": TokenKind.VBAR,
        "&": TokenKind.AMPER,
        "<": TokenKind.LESS,
        ">": TokenKind.GREATER,
        "!": TokenKind.EMARK,
        "`": TokenKind.BACK_QUOTE,
        "\\": TokenKind.BACK_SLASH,
        "^": TokenKind.CARAT,
        "$": TokenKind.DOLLAR,
        "€": TokenKind.EURO,
        "£": TokenKind.STERLING,
        "#": TokenKind.HASH,
        "°": TokenKind.DEGREE,
        "~": TokenKind.TILDE,
    }

    def __init__(self, text):
        """Prepare to tokenize ``text``; positions start at line 1, column 1."""
        self.text = text
        self.text_len = len(text)
        self.column = 1
        self.line = 1
        self.i = 0

    def __iter__(self):
        """Yield the tokens of ``self.text`` followed by one EOF token.

        Raises:
            LexerError: on an unknown character, a malformed concept name or
                an unterminated string.
        """
        while self.i < self.text_len:
            kind, value, width, line_breaks = self._scan_token()

            # The token is yielded before the position is advanced, so the
            # instance still points at the token's start while it is consumed.
            yield Token(kind, value, self.i, self.line, self.column)

            self.i += width
            if line_breaks:
                self.line += line_breaks
                self.column = 1
            else:
                self.column += width

        yield Token(TokenKind.EOF, "", self.i, self.line, self.column)

    def _scan_token(self):
        """Recognize the token starting at ``self.i`` without advancing.

        Returns:
            ``(kind, value, width, line_breaks)`` where ``width`` is the number
            of characters consumed and ``line_breaks`` the number of lines the
            token spans beyond its first one.
        """
        start = self.i
        c = self.text[start]
        following = self.text[start + 1] if start + 1 < self.text_len else ""

        # A sign directly followed by a digit is part of the number.
        if c in ("+", "-") and following.isdigit():
            number = self.eat_number(start)
            return TokenKind.NUMBER, number, len(number), 0

        # An underscore starts an identifier only when a letter follows.
        if c == "_" and following.isalpha():
            return self._word_token(self.eat_identifier(start))

        if c in self._SINGLE_CHAR_TOKENS:
            return self._SINGLE_CHAR_TOKENS[c], c, 1, 0

        if c in (" ", "\t"):
            whitespace = self.eat_whitespace(start)
            return TokenKind.WHITESPACE, whitespace, len(whitespace), 0

        if c in ("\n", "\r"):
            newline = self.eat_newline(start)
            return TokenKind.NEWLINE, newline, len(newline), 1

        # Concept reference "c:<name>:" - must be tested before identifiers.
        if c == "c" and following == ":":
            concept_name = self.eat_concept_name(start + 2, self.line, self.column)
            # +3 accounts for the leading "c:" and the closing ":".
            return TokenKind.CONCEPT, concept_name, len(concept_name) + 3, 0

        if c.isalpha():
            return self._word_token(self.eat_identifier(start))

        if c.isdigit():
            number = self.eat_number(start)
            return TokenKind.NUMBER, number, len(number), 0

        if c in ("'", '"'):
            # quotes are kept
            string, newlines = self.eat_string(start, self.line, self.column)
            return TokenKind.STRING, string, len(string), newlines

        raise LexerError(f"Unknown token '{c}'", self.text, start, self.line, self.column)

    def _word_token(self, identifier):
        """Classify ``identifier`` as a KEYWORD or an IDENTIFIER token."""
        if identifier in self.KEYWORDS:
            return TokenKind.KEYWORD, Keywords(identifier), len(identifier), 0
        return TokenKind.IDENTIFIER, identifier, len(identifier), 0

    def eat_concept_name(self, start, line, column):
        """Read a concept name starting at ``start`` up to the closing colon.

        Args:
            start: index of the first character of the name (after ``c:``).
            line: line of the concept token, used for error reporting.
            column: column of the concept token, used for error reporting.

        Returns:
            The name, without the closing colon.

        Raises:
            LexerError: if a "\\n" is met before the colon, if the colon is
                missing, or if the name is empty.
        """
        end = start
        while end < self.text_len and self.text[end] not in (":", "\n"):
            end += 1

        result = self.text[start:end]
        error_column = column + 2 + len(result)

        if end >= self.text_len:
            raise LexerError("Missing ending colon", result, end, line, error_column)

        if self.text[end] == "\n":
            raise LexerError("New line is forbidden in concept name", result, end, line, error_column)

        if result == "":
            raise LexerError("Concept name not found", result, start, line, error_column)

        return result

    def eat_whitespace(self, start):
        """Return the character at ``start`` plus the spaces/tabs that follow it."""
        end = start + 1
        while end < self.text_len and self.text[end] in (" ", "\t"):
            end += 1

        return self.text[start:end]

    def eat_newline(self, start):
        """Return the line break at ``start``.

        The two-character sequences "\\n\\r" and "\\r\\n" count as one break.
        """
        current = self.text[start]
        if start + 1 < self.text_len:
            following = self.text[start + 1]
            if (current, following) in (("\n", "\r"), ("\r", "\n")):
                return current + following

        return current

    def eat_identifier(self, start):
        """Return the identifier starting at ``start``.

        The first character is taken as is; the following ones may be
        letters, digits, "_" or "-".
        """
        end = start + 1
        while end < self.text_len:
            c = self.text[end]
            if not (c.isalpha() or c == "_" or c == "-" or c.isdigit()):
                break
            end += 1

        return self.text[start:end]

    def eat_number(self, start):
        """Return the number starting at ``start``.

        The first character (a digit or a sign) is taken as is; every
        following digit or "." is consumed, without validating the dots.
        """
        end = start + 1
        while end < self.text_len:
            c = self.text[end]
            if not (c.isdigit() or c == "."):
                break
            end += 1

        return self.text[start:end]

    def eat_string(self, start_index, start_line, start_column):
        """Read the quoted string whose opening quote is at ``start_index``.

        A backslash escapes the next character, so an escaped quote does not
        end the string while a quote following an escaped backslash does.

        Args:
            start_index: index of the opening quote.
            start_line: line of the opening quote, used for error reporting.
            start_column: column of the opening quote, used for error reporting.

        Returns:
            ``(string, lines_count)``: the string including both quotes, and
            the number of line breaks it contains.

        Raises:
            LexerError: if the end of the text is reached before an unescaped
                closing quote.
        """
        quote = self.text[start_index]
        lines_count = 0

        i = start_index + 1
        escape = False
        closed = False
        pending_newline = None  # line-break char seen just before, not yet counted
        while i < self.text_len:
            c = self.text[i]
            i += 1

            # Line accounting: a break is counted one character late so that
            # "\r\n" / "\n\r" pairs count once while "\n\n" counts twice.
            if pending_newline:
                lines_count += 1
                pending_newline = c if c == pending_newline else None
            elif c == "\r" or c == "\n":
                pending_newline = c

            if escape:
                # Whatever follows a backslash is taken literally; this also
                # consumes the second backslash of an escaped backslash.
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                closed = True
                break

        # add trailing new line if needed
        if pending_newline:
            lines_count += 1

        result = self.text[start_index:i]

        # An explicit flag is required: looking at the last character would
        # accept a lone opening quote or a string ending with an escaped quote.
        if not closed:
            raise LexerError("Missing Trailing quote", result, i, start_line + lines_count,
                             1 if lines_count > 0 else start_column + len(result))

        return result, lines_count
|
||||
Reference in New Issue
Block a user