Files
Sheerka-Old/parsers/defaultparser.py
T
2019-10-22 19:02:51 +02:00

250 lines
7.0 KiB
Python

from dataclasses import dataclass
@dataclass(frozen=True)
class Token:
    """
    Immutable lexeme together with its position in the source text.
    """

    # Token category; the lexer fills it with one of the ``Tokens`` constants.
    type: str
    # Exact text of the lexeme (empty for the end-of-input token).
    value: str
    # 0-based offset of the first character of the lexeme in the source text.
    index: int
    # 1-based line on which the lexeme starts.
    line: int
    # 1-based column at which the lexeme starts.
    column: int
@dataclass(frozen=True)
class LexerError(Exception):
    """
    Error raised by the lexer when it meets a character that starts no token.
    """

    # Human readable description of the problem.
    message: str
    # Complete text that was being tokenized.
    text: str
    # 0-based offset of the offending character in ``text``.
    index: int
    # 1-based line of the offending character.
    line: int
    # 1-based column of the offending character.
    column: int
class Tokens:
    """
    Names of the token categories, used as ``Token.type``.

    Every constant must have a distinct value, otherwise two different
    lexemes become indistinguishable once tokenized.
    """

    # Structural tokens
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"

    # Words and literals
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"

    # Paired delimiters
    LPAR = "lpar"
    RPAR = "rpar"
    # Was "lbrace", which made "[" indistinguishable from "{".
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"

    # Operators
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"

    # Punctuation
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
class TokenIter:
    """
    Class that can iterate on the tokens

    Iterating over an instance yields one ``Token`` per lexeme found in the
    text (whitespace and line breaks included) followed by a final EOF token.
    """

    KEYWORDS = ("def", "concept", "as", "pre", "post")

    def __init__(self, text):
        """
        :param text: the complete text to tokenize
        """
        self.text = text
        self.text_len = len(text)

    def __iter__(self):
        """
        Yield the tokens of ``self.text`` in order, then an EOF token.

        Positions are reported as a 0-based ``index`` and 1-based ``line``
        and ``column``.

        :raises LexerError: on a character that does not start any token
            (this currently includes ``%``, ``|`` and ``&``)
        """
        # Built here rather than at class level so that defining the class
        # does not require Tokens to exist yet.
        single_char_types = {
            "+": Tokens.PLUS,
            "/": Tokens.SLASH,
            "*": Tokens.STAR,
            "{": Tokens.LBRACE,
            "}": Tokens.RBRACE,
            "(": Tokens.LPAR,
            ")": Tokens.RPAR,
            "[": Tokens.LBRACKET,
            "]": Tokens.RBRACKET,
            ",": Tokens.COMMA,
            ".": Tokens.DOT,
            ";": Tokens.SEMICOLON,
            ":": Tokens.COLON,
            "?": Tokens.QMARK,
        }

        i = 0
        line = 1
        column = 1
        while i < self.text_len:
            c = self.text[i]
            if c in single_char_types:
                yield Token(single_char_types[c], c, i, line, column)
                i += 1
                column += 1
            elif c == "-":
                # A minus sign directly followed by a digit is the sign of a number.
                if i + 1 < self.text_len and self.text[i + 1].isdigit():
                    number = self.eat_number(i)
                    yield Token(Tokens.NUMBER, number, i, line, column)
                    i += len(number)
                    column += len(number)
                else:
                    yield Token(Tokens.MINUS, "-", i, line, column)
                    i += 1
                    column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(i)
                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
                i += len(whitespace)
                column += len(whitespace)
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(i)
                yield Token(Tokens.NEWLINE, newline, i, line, column)
                i += len(newline)
                column = 1
                line += 1
            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(i)
                token_type = Tokens.KEYWORD if identifier in self.KEYWORDS else Tokens.IDENTIFIER
                yield Token(token_type, identifier, i, line, column)
                i += len(identifier)
                column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(i)
                yield Token(Tokens.NUMBER, number, i, line, column)
                i += len(number)
                column += len(number)
            elif c == "'" or c == '"':
                string, newlines = self.eat_string(i)
                yield Token(Tokens.STRING, string, i, line, column)
                i += len(string)
                column = self._column_after_string(string, column)
                line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)

        yield Token(Tokens.EOF, "", i, line, column)

    @staticmethod
    def _column_after_string(string, column):
        """
        Return the column of the character that follows a string lexeme.

        :param string: the lexeme, as returned by ``eat_string``
        :param column: the column at which the lexeme starts
        """
        last_break = max(string.rfind("\n"), string.rfind("\r"))
        if last_break < 0:
            return column + len(string)
        # Only the characters after the last line break sit on the current line.
        return len(string) - last_break

    def eat_whitespace(self, start):
        """
        Return the run of spaces and tabs that begins at ``start``.

        :param start: index of a character already known to be a space or a tab
        """
        end = start + 1
        while end < self.text_len and self.text[end] in " \t":
            end += 1
        return self.text[start:end]

    def eat_newline(self, start):
        """
        Return the line break that begins at ``start``.

        A ``\\r\\n`` or ``\\n\\r`` pair counts as a single line break; any other
        combination yields only the character at ``start``.

        :param start: index of a ``\\n`` or ``\\r`` character
        """
        pair = self.text[start:start + 2]
        if pair in ("\n\r", "\r\n"):
            return pair
        return self.text[start]

    def eat_identifier(self, start):
        """
        Return the identifier that begins at ``start``.

        After the first character, letters, digits, ``_`` and ``-`` are all
        accepted, so ``foo-bar`` is one identifier.

        :param start: index of the first character of the identifier
        """
        end = start + 1
        while end < self.text_len:
            c = self.text[end]
            if not (c.isalpha() or c.isdigit() or c in "_-"):
                break
            end += 1
        return self.text[start:end]

    def eat_number(self, start):
        """
        Return the number lexeme that begins at ``start``.

        The character at ``start`` (a digit or a leading ``-``) is taken as is,
        then every following digit or ``.`` is consumed. The lexeme is not
        validated: ``1.2.3`` and ``1.`` are each returned as a single lexeme.

        :param start: index of the first character of the number
        """
        end = start + 1
        while end < self.text_len:
            c = self.text[end]
            if not (c.isdigit() or c == "."):
                break
            end += 1
        return self.text[start:end]

    def eat_string(self, start):
        """
        Return the quoted string that begins at ``start`` and the number of
        line breaks it contains.

        The string ends at the first unescaped occurrence of its opening
        quote. A backslash escapes the next character, whatever it is, so
        ``\\\\`` is a literal backslash and does not escape what follows.
        An unterminated string extends to the end of the text; no error is
        raised.

        :param start: index of the opening quote (``'`` or ``"``)
        :return: tuple (lexeme including its quotes, number of line breaks)
        """
        quote = self.text[start]
        lines_count = 0
        escape = False
        pending_newline = None  # line-break character seen but not yet counted
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            i += 1

            if pending_newline:
                lines_count += 1
                is_second_half = (c == "\r" or c == "\n") and c != pending_newline
                pending_newline = None
                if is_second_half:
                    # Second character of a "\r\n" or "\n\r" pair: same line break.
                    continue

            if c == "\r" or c == "\n":
                pending_newline = c

            if escape:
                # This character is escaped: it can neither close the string
                # nor start a new escape sequence.
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                break

        if pending_newline:
            # The text ended right after a line break.
            lines_count += 1

        return self.text[start:i], lines_count