250 lines
7.0 KiB
Python
250 lines
7.0 KiB
Python
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass(frozen=True)
class Token:
    """A single lexical token produced by TokenIter.

    Frozen so tokens are immutable, hashable, and safe to share.
    """

    type: str    # token category — one of the Tokens.* constants
    value: str   # the exact source text of the lexeme
    index: int   # 0-based character offset of the token in the source
    line: int    # 1-based line number where the token starts
    column: int  # 1-based column number where the token starts
|
|
|
|
|
|
@dataclass(frozen=True)
class LexerError(Exception):
    """Raised by the lexer when it meets a character it cannot tokenize.

    Carries the full source text and the exact position (index, line,
    column) of the offending character so callers can render a precise
    diagnostic.
    """

    message: str  # human-readable description of the problem
    text: str     # the full source text that was being lexed
    index: int    # 0-based offset of the offending character
    line: int     # 1-based line of the offending character
    column: int   # 1-based column of the offending character

    def __post_init__(self):
        # The dataclass-generated __init__ never calls Exception.__init__,
        # so without this str(e) was "" and the message was lost in
        # tracebacks and logs.  BaseException.__init__ stores `args` at the
        # C level, which bypasses the frozen-dataclass __setattr__ guard.
        super().__init__(self.message)
|
|
|
|
|
|
class Tokens:
    """String constants naming every token category (used as Token.type)."""

    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    # Fixed: was "lbrace", which collided with LBRACE and made '[' tokens
    # indistinguishable from '{' tokens by type.
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
|
|
|
|
|
|
class TokenIter:
    """Iterable that lexes a source string into Token objects.

    Iterating yields one Token per lexeme — including whitespace and
    newline tokens — and always finishes with a single EOF token.
    Raises LexerError at the first unrecognized character.
    """

    KEYWORDS = ("def", "concept", "as", "pre", "post")

    def __init__(self, text):
        self.text = text
        self.text_len = len(text)

    def __iter__(self):
        # Characters that always form a one-character token, mapped to
        # their token type.  Note '%', '|' and '&' are deliberately not
        # here: the original lexer never recognized them, so they still
        # raise LexerError below.
        single_char = {
            "+": Tokens.PLUS,
            "/": Tokens.SLASH,
            "*": Tokens.STAR,
            "{": Tokens.LBRACE,
            "}": Tokens.RBRACE,
            "(": Tokens.LPAR,
            ")": Tokens.RPAR,
            "[": Tokens.LBRACKET,
            "]": Tokens.RBRACKET,
            ",": Tokens.COMMA,
            ".": Tokens.DOT,
            ";": Tokens.SEMICOLON,
            ":": Tokens.COLON,
            "?": Tokens.QMARK,
        }

        i = 0
        line = 1
        column = 1
        while i < self.text_len:
            c = self.text[i]
            if c in single_char:
                yield Token(single_char[c], c, i, line, column)
                i += 1
                column += 1
            elif c == "-":
                # '-' immediately followed by a digit starts a negative
                # number literal; otherwise it is the MINUS operator.
                if i + 1 < self.text_len and self.text[i + 1].isdigit():
                    number = self.eat_number(i)
                    yield Token(Tokens.NUMBER, number, i, line, column)
                    i += len(number)
                    column += len(number)
                else:
                    yield Token(Tokens.MINUS, "-", i, line, column)
                    i += 1
                    column += 1
            elif c == " " or c == "\t":
                whitespace = self.eat_whitespace(i)
                yield Token(Tokens.WHITESPACE, whitespace, i, line, column)
                i += len(whitespace)
                column += len(whitespace)
            elif c == "\n" or c == "\r":
                newline = self.eat_newline(i)
                yield Token(Tokens.NEWLINE, newline, i, line, column)
                i += len(newline)
                column = 1
                line += 1
            elif c.isalpha() or c == "_":
                identifier = self.eat_identifier(i)
                # Renamed from 'type', which shadowed the builtin.
                token_type = (
                    Tokens.KEYWORD
                    if identifier in self.KEYWORDS
                    else Tokens.IDENTIFIER
                )
                yield Token(token_type, identifier, i, line, column)
                i += len(identifier)
                column += len(identifier)
            elif c.isdigit():
                number = self.eat_number(i)
                yield Token(Tokens.NUMBER, number, i, line, column)
                i += len(number)
                column += len(number)
            elif c == "'" or c == '"':
                string, newlines = self.eat_string(i)
                yield Token(Tokens.STRING, string, i, line, column)
                i += len(string)
                # A multi-line string resets the column to 1; otherwise the
                # column advances by the literal's length.
                column = 1 if newlines > 0 else column + len(string)
                line += newlines
            else:
                raise LexerError(f"Unknown token '{c}'", self.text, i, line, column)

        yield Token(Tokens.EOF, "", i, line, column)

    def eat_whitespace(self, start):
        """Return the run of spaces/tabs beginning at *start*."""
        i = start + 1
        while i < self.text_len and self.text[i] in " \t":
            i += 1
        # Slicing avoids the quadratic string += of the original scanner.
        return self.text[start:i]

    def eat_newline(self, start):
        """Return the newline sequence at *start*.

        A CR+LF or LF+CR pair is consumed as one two-character newline;
        any other case consumes a single character.
        """
        if start + 1 == self.text_len:
            return self.text[start]

        # Renamed from 'next', which shadowed the builtin.
        first = self.text[start]
        second = self.text[start + 1]
        if (first == "\n" and second == "\r") or (first == "\r" and second == "\n"):
            return first + second
        return first

    def eat_identifier(self, start):
        """Return the identifier beginning at *start*.

        NOTE(review): '-' is accepted inside identifiers (kebab-case), so
        'a-b' lexes as one identifier rather than a subtraction —
        presumably intentional for this language; confirm.
        """
        i = start + 1
        while i < self.text_len:
            c = self.text[i]
            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
                i += 1
            else:
                break
        return self.text[start:i]

    def eat_number(self, start):
        """Return the number literal (digits and dots) beginning at *start*.

        NOTE(review): multiple dots ('1.2.3') are accepted here —
        presumably validated later by the parser; confirm.
        """
        i = start + 1
        while i < self.text_len and (self.text[i].isdigit() or self.text[i] == "."):
            i += 1
        return self.text[start:i]

    def eat_string(self, start):
        """Return (literal, newline_count) for the string at *start*.

        The literal includes both quote characters.  A backslash escapes
        exactly the next character, so an escaped quote does not terminate
        the string.  (Fixed: the previous implementation never cleared the
        escape flag after a backslash, so a doubled backslash left it
        armed and a following closing quote was wrongly treated as
        escaped.)

        newline_count tallies the line breaks inside the literal, folding
        a CR+LF or LF+CR pair into a single break.

        NOTE(review): an unterminated string returns quietly at end of
        input instead of raising — presumably reported by the parser;
        confirm.
        """
        quote = self.text[start]
        result = self.text[start]
        lines_count = 0

        i = start + 1
        escape = False
        pending_newline = None
        while i < self.text_len:
            c = self.text[i]
            result += c
            i += 1

            # Count line breaks, folding two-character newline pairs into
            # one; a repeated identical newline char starts a new break.
            if pending_newline:
                lines_count += 1
                pending_newline = c if c == pending_newline else None
            elif c == "\r" or c == "\n":
                pending_newline = c

            if escape:
                # This character was escaped: it neither terminates the
                # string nor starts a new escape sequence.
                escape = False
            elif c == "\\":
                escape = True
            elif c == quote:
                break

        if pending_newline:
            lines_count += 1

        return result, lines_count
|