Added ExactConceptParser
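Moves the tokenizer from parsers to core, reworks the parser API so a parser is constructed once and receives (context, text) on every parse() call, renames get_concept_name to get_concept_key, comments out DefaultParser's arithmetic-expression methods, and adds the new ExactConceptParser, which generalizes words to __var__N placeholders and looks the resulting keys up as known concepts.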
@@ -1,5 +1,5 @@
-from dataclasses import dataclass, field
-from parsers.tokenizer import TokenKind, Keywords
+from dataclasses import dataclass
+from core.tokenizer import TokenKind, Keywords


 @dataclass()
@@ -21,13 +21,12 @@ class ErrorNode(Node):


 class BaseParser:
-    def __init__(self, name, text):
+    def __init__(self, name):
         self.name = name
-        self.text = text
         self.has_error = False
         self.error_sink = []

-    def parse(self):
+    def parse(self, context, text):
         pass

     @staticmethod

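For orientation, a minimal runnable sketch of the reworked contract: the parser no longer stores its text; each parse() call receives the context and the text instead. EchoParser is a hypothetical subclass, for illustration only.

    class BaseParser:  # as it stands after this commit
        def __init__(self, name):
            self.name = name
            self.has_error = False
            self.error_sink = []

        def parse(self, context, text):
            pass

    class EchoParser(BaseParser):  # hypothetical
        def __init__(self):
            BaseParser.__init__(self, "EchoParser")

        def parse(self, context, text):
            # a real subclass would tokenize `text` and consult `context`
            return text

    parser = EchoParser()                        # built once
    print(parser.parse(None, "def concept hi"))  # fed text per call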
+102 -94
@@ -1,5 +1,5 @@
 from parsers.BaseParser import BaseParser, Node, NopNode, ErrorNode
-from parsers.tokenizer import Tokenizer, TokenKind, Token, Keywords
+from core.tokenizer import Tokenizer, TokenKind, Token, Keywords
 from dataclasses import dataclass, field
 import logging

@@ -147,24 +152,29 @@ class BinaryNode(DefaultParserNode):


 class DefaultParser(BaseParser):
-    def __init__(self, text, sub_parser):
-        BaseParser.__init__(self, "DefaultParser", text)
+    """
+    Parse sheerka specific grammar (like def concept)
+    """
+    def __init__(self, sub_parser=None):
+        BaseParser.__init__(self, "DefaultParser")
         self.sub_parser = sub_parser
-        self.lexer = Tokenizer(text)
         self.lexer_iter = None
         self._current = None
+        self.context = None
+        self.text = None
+
+    def reset_parser(self, context, text):
+        self.context = context
+        # hack before implementing all the sub parsers
+        if context:
+            self.sub_parser = context.sheerka.parsers[1]
+
+        self.text = text
+        self.lexer_iter = iter(Tokenizer(text))
+        self._current = None

         self.next_token()

     def collect_tokens(self, *args):
         result = []
         for item in args:
             if isinstance(item, Node):
                 result.extend(item.tokens)
             else:
                 result.append(item)
         return result

     def add_error(self, error, next_token=True):
         self.has_error = True
         self.error_sink.append(error)
@@ -186,21 +191,23 @@ class DefaultParser(BaseParser):
         return

     @staticmethod
-    def get_concept_name(tokens, variables=None):
-        name = ""
+    def get_concept_key(tokens, variables=None):
+        key = ""
         first = True
         for token in tokens:
             if token.type == TokenKind.EOF:
                 break
             if token.type == TokenKind.WHITESPACE:
                 continue
             if not first:
-                name += " "
+                key += " "
             if variables is not None and token.value in variables:
-                name += "__var__" + str(variables.index(token.value))
+                key += "__var__" + str(variables.index(token.value))
             else:
-                name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
+                key += token.value[1:-1] if token.type == TokenKind.STRING else token.value
             first = False

-        return name
+        return key

     @staticmethod
     def fix_indentation(tokens):
@@ -242,7 +249,8 @@ class DefaultParser(BaseParser):

         return tokens[4:]

-    def parse(self):
+    def parse(self, context, text):
+        self.reset_parser(context, text)
         return self.parse_statement()

     def parse_statement(self):
@@ -277,7 +285,7 @@ class DefaultParser(BaseParser):
             name_as_tokens.append(token)
             self.next_token()
             token = self.get_token()
-        name = self.get_concept_name(name_as_tokens)
+        name = self.get_concept_key(name_as_tokens)
         tokens_found["name"] = name_as_tokens

         # try to parse as, where, pre and post declarations
@@ -328,8 +336,8 @@ class DefaultParser(BaseParser):

             # start = current_tokens[0].index
             # end = current_tokens[-1].index + len(current_tokens[-1].value)
-            sub_parser = self.sub_parser(current_tokens, source=keyword.value)
-            sub_tree = sub_parser.parse()
+            sub_parser = self.sub_parser(source=keyword.value)
+            sub_tree = sub_parser.parse(self.context, current_tokens)
             if isinstance(sub_tree, ErrorNode):
                 self.add_error(sub_tree, False)
             asts[keyword] = sub_tree
@@ -344,74 +352,74 @@ class DefaultParser(BaseParser):
         log.debug(f"Found DefConcept node '{def_concept_node}'")
         return def_concept_node

-    def parse_expression(self):
-        return self.parse_addition()
-
-    def parse_addition(self):
-        left = self.parse_multiply()
-        token = self.get_token()
-        if token is None or token.type == TokenKind.EOF:
-            return left
-
-        if token.type == TokenKind.NUMBER:  # example 15 +5 or 15 -5
-            right = self.parse_addition()
-            return BinaryNode(self.collect_tokens(left, token, right), TokenKind.PLUS, left, right)
-
-        if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
-            return left
-
-        self.next_token()
-        right = self.parse_addition()
-        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
-
-    def parse_multiply(self):
-        left = self.parse_atom()
-        token = self.get_token()
-        if token is None or token.type == TokenKind.EOF:
-            return left
-
-        if token.type not in (TokenKind.STAR, TokenKind.SLASH):
-            return left
-
-        self.next_token()
-        right = self.parse_multiply()
-        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
-
-    def parse_atom(self):
-        token = self.get_token()
-        if token.type == TokenKind.NUMBER:
-            self.next_token()
-            return NumberNode([token], float(token.value) if '.' in token.value else int(token.value))
-        elif token.type == TokenKind.STRING:
-            self.next_token()
-            return StringNode([token], token.value[1:-1], token.value[0])
-        elif token.type == TokenKind.IDENTIFIER:
-            if token.value == "true":
-                self.next_token()
-                return TrueNode([token])
-            elif token.value == "false":
-                self.next_token()
-                return FalseNode([token])
-            elif token.value == "null":
-                self.next_token()
-                return NullNode([token])
-            else:
-                self.next_token()
-                return VariableNode([token], token.value)
-        elif token.type == TokenKind.LPAR:
-            self.next_token()
-            exp = self.parse_expression()
-            token = self.get_token()
-            self.next_token()
-
-            if token.type != TokenKind.RPAR:
-                error = UnexpectedTokenErrorNode([token], "Right parenthesis not found.", [TokenKind.RPAR])
-                self.add_error(error)
-                return error
-
-            return exp
-        else:
-            error = UnexpectedTokenErrorNode([token], "Unexpected token",
-                                             [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER, "true", "false",
-                                              "null", TokenKind.LPAR])
-            return self.add_error(error)
+    # def parse_expression(self):
+    #     return self.parse_addition()
+    #
+    # def parse_addition(self):
+    #     left = self.parse_multiply()
+    #     token = self.get_token()
+    #     if token is None or token.type == TokenKind.EOF:
+    #         return left
+    #
+    #     if token.type == TokenKind.NUMBER:  # example 15 +5 or 15 -5
+    #         right = self.parse_addition()
+    #         return BinaryNode(self.collect_tokens(left, token, right), TokenKind.PLUS, left, right)
+    #
+    #     if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
+    #         return left
+    #
+    #     self.next_token()
+    #     right = self.parse_addition()
+    #     return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
+    #
+    # def parse_multiply(self):
+    #     left = self.parse_atom()
+    #     token = self.get_token()
+    #     if token is None or token.type == TokenKind.EOF:
+    #         return left
+    #
+    #     if token.type not in (TokenKind.STAR, TokenKind.SLASH):
+    #         return left
+    #
+    #     self.next_token()
+    #     right = self.parse_multiply()
+    #     return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
+    #
+    # def parse_atom(self):
+    #     token = self.get_token()
+    #     if token.type == TokenKind.NUMBER:
+    #         self.next_token()
+    #         return NumberNode([token], float(token.value) if '.' in token.value else int(token.value))
+    #     elif token.type == TokenKind.STRING:
+    #         self.next_token()
+    #         return StringNode([token], token.value[1:-1], token.value[0])
+    #     elif token.type == TokenKind.IDENTIFIER:
+    #         if token.value == "true":
+    #             self.next_token()
+    #             return TrueNode([token])
+    #         elif token.value == "false":
+    #             self.next_token()
+    #             return FalseNode([token])
+    #         elif token.value == "null":
+    #             self.next_token()
+    #             return NullNode([token])
+    #         else:
+    #             self.next_token()
+    #             return VariableNode([token], token.value)
+    #     elif token.type == TokenKind.LPAR:
+    #         self.next_token()
+    #         exp = self.parse_expression()
+    #         token = self.get_token()
+    #         self.next_token()
+    #
+    #         if token.type != TokenKind.RPAR:
+    #             error = UnexpectedTokenErrorNode([token], "Right parenthesis not found.", [TokenKind.RPAR])
+    #             self.add_error(error)
+    #             return error
+    #
+    #         return exp
+    #     else:
+    #         error = UnexpectedTokenErrorNode([token], "Unexpected token",
+    #                                          [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER, "true", "false",
+    #                                           "null", TokenKind.LPAR])
+    #         return self.add_error(error)

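To make the get_concept_name -> get_concept_key change above concrete, here is a simplified, runnable paraphrase of the key-building logic, with the token stream reduced to plain words (the real method walks Token objects and skips WHITESPACE and EOF):

    def concept_key(words, variables=None):
        parts = []
        for word in words:
            if variables is not None and word in variables:
                parts.append("__var__" + str(variables.index(word)))
            else:
                parts.append(word)
        return " ".join(parts)

    print(concept_key(["set", "volume", "to", "11"], variables=["11"]))
    # -> set volume to __var__0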
@@ -0,0 +1,107 @@
+from core.sheerka import ReturnValue
+from parsers.BaseParser import BaseParser
+from core.tokenizer import Tokenizer, Keywords, TokenKind
+from core.concept import Concept
+
+
+class ExactConceptParser(BaseParser):
+    """
+    Tries to recognize a single concept
+    """
+
+    MAX_WORDS_SIZE = 10
+
+    def __init__(self):
+        BaseParser.__init__(self, "ConceptParser")
+
+    def parse(self, context, text):
+        res = []
+        sheerka = context.sheerka
+        words = self.get_words(text)
+        if len(words) > self.MAX_WORDS_SIZE:
+            return ReturnValue(self.name, False, sheerka.new(sheerka.CONCEPT_TOO_LONG_CONCEPT_NAME))
+
+        recognized = False
+        for combination in self.combinations(words):
+            concept_key = " ".join(combination)
+
+            # Very important question to think about later:
+            # must we return a new instance or the existing one?
+            # That will depend on the context.
+            # Let's return a new one for now and see if it works.
+            concept = sheerka.new(concept_key)
+            if not sheerka.isinstance(concept, sheerka.UNKNOWN_CONCEPT_NAME):
+                # update the properties if needed
+                for i, token in enumerate(combination):
+                    if token.startswith(Concept.PROPERTY_PREFIX):
+                        index = int(token[len(Concept.PROPERTY_PREFIX):])
+                        concept.set_prop_by_index(index, words[i])
+                res.append(ReturnValue(self.name, True, concept))
+                recognized = True
+
+        if recognized:
+            return res
+
+        return ReturnValue(self.name, False, sheerka.new(sheerka.UNKNOWN_CONCEPT_NAME, body=text))
+
+    @staticmethod
+    def get_words(text):
+        res = []
+        for t in iter(Tokenizer(text)):
+            if t.type == TokenKind.EOF:
+                break
+            if t.type == TokenKind.NEWLINE or t.type == TokenKind.WHITESPACE:
+                continue
+            res.append(t.value.value if isinstance(t.value, Keywords) else t.value)
+        return res
+
+    def combinations(self, iterable):
+        # combinations('foo', 'bar', 'baz') -->
+        #   ('foo', 'bar', 'baz'),
+        #   ('__var__0', 'bar', 'baz'),
+        #   ('foo', '__var__0', 'baz'),
+        #   ('foo', 'bar', '__var__0'),
+        #   ('__var__0', '__var__1', 'baz'),
+        #   ('__var__0', 'bar', '__var__1'),
+        #   ('foo', '__var__0', '__var__1'),
+        #   ('__var__0', '__var__1', '__var__2')
+
+        pool = tuple(iterable)
+        n = len(pool)
+
+        res = set()
+
+        for r in range(0, n + 1):
+            indices = list(range(r))
+            res.add(self.get_tuple(pool, indices))
+            while True:
+                for i in reversed(range(r)):
+                    if indices[i] != i + n - r:
+                        break
+                else:
+                    break
+                indices[i] += 1
+                for j in range(i + 1, r):
+                    indices[j] = indices[j - 1] + 1
+                res.add(self.get_tuple(pool, indices))
+
+        return res
+
+    @staticmethod
+    def get_tuple(pool, indices):
+        res = []
+        vars = {}
+        k = 0
+
+        # init vars
+        for i in indices:
+            value = pool[i]
+            if value not in vars:
+                vars[pool[i]] = f"{Concept.PROPERTY_PREFIX}{k}"
+                k += 1
+
+        # create tuple
+        for i in range(len(pool)):
+            value = pool[i]
+            res.append(vars[value] if value in vars else value)
+        return tuple(res)

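The combinations/get_tuple pair above hand-rolls the standard index-combination algorithm. A runnable equivalent using itertools, assuming Concept.PROPERTY_PREFIX is the "__var__" prefix seen elsewhere in the diff:

    from itertools import combinations

    def variable_combinations(words):
        res = set()
        for r in range(len(words) + 1):
            for indices in combinations(range(len(words)), r):
                mapping = {}  # word value -> __var__N placeholder
                for i in indices:
                    if words[i] not in mapping:
                        mapping[words[i]] = f"__var__{len(mapping)}"
                res.add(tuple(mapping.get(w, w) for w in words))
        return res

    for key in sorted(variable_combinations(["foo", "bar", "baz"])):
        print(" ".join(key))
    # prints the eight variants listed in the comment, from
    # 'foo bar baz' through '__var__0 __var__1 __var__2'

Note that substitution is by word value, not position: if the same word occurs twice, selecting either occurrence turns both occurrences into the same placeholder.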
+18 -13
@@ -26,36 +26,41 @@ class PythonNode(Node):


 class PythonParser(BaseParser):
-    def __init__(self, text, source="<undef>"):
-        text = text if isinstance(text, str) else self.get_text_from_tokens(text)
-        text = text.strip()
-        BaseParser.__init__(self, "PythonParser", text)
+    """
+    Parse Python scripts
+    """
+    def __init__(self, source="<undef>"):
+
+        BaseParser.__init__(self, "PythonParser")
         self.source = source

-    def parse(self):
+    def parse(self, context, text):
+        text = text if isinstance(text, str) else self.get_text_from_tokens(text)
+        text = text.strip()
+
         # first, try to parse an expression
-        res, tree, error = self.try_parse_expression()
+        res, tree, error = self.try_parse_expression(text)
         if not res:
             # then try to parse a statement
-            res, tree, error = self.try_parse_statement()
+            res, tree, error = self.try_parse_statement(text)
             if not res:
                 self.has_error = True
-                error_node = PythonErrorNode(self.text, error)
+                error_node = PythonErrorNode(text, error)
                 self.error_sink.append(error_node)
                 return error_node

         log.debug("Recognized python code.")
-        return PythonNode(self.text, tree)
+        return PythonNode(text, tree)

-    def try_parse_expression(self):
+    def try_parse_expression(self, text):
         try:
-            return True, ast.parse(self.text, f"<{self.source}>", 'eval'), None
+            return True, ast.parse(text, f"<{self.source}>", 'eval'), None
         except Exception as error:
             return False, None, error

-    def try_parse_statement(self):
+    def try_parse_statement(self, text):
         try:
-            return True, ast.parse(self.text, f"<{self.source}>", 'exec'), None
+            return True, ast.parse(text, f"<{self.source}>", 'exec'), None
         except Exception as error:
             return False, None, error

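PythonParser's fallback order can be reproduced with nothing but the stdlib ast module: try mode 'eval' (expression) first, then 'exec' (statement). A minimal sketch (the real code catches any Exception, not just SyntaxError):

    import ast

    def try_parse(text, source="<undef>"):
        last_error = None
        for mode in ("eval", "exec"):  # expression first, then statement
            try:
                return ast.parse(text, f"<{source}>", mode), None
            except SyntaxError as error:
                last_error = error
        return None, last_error

    tree, err = try_parse("x = 1")  # not an expression, parses as a statement
    print(type(tree).__name__)      # -> Module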
@@ -1,297 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-class TokenKind(Enum):
-    EOF = "eof"
-    WHITESPACE = "whitespace"
-    NEWLINE = "newline"
-    KEYWORD = "keyword"
-    IDENTIFIER = "identifier"
-    STRING = "string"
-    NUMBER = "number"
-    TRUE = "true"
-    FALSE = "false"
-    LPAR = "lpar"
-    RPAR = "rpar"
-    LBRACKET = "lbrace"
-    RBRACKET = "rbracket"
-    LBRACE = "lbrace"
-    RBRACE = "rbrace"
-    PLUS = "plus"
-    MINUS = "minus"
-    STAR = "star"
-    SLASH = "slash"
-    PERCENT = "percent"
-    COMMA = "comma"
-    SEMICOLON = "semicolon"
-    COLON = "colon"
-    DOT = "dot"
-    QMARK = "qmark"
-    VBAR = "vbar"
-    AMPER = "amper"
-    EQUALS = "="
-
-
-@dataclass()
-class Token:
-    type: TokenKind
-    value: object
-    index: int
-    line: int
-    column: int
-
-
-@dataclass()
-class LexerError(Exception):
-    message: str
-    text: str
-    index: int
-    line: int
-    column: int
-
-
-class Keywords(Enum):
-    DEF = "def"
-    CONCEPT = "concept"
-    AS = "as"
-    WHERE = "where"
-    PRE = "pre"
-    POST = "post"
-
-
-class Tokenizer:
-    """
-    Class that can iterate on the tokens
-    """
-
-    KEYWORDS = set(x.value for x in Keywords)
-
-    def __init__(self, text):
-        self.text = text
-        self.text_len = len(text)
-        self.column = 1
-        self.line = 1
-        self.i = 0
-
-    def __iter__(self):
-
-        while self.i < self.text_len:
-            c = self.text[self.i]
-            if c == "+":
-                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
-                    number = self.eat_number(self.i)
-                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
-                    self.i += len(number)
-                    self.column += len(number)
-                else:
-                    yield Token(TokenKind.PLUS, "+", self.i, self.line, self.column)
-                    self.i += 1
-                    self.column += 1
-            elif c == "-":
-                if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit():
-                    number = self.eat_number(self.i)
-                    yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
-                    self.i += len(number)
-                    self.column += len(number)
-                else:
-                    yield Token(TokenKind.MINUS, "-", self.i, self.line, self.column)
-                    self.i += 1
-                    self.column += 1
-            elif c == "/":
-                yield Token(TokenKind.SLASH, "/", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "*":
-                yield Token(TokenKind.STAR, "*", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "{":
-                yield Token(TokenKind.LBRACE, "{", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "}":
-                yield Token(TokenKind.RBRACE, "}", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "(":
-                yield Token(TokenKind.LPAR, "(", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == ")":
-                yield Token(TokenKind.RPAR, ")", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "[":
-                yield Token(TokenKind.LBRACKET, "[", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "]":
-                yield Token(TokenKind.RBRACKET, "]", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "=":
-                yield Token(TokenKind.EQUALS, "=", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == " " or c == "\t":
-                whitespace = self.eat_whitespace(self.i)
-                yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column)
-                self.i += len(whitespace)
-                self.column += len(whitespace)
-            elif c == ",":
-                yield Token(TokenKind.COMMA, ",", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == ".":
-                yield Token(TokenKind.DOT, ".", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == ";":
-                yield Token(TokenKind.SEMICOLON, ";", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == ":":
-                yield Token(TokenKind.COLON, ":", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "?":
-                yield Token(TokenKind.QMARK, "?", self.i, self.line, self.column)
-                self.i += 1
-                self.column += 1
-            elif c == "\n" or c == "\r":
-                newline = self.eat_newline(self.i)
-                yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column)
-                self.i += len(newline)
-                self.column = 1
-                self.line += 1
-            elif c.isalpha() or c == "_":
-                identifier = self.eat_identifier(self.i)
-                token_type = TokenKind.KEYWORD if identifier in self.KEYWORDS else TokenKind.IDENTIFIER
-                value = Keywords(identifier) if identifier in self.KEYWORDS else identifier
-                yield Token(token_type, value, self.i, self.line, self.column)
-                self.i += len(identifier)
-                self.column += len(identifier)
-            elif c.isdigit():
-                number = self.eat_number(self.i)
-                yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
-                self.i += len(number)
-                self.column += len(number)
-            elif c == "'" or c == '"':
-                string, newlines = self.eat_string(self.i, self.line, self.column)
-                yield Token(TokenKind.STRING, string, self.i, self.line, self.column)  # quotes are kept
-                self.i += len(string)
-                self.column = 1 if newlines > 0 else self.column + len(string)
-                self.line += newlines
-            else:
-                raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column)
-
-        yield Token(TokenKind.EOF, "", self.i, self.line, self.column)
-
-    def eat_whitespace(self, start):
-        result = self.text[start]
-        i = start + 1
-        while i < self.text_len:
-            c = self.text[i]
-            if c == " " or c == "\t":
-                result += c
-                i += 1
-            else:
-                break
-
-        return result
-
-    def eat_newline(self, start):
-        if start + 1 == self.text_len:
-            return self.text[start]
-
-        current = self.text[start]
-        next = self.text[start + 1]
-        if current == "\n" and next == "\r" or current == "\r" and next == "\n":
-            return current + next
-
-        return current
-
-    def eat_identifier(self, start):
-        result = self.text[start]
-        i = start + 1
-        while i < self.text_len:
-            c = self.text[i]
-            if c.isalpha() or c == "_" or c == "-" or c.isdigit():
-                result += c
-                i += 1
-            else:
-                break
-
-        return result
-
-    def eat_number(self, start):
-        result = self.text[start]
-        i = start + 1
-        while i < self.text_len:
-            c = self.text[i]
-            if c.isdigit() or c == ".":
-                result += c
-                i += 1
-            else:
-                break
-
-        return result
-
-    def eat_string(self, start_index, start_line, start_column):
-        quote = self.text[start_index]
-        result = self.text[start_index]
-        lines_count = 0
-
-        i = start_index + 1
-        escape = False
-        newline = None
-        while i < self.text_len:
-            c = self.text[i]
-            result += c
-            i += 1
-
-            if newline:
-                lines_count += 1
-                newline = c if c == newline else None
-            else:
-                if c == "\r" or c == "\n":
-                    newline = c
-
-            if c == "\\":
-                escape = True
-            elif c == quote and not escape:
-                break
-            else:
-                escape = False
-
-        # add trailing new line if needed
-        if newline:
-            lines_count += 1
-
-        if result[-1] != quote:
-            raise LexerError("Missing Trailing quote", result, i, start_line + lines_count,
-                             1 if lines_count > 0 else start_column + len(result))
-
-        return result, lines_count
-
-    def seek(self, words):
-        if self.i == self.text_len:
-            return 0
-
-        # init
-        offsets = {}
-        start_index = self.i
-
-        buffer = ""
-        while self.i < self.text_len:
-            c = self.text[self.i]
-
-            # skip white space
-            if c in (" ", "\t"):
-                self.i += 1
-                continue
-
-            for word in words:
-                if c == word[offset]:
-                    os