Added ExactConceptParser

This commit is contained in:
2019-11-09 17:29:50 +01:00
parent a636198222
commit 576ce77740
12 changed files with 603 additions and 169 deletions
+76 -5
View File
@@ -2,6 +2,8 @@ import hashlib
from enum import Enum
import logging
from core.tokenizer import Tokenizer, TokenKind
log = logging.getLogger(__name__)
@@ -20,6 +22,8 @@ class Concept:
"""
props_to_serialize = ("id", "is_builtin", "name", "where", "pre", "post", "body", "desc")
PROPERTY_PREFIX = "__var__"
def __init__(self, name=None, is_builtin=False, where=None, pre=None, post=None, body=None, desc=None, key=None):
self.name = name
self.is_builtin = is_builtin
@@ -31,7 +35,7 @@ class Concept:
self.id = None
self.key = key
self.props = [] # list of Property for this concept
self.props = {} # list of Property for this concept
self.functions = {} # list of helper functions
self.codes = {} # cached ast for the where, pre, post and body parts
@@ -54,10 +58,48 @@ class Concept:
def get_key(self):
    """Return this concept's key (None until init_key() has been called)."""
    return self.key
def init_key(self, tokens=None):
    """
    Create the key for this concept.

    Must be called only when the concept is fully initialized.
    The method is not called set_key to make sure that no other class
    sets the key by mistake.

    The key is the concept name with whitespace collapsed, string quotes
    stripped, and every property name replaced by PROPERTY_PREFIX plus
    its positional index (e.g. "__var__0").

    Fix: the early-return path used to return self.key (a string) while
    the normal path returns self; both now return self so the method is
    consistently fluent.

    :param tokens: optional iterable of Tokens; defaults to tokenizing self.name
    :return: self
    """
    if self.key is not None:
        # Key already computed: nothing to do.
        return self
    if tokens is None:
        tokens = iter(Tokenizer(self.name))
    # Property order is the insertion order of self.props.
    variables = list(self.props.keys())
    parts = []
    for token in tokens:
        if token.type == TokenKind.EOF:
            break
        if token.type == TokenKind.WHITESPACE:
            continue
        if token.value in variables:
            # Replace a property name by its positional placeholder.
            parts.append(self.PROPERTY_PREFIX + str(variables.index(token.value)))
        elif token.type == TokenKind.STRING:
            # Strip the surrounding quotes from string literals.
            parts.append(token.value[1:-1])
        else:
            # NOTE(review): keyword tokens carry a Keywords enum as value —
            # joining would fail on them; confirm concept names never
            # contain reserved words.
            parts.append(token.value)
    self.key = " ".join(parts)
    return self
def add_codes(self, codes):
"""
From a dict of <ConceptParts, AST>
fill the codes
Gets the ASTs for 'where', 'pre', 'post' and 'body'
These ASTs are known when the concept is freshly parsed.
So the values are kept in cache.
For concepts loaded from sdp, these ASTs must be created again
:param codes:
:return:
"""
@@ -68,6 +110,8 @@ class Concept:
if key in possibles_codes:
self.codes[ConceptParts(key)] = codes[key]
return self
def get_digest(self):
"""
Returns the digest of the event
@@ -76,23 +120,47 @@ class Concept:
return hashlib.sha256(f"Concept:{self.name}{self.pre}{self.post}{self.body}".encode("utf-8")).hexdigest()
def to_dict(self):
    """
    Returns a dict representing 'self'.

    Serializable attributes are copied as-is; the props dict is flattened
    to a list of (name, value) pairs so it round-trips with from_dict().

    Fix: removed the stale pre-refactor line that iterated self.props
    (now a dict) as if it held Property objects — it raised
    AttributeError on the key strings before being overwritten anyway.

    :return: dict of serializable attributes plus a "props" pair list
    """
    props_as_dict = dict((prop, getattr(self, prop)) for prop in self.props_to_serialize)
    props_as_dict["props"] = [(name, prop.value) for name, prop in self.props.items()]
    return props_as_dict
def from_dict(self, as_dict):
    """
    Initializes 'self' from a dict produced by to_dict().

    Fix: removed the stale pre-refactor line that called
    self.props.append(...) — props is now a dict, so only the
    set_prop(n, v) path is valid.

    :param as_dict: dict with serializable attributes and optionally a
                    "props" list of (name, value) pairs
    :return: self
    """
    for prop in self.props_to_serialize:
        if prop in as_dict:
            setattr(self, prop, as_dict[prop])
    if "props" in as_dict:
        for n, v in as_dict["props"]:
            self.set_prop(n, v)
    return self
def update_from(self, other):
    """
    Copy every serializable attribute of *other* onto self.

    Mimics the class-to-instance pattern: *other* plays the class (the
    template) and *self* a freshly created instance of it.

    :param other: the template concept to copy from
    :return: self
    """
    for attr_name in self.props_to_serialize:
        template_value = getattr(other, attr_name)
        setattr(self, attr_name, template_value)
    return self
def set_prop(self, prop_name, prop_value):
    """Create or overwrite the Property named *prop_name* with *prop_value*."""
    self.props[prop_name] = Property(prop_name, prop_value)
def set_prop_by_index(self, index, prop_value):
    """
    Set the value of the property at position *index*.

    Positions follow the insertion order of the props dict, which is the
    order of appearance in the concept name (the same order used for the
    __var__N placeholders).
    :param index: 0-based property position; raises IndexError if out of range
    :param prop_value: new value for the property
    """
    prop_name = list(self.props.keys())[index]
    self.props[prop_name] = Property(prop_name, prop_value)
class ErrorConcept(Concept):
NAME = "Error"
@@ -132,3 +200,6 @@ class Property:
def __init__(self, name, value):
    """Create a named property holding *value* (may be None until bound)."""
    self.name = name
    self.value = value
def __repr__(self):
    # Debug-friendly "name=value" form.
    return f"{self.name}={self.value}"
+42 -20
View File
@@ -1,9 +1,9 @@
from dataclasses import dataclass
from core.concept import Concept, ErrorConcept, Property, TooManySuccessConcept, ReturnValueConcept
from parsers.PythonParser import PythonParser, PythonGetNamesVisitor, PythonNode
from parsers.PythonParser import PythonGetNamesVisitor, PythonNode
from sdp.sheerkaDataProvider import SheerkaDataProvider, Event, SheerkaDataProviderDuplicateKeyError
from parsers.DefaultParser import DefaultParser, DefConceptNode
from parsers.DefaultParser import DefConceptNode, DefaultParser
import core.utils
import logging
@@ -50,6 +50,7 @@ class Sheerka(Concept):
NAME = "Sheerka"
UNKNOWN_CONCEPT_NAME = "Unknown Concept"
SUCCESS_CONCEPT_NAME = "Success"
CONCEPT_TOO_LONG_CONCEPT_NAME = "Concept too long"
CONCEPTS_ENTRY = "All_Concepts"
BUILTIN_CONCEPTS_KEYS = "Builtins_Concepts"
@@ -60,6 +61,8 @@ class Sheerka(Concept):
super().__init__(Sheerka.NAME)
# cache of the most used concepts
# Note that these are only templates
# They are used as a footprint for instantiation
self.concepts_cache = {}
# a concept can be instantiated
@@ -91,8 +94,9 @@ class Sheerka(Concept):
try:
self.init_logging()
self.sdp = SheerkaDataProvider(root_folder)
self.parsers.append(lambda text: DefaultParser(text, PythonParser))
self.parsers.append(lambda text: PythonParser(text))
self.parsers.append(core.utils.get_class("parsers.DefaultParser.DefaultParser"))
self.parsers.append(core.utils.get_class("parsers.PythonParser.PythonParser"))
#self.parsers.append(core.utils.get_class("parsers.ExactConceptParser.ExactConceptParser"))
self.evaluators.append(core.utils.get_object("evaluators.DefaultEvaluator.DefaultEvaluator"))
self.evaluators.append(core.utils.get_object("evaluators.AddConceptEvaluator.AddConceptEvaluator"))
@@ -103,7 +107,7 @@ class Sheerka(Concept):
self.create_builtin_concepts()
except IOError as e:
return ReturnValue(self, False, self.get(Sheerka.ERROR_CONCEPT_NAME), e)
return ReturnValue(self, False, self.get(ErrorConcept.NAME), e)
return ReturnValue(self, True, self.get(Sheerka.SUCCESS_CONCEPT_NAME))
@@ -129,12 +133,15 @@ class Sheerka(Concept):
self,
Concept(Sheerka.UNKNOWN_CONCEPT_NAME, key=Sheerka.UNKNOWN_CONCEPT_NAME),
Concept(Sheerka.SUCCESS_CONCEPT_NAME, key=Sheerka.SUCCESS_CONCEPT_NAME),
Concept(Sheerka.CONCEPT_TOO_LONG_CONCEPT_NAME, key=Sheerka.CONCEPT_TOO_LONG_CONCEPT_NAME),
ErrorConcept(),
TooManySuccessConcept(),
ReturnValueConcept(),
]
for concept in builtins:
self.add_in_cache(concept)
from_db = self.sdp.get_safe(self.CONCEPTS_ENTRY, concept.key)
if from_db is None:
log.debug(f"'{concept.name}' concept is not found. Adding.")
@@ -143,7 +150,6 @@ class Sheerka(Concept):
else:
log.debug(f"Found concept '{from_db}'. Updating.")
concept.update_from(from_db)
self.concepts_cache[concept.key] = concept
def init_logging(self):
if self.debug:
@@ -158,7 +164,7 @@ class Sheerka(Concept):
def eval(self, text):
evt_digest = self.sdp.save_event(Event(text))
exec_context = ExecutionContext(self, evt_digest)
return_values = self.try_parse(text)
return_values = self.try_parse(exec_context, text)
return_values = self.try_eval(exec_context, return_values)
# return_values = []
@@ -172,17 +178,17 @@ class Sheerka(Concept):
return return_values
def try_parse(self, context, text):
    """
    Run *text* through every registered parser.

    Each parser factory is instantiated fresh; a ReturnValue is collected
    per parser, carrying either the parse tree or the parser's error sink
    when the parse failed.

    Fix: removed the superseded old signature line and the commented-out
    try/except experiment left over from the refactor.

    :param context: execution context passed through to each parser
    :param text: the raw event text to parse
    :return: list of ReturnValue, one per parser
    """
    result = []
    log.debug(f"Parsing '{text}'")
    for parser in self.parsers:
        p = parser()
        tree = p.parse(context, text)
        result.append(ReturnValue(p.name, not p.has_error, p.error_sink if p.has_error else tree))
    return result
@@ -235,11 +241,12 @@ class Sheerka(Concept):
setattr(concept, prop, source)
# try to find variables (eg props)
# Note that with this method, the variables will be created in the order of appearance
for token in def_concept_node.tokens["name"]:
if token.value in get_names_visitor.names:
concept.props.append(Property(token.value, None))
concept.set_prop(token.value, None)
concept.key = DefaultParser.get_concept_name(def_concept_node.tokens["name"], [p.name for p in concept.props])
concept.init_key(def_concept_node.tokens["name"])
concept.add_codes(def_concept_node.get_codes())
self.set_id_if_needed(concept, False)
@@ -249,22 +256,34 @@ class Sheerka(Concept):
return ReturnValue(self.add_concept.__name__, False, ErrorConcept(body=error), error.args[0])
return ReturnValue(self.add_concept.__name__, True, concept)
def get(self, concept_name):
def add_in_cache(self, concept):
    """
    Adds a concept template in cache.
    The cache is used as a proxy before looking at sdp.
    The concept is indexed by its key, so init_key() must have run first.
    :param concept: the concept template to cache
    :return: None
    """
    self.concepts_cache[concept.key] = concept
def get(self, concept_key):
    """
    Tries to find a concept by its key.

    Lookup order: in-memory cache, then the sdp store, then a fresh
    'Unknown Concept' instance carrying the key as its body.

    Fix: removed the stale pre-rename lines still referencing the old
    'concept_name' parameter, which no longer exists.

    TODO: how to manage single vs multiple instances
    :param concept_key:
    :return: a Concept (never None)
    """
    # first search in cache
    if concept_key in self.concepts_cache:
        return self.concepts_cache[concept_key]
    return self.sdp.get_safe(self.CONCEPTS_ENTRY, concept_key) or \
        self.new(self.UNKNOWN_CONCEPT_NAME, body=concept_key)
def new(self, concept, **kwargs):
"""
Returns an instance of a new concept
TODO: Checks if the concept is supposed to be unique (ex Sheerka, or the number 'one' for example)
:param concept:
:param kwargs:
:return:
@@ -287,11 +306,14 @@ class Sheerka(Concept):
:return:
"""
if not isinstance(a, Concept) or not isinstance(b, Concept):
return False
if not isinstance(a, Concept):
raise SyntaxError("The first parameter of isinstance MUST be a concept")
b_key = b if isinstance(b, str) else b.key
# TODO : manage when a is the list of all possible b
return a.key == b.key
# for example, if a is a color, it will be found the entry 'All_Colors'
return a.key == b_key
@staticmethod
def test():
+319
View File
@@ -0,0 +1,319 @@
from dataclasses import dataclass
from enum import Enum
class TokenKind(Enum):
    """
    All token categories the Tokenizer can emit.

    Values are short tags, mostly useful for debugging/serialization.
    NOTE(review): EQUALS uses "=" and HASH uses "HASH" while every other
    member uses a lowercase word; several kinds (GREATER, LESS, HASH,
    TILDE, ...) have no branch in Tokenizer.__iter__ yet, so those
    characters currently raise LexerError — confirm before normalizing.
    """
    EOF = "eof"
    WHITESPACE = "whitespace"
    NEWLINE = "newline"
    KEYWORD = "keyword"
    IDENTIFIER = "identifier"
    STRING = "string"
    NUMBER = "number"
    TRUE = "true"
    FALSE = "false"
    LPAR = "lpar"
    RPAR = "rpar"
    # BUG FIX: LBRACKET was "lbrace", duplicating LBRACE's value. Enum
    # members with equal values collapse into aliases, so '[' and '{'
    # tokens were indistinguishable by type.
    LBRACKET = "lbracket"
    RBRACKET = "rbracket"
    LBRACE = "lbrace"
    RBRACE = "rbrace"
    PLUS = "plus"
    MINUS = "minus"
    STAR = "star"
    SLASH = "slash"
    PERCENT = "percent"
    COMMA = "comma"
    SEMICOLON = "semicolon"
    COLON = "colon"
    DOT = "dot"
    QMARK = "qmark"
    VBAR = "vbar"
    AMPER = "amper"
    EQUALS = "="
    AT = "at"
    BACK_QUOTE = "bquote"  # `
    BACK_SLASH = "bslash"  # \
    CARAT = "carat"  # ^
    DOLLAR = "dollar"  # $
    EMARK = "emark"  # !
    GREATER = "greater"  # >
    LESS = "less"  # <
    HASH = "HASH"  # #
    TILDE = "tilde"  # ~
    UNDERSCORE = "underscore"  # _
    DEGREE = "degree"  # °
@dataclass()
class Token:
    """A single lexical token with its position in the source text."""
    type: TokenKind  # category of the token
    value: object    # payload: the matched text, or a Keywords member for keywords
    index: int       # absolute character offset in the input
    line: int        # 1-based line where the token starts
    column: int      # 1-based column where the token starts
@dataclass()
class LexerError(Exception):
    """
    Raised by the Tokenizer on an unknown character or an unterminated string.

    NOTE(review): combining @dataclass with Exception leaves Exception.args
    empty, so str(e) will not show these fields — confirm that is intended.
    """
    message: str  # human-readable description of the failure
    text: str     # the offending input (full text, or the partial literal)
    index: int    # character offset where lexing failed
    line: int     # 1-based line of the failure
    column: int   # 1-based column of the failure
class Keywords(Enum):
    """Reserved words recognized by the Tokenizer; values are the literal spellings."""
    DEF = "def"
    CONCEPT = "concept"
    AS = "as"
    WHERE = "where"
    PRE = "pre"
    POST = "post"
class Tokenizer:
"""
Class that can iterate on the tokens
"""
KEYWORDS = set(x.value for x in Keywords)
def __init__(self, text):
    """Initialize the tokenizer over *text*, starting at line 1, column 1."""
    self.text = text           # the full input being tokenized
    self.text_len = len(text)  # cached length, checked on every advance
    # Current scan position: absolute index plus 1-based line/column.
    self.i = 0
    self.line = 1
    self.column = 1
def __iter__(self):
    """
    Generate the token stream for self.text.

    Yields one Token per lexeme and finishes with a single EOF token.
    Raises LexerError on any character with no lexing rule.
    NOTE: the scan state (i/line/column) lives on the instance, so a
    Tokenizer can only be iterated once from the start.
    """
    # Characters that map 1:1 to a token kind, no lookahead needed.
    single_char = {
        "/": TokenKind.SLASH,
        "*": TokenKind.STAR,
        "{": TokenKind.LBRACE,
        "}": TokenKind.RBRACE,
        "(": TokenKind.LPAR,
        ")": TokenKind.RPAR,
        "[": TokenKind.LBRACKET,
        "]": TokenKind.RBRACKET,
        "=": TokenKind.EQUALS,
        ",": TokenKind.COMMA,
        ".": TokenKind.DOT,
        ";": TokenKind.SEMICOLON,
        ":": TokenKind.COLON,
        "?": TokenKind.QMARK,
        "|": TokenKind.VBAR,
        "&": TokenKind.AMPER,
    }
    while self.i < self.text_len:
        c = self.text[self.i]
        # A '+' or '-' immediately followed by a digit starts a signed number.
        signed_number = (c in ("+", "-")
                         and self.i + 1 < self.text_len
                         and self.text[self.i + 1].isdigit())
        if signed_number or c.isdigit():
            number = self.eat_number(self.i)
            yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column)
            self.i += len(number)
            self.column += len(number)
        elif c == "+":
            yield Token(TokenKind.PLUS, "+", self.i, self.line, self.column)
            self.i += 1
            self.column += 1
        elif c == "-":
            yield Token(TokenKind.MINUS, "-", self.i, self.line, self.column)
            self.i += 1
            self.column += 1
        elif c in single_char:
            yield Token(single_char[c], c, self.i, self.line, self.column)
            self.i += 1
            self.column += 1
        elif c in (" ", "\t"):
            whitespace = self.eat_whitespace(self.i)
            yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column)
            self.i += len(whitespace)
            self.column += len(whitespace)
        elif c in ("\n", "\r"):
            # A "\r\n"/"\n\r" pair is one NEWLINE token and one line.
            newline = self.eat_newline(self.i)
            yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column)
            self.i += len(newline)
            self.column = 1
            self.line += 1
        elif c.isalpha() or c == "_":
            identifier = self.eat_identifier(self.i)
            if identifier in self.KEYWORDS:
                # Keywords carry the Keywords enum member as value.
                yield Token(TokenKind.KEYWORD, Keywords(identifier), self.i, self.line, self.column)
            else:
                yield Token(TokenKind.IDENTIFIER, identifier, self.i, self.line, self.column)
            self.i += len(identifier)
            self.column += len(identifier)
        elif c in ("'", '"'):
            # Quotes are kept in the token value.
            string, newlines = self.eat_string(self.i, self.line, self.column)
            yield Token(TokenKind.STRING, string, self.i, self.line, self.column)
            self.i += len(string)
            # NOTE(review): after a multi-line string the column resets to 1
            # even though the closing quote may not be at column 0 — confirm.
            self.column = 1 if newlines > 0 else self.column + len(string)
            self.line += newlines
        else:
            raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column)
    yield Token(TokenKind.EOF, "", self.i, self.line, self.column)
def eat_whitespace(self, start):
    """Return the run of spaces/tabs beginning at *start* (the first char is always taken)."""
    end = start + 1
    while end < self.text_len and self.text[end] in (" ", "\t"):
        end += 1
    return self.text[start:end]
def eat_newline(self, start):
    """
    Return the newline sequence at *start*.

    A mixed two-character pair ("\r\n" or "\n\r") is consumed as one
    sequence; otherwise the single newline character is returned.
    """
    first = self.text[start]
    if start + 1 < self.text_len:
        second = self.text[start + 1]
        if {first, second} == {"\n", "\r"}:
            return first + second
    return first
def eat_identifier(self, start):
    """Return the identifier at *start*: letters, digits, '_' and '-' (dash allowed mid-word)."""
    end = start + 1
    while end < self.text_len:
        c = self.text[end]
        if not (c.isalpha() or c.isdigit() or c in "_-"):
            break
        end += 1
    return self.text[start:end]
def eat_number(self, start):
    """
    Return the number literal starting at *start*.

    The first character (a digit or a +/- sign, validated by the caller)
    is always consumed; subsequent digits are appended, plus at most one
    decimal point.

    Fix: the previous version accepted any run of digits and dots, so
    "1.2.3" lexed as a single malformed number; scanning now stops at a
    second '.', leaving it for the DOT token rule.

    :param start: index of the first character of the literal
    :return: the matched substring, e.g. "42", "-3.14"
    """
    result = self.text[start]
    i = start + 1
    seen_dot = False
    while i < self.text_len:
        c = self.text[i]
        if c.isdigit():
            result += c
        elif c == "." and not seen_dot:
            seen_dot = True
            result += c
        else:
            break
        i += 1
    return result
def eat_string(self, start_index, start_line, start_column):
    """
    Consume a quoted string literal starting at *start_index*.

    The returned literal keeps its surrounding quotes. Backslash escapes
    are honored; line breaks are counted with "\r\n"/"\n\r" pairs
    collapsing to a single break.

    Fixes vs the previous version:
    - a closing quote immediately after a newline now terminates the
      string (it used to be swallowed, over-consuming following text)
    - a double backslash no longer leaves the escape flag armed
    - an unterminated string whose last character happens to be a quote
      (e.g. an escaped quote at end of input) is now reported

    :param start_index: index of the opening quote
    :param start_line: 1-based line of the opening quote (for error reporting)
    :param start_column: 1-based column of the opening quote (for error reporting)
    :return: (literal including quotes, number of line breaks inside it)
    :raises LexerError: if the input ends before the closing quote
    """
    quote = self.text[start_index]
    result = quote
    i = start_index + 1
    escape = False
    closed = False
    while i < self.text_len:
        c = self.text[i]
        result += c
        i += 1
        if escape:
            # This char is escaped: it can neither open an escape nor close the string.
            escape = False
        elif c == "\\":
            escape = True
        elif c == quote:
            closed = True
            break
    # Count line breaks, treating "\r\n" / "\n\r" pairs as one break.
    normalized = result.replace("\r\n", "\n").replace("\n\r", "\n")
    lines_count = normalized.count("\n") + normalized.count("\r")
    if not closed:
        raise LexerError("Missing Trailing quote", result, i, start_line + lines_count,
                         1 if lines_count > 0 else start_column + len(result))
    return result, lines_count
def seek(self, words):
if self.i == self.text_len:
return 0
# init
offsets = {}
start_index = self.i
buffer = ""
while self.i < self.text_len:
c = self.text[self.i]
# skip white space
if c in (" ", "\t"):
self.i += 1
continue
for word in words:
if c == word[offset]:
os