Added a simple form of concept composition

This commit is contained in:
2020-01-15 18:38:29 +01:00
parent 51fa9629d0
commit 8152f82c6b
22 changed files with 1105 additions and 544 deletions
+52 -38
View File
@@ -6,7 +6,8 @@
# Arpeggio: A flexible PEG parser for Python,
# Knowledge-Based Systems, 2016, 95, 71 - 74, doi:10.1016/j.knosys.2015.12.004
#####################################################################################################
from dataclasses import field, dataclass
from collections import namedtuple
from dataclasses import dataclass
from collections import defaultdict
from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept, ConceptParts, DoNotResolve
@@ -15,23 +16,6 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode
import core.utils
def flatten(iterable):
if iterable is None:
return []
result = []
for e in iterable:
if e.parsing_expression.rule_name is not None and e.parsing_expression.rule_name != "":
if hasattr(e, "children"):
e.children = flatten(e.children)
result.append(e)
elif hasattr(e, "children"):
result.extend(flatten(e.children))
else:
result.append(e)
return result
@dataclass()
class LexerNode(Node):
start: int # starting index in the tokens list
@@ -68,10 +52,10 @@ class UnrecognizedTokensNode(LexerNode):
return not (len(self.tokens) == 1 and self.tokens[0].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE))
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) != 3:
return False
return self.start == other[0] and self.end == other[1] and self.source == other[2]
if isinstance(other, utnode):
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
if not isinstance(other, UnrecognizedTokensNode):
return False
@@ -80,6 +64,9 @@ class UnrecognizedTokensNode(LexerNode):
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.start, self.end, self.source))
def __repr__(self):
return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
@@ -99,17 +86,14 @@ class ConceptNode(LexerNode):
self.source = BaseParser.get_text_from_tokens(self.tokens)
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) == 2:
return self.concept.key == other[0] and self.source == other[1]
else:
return self.concept.key == other[0] and \
self.start == other[1] and \
self.end == other[2] and \
self.source == other[3]
if isinstance(other, cnode):
return self.concept.key == other.concept_key and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
# if not super().__eq__(other):
# return False
if isinstance(other, short_cnode):
return self.concept.key == other.concept_key and self.source == other.source
if not isinstance(other, ConceptNode):
return False
@@ -127,6 +111,42 @@ class ConceptNode(LexerNode):
return f"ConceptNode(concept='{self.concept}', start={self.start}, end={self.end}, source='{self.source}')"
class SourceCodeNode(LexerNode):
"""
Returned when some source code (like Python source code) is recognized
"""
def __init__(self, node, start, end, tokens=None, source=None):
super().__init__(start, end, tokens, source)
self.node = node # The PythonNode (or whatever language node) that is found
def __eq__(self, other):
if isinstance(other, scnode):
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
if not isinstance(other, SourceCodeNode):
return False
return self.node == other.node and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.start, self.end, self.source))
def __repr__(self):
return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
cnode = namedtuple("ConceptNode", "concept_key start end source")
short_cnode = namedtuple("ConceptNode", "concept_key source")
utnode = namedtuple("UnrecognizedTokensNode", "start end source")
scnode = namedtuple("SourceCodeNode", "start end source")
class NonTerminalNode(LexerNode):
"""
Returned by the ConceptLexerParser
@@ -146,9 +166,6 @@ class NonTerminalNode(LexerNode):
return name + sub_names
def __eq__(self, other):
# if not super().__eq__(other):
# return False
if not isinstance(other, NonTerminalNode):
return False
@@ -176,9 +193,6 @@ class TerminalNode(LexerNode):
return name + f"'{self.value}'"
def __eq__(self, other):
# if not super().__eq__(other):
# return False
if not isinstance(other, TerminalNode):
return False
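Note on the tuple/namedtuple branches in the __eq__ methods above: the cnode, short_cnode, utnode and scnode shims defined in this file let callers (typically tests) compare a full node against a lightweight expected value instead of constructing a second node. A minimal, self-contained sketch of that pattern, using hypothetical names not taken from the repo:

from collections import namedtuple

# Lightweight "expected value" shim: a node compares equal to a namedtuple
# carrying only the fields a test cares about.
ExpectedNode = namedtuple("ExpectedNode", "start end source")

class DemoNode:
    def __init__(self, start, end, source):
        self.start, self.end, self.source = start, end, source

    def __eq__(self, other):
        if isinstance(other, ExpectedNode):
            return (self.start, self.end, self.source) == other
        return NotImplemented

    def __hash__(self):
        return hash((self.start, self.end, self.source))

assert DemoNode(0, 2, "foo bar") == ExpectedNode(0, 2, "foo bar")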
+110
View File
@@ -0,0 +1,110 @@
from core.builtin_concepts import BuiltinConcepts
from core.tokenizer import TokenKind, Token
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import ConceptNode, UnrecognizedTokensNode, SourceCodeNode
from parsers.MultipleConceptsParser import MultipleConceptsParser
from core.concept import VARIABLE_PREFIX
import logging
multiple_concepts_parser = MultipleConceptsParser()
class ConceptsWithConceptsParser(BaseParser):
def __init__(self, **kwargs):
super().__init__("ConceptsWithConcepts", 25)
@staticmethod
def get_tokens(nodes):
tokens = []
for node in nodes:
if isinstance(node, ConceptNode):
index, line, column = node.tokens[0].index, node.tokens[0].line, node.tokens[0].column
tokens.append(Token(TokenKind.CONCEPT, node.concept, index, line, column))
else:
for token in node.tokens:
if token.type == TokenKind.EOF:
break
elif token.type in (TokenKind.NEWLINE, TokenKind.WHITESPACE):
continue
else:
tokens.append(token)
return tokens
@staticmethod
def get_key(nodes):
key = ""
index = 0
for node in nodes:
if key:
key += " "
if isinstance(node, UnrecognizedTokensNode):
key += node.source.strip()
else:
key += f"{VARIABLE_PREFIX}{index}"
index += 1
return key
def finalize_concept(self, context, concept, nodes):
index = 0
for node in nodes:
if isinstance(node, ConceptNode):
prop_name = list(concept.props.keys())[index]
concept.cached_asts[prop_name] = node.concept
context.log(
self.verbose_log,
f"Setting property '{prop_name}='{node.concept}'.",
self.name)
index += 1
elif isinstance(node, SourceCodeNode):
prop_name = list(concept.props.keys())[index]
sheerka = context.sheerka
value = sheerka.new(BuiltinConcepts.PARSER_RESULT, parser=self, source=node.source, body=node.node)
concept.cached_asts[prop_name] = [context.sheerka.ret(self.name, True, value)]
context.log(
self.verbose_log,
f"Setting property '{prop_name}'='Python({node.source})'.",
self.name)
index += 1
return concept
def parse(self, context, text):
sheerka = context.sheerka
if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
return None
if not text.parser == multiple_concepts_parser:
return None
nodes = text.body
concept_key = self.get_key(nodes)
concept = sheerka.new(concept_key)
if sheerka.isinstance(concept, BuiltinConcepts.UNKNOWN_CONCEPT):
return sheerka.ret(
self.name,
False,
sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=text.body))
concepts = concept if hasattr(concept, "__iter__") else [concept]
for concept in concepts:
self.finalize_concept(context, concept, nodes)
res = []
for concept in concepts:
res.append(sheerka.ret(
self.name,
True,
sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=text.source,
body=concept,
try_parsed=None)))
return res[0] if len(res) == 1 else res
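For readers of get_key above: the key keeps unrecognized text literally and replaces each recognized concept with a numbered variable placeholder, and that composite key is then looked up as a concept. A self-contained sketch of the behaviour with a stand-in VARIABLE_PREFIX (the real prefix is imported from core.concept and may differ):

VARIABLE_PREFIX = "$"  # assumption for illustration; the real value lives in core.concept

def build_key(parts):
    # parts is a list of ("raw", text) / ("concept", name) pairs standing in for
    # UnrecognizedTokensNode and ConceptNode instances.
    key, index = "", 0
    for kind, text in parts:
        if key:
            key += " "
        if kind == "raw":
            key += text.strip()
        else:
            key += f"{VARIABLE_PREFIX}{index}"
            index += 1
    return key

# "send <message> to <person>" becomes the lookup key "send $0 to $1"
print(build_key([("raw", "send "), ("concept", "message"),
                 ("raw", " to "), ("concept", "person")]))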
+104 -36
View File
@@ -1,8 +1,11 @@
import ast
from core.builtin_concepts import BuiltinConcepts
from core.tokenizer import TokenKind
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode, SourceCodeNode
import core.utils
from parsers.PythonParser import PythonParser
concept_lexer_parser = ConceptLexerParser()
@@ -18,6 +21,25 @@ class MultipleConceptsParser(BaseParser):
def __init__(self, **kwargs):
BaseParser.__init__(self, "MultipleConcepts", 45)
@staticmethod
def finalize(nodes_found, unrecognized_tokens):
if not unrecognized_tokens:
return nodes_found, unrecognized_tokens
unrecognized_tokens.fix_source()
if unrecognized_tokens.not_whitespace():
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
return nodes_found, None
@staticmethod
def create_or_add(unrecognized_tokens, token, index):
if unrecognized_tokens:
unrecognized_tokens.add_token(token, index)
else:
unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
return unrecognized_tokens
def parse(self, context, text):
sheerka = context.sheerka
if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
@@ -29,50 +51,42 @@ class MultipleConceptsParser(BaseParser):
sheerka = context.sheerka
nodes = text.value
nodes_found = [[]]
source = ""
concepts_only = True
for node in nodes:
if isinstance(node, UnrecognizedTokensNode):
unrecognized_tokens = None
for i, token in enumerate(node.tokens):
index = node.start + i
i = 0
if token.type == TokenKind.IDENTIFIER:
# it may be a concept
concept = context.new_concept(token.value)
if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
# finish processing unrecognized_tokens
if unrecognized_tokens:
unrecognized_tokens.fix_source()
source += unrecognized_tokens.source
if unrecognized_tokens.not_whitespace():
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
unrecognized_tokens = None
while i < len(node.tokens):
source += token.value
concepts = concept if hasattr(concept, "__iter__") else [concept]
concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
nodes_found = core.utils.product(nodes_found, concepts_nodes)
continue
else:
# it cannot be a concept
concepts_only &= token.type == TokenKind.WHITESPACE or token.type == TokenKind.NEWLINE
token_index = node.start + i
token = node.tokens[i]
if unrecognized_tokens:
unrecognized_tokens.add_token(token, index)
else:
unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
concepts_nodes = self.get_concepts_nodes(context, token_index, token)
if concepts_nodes is not None:
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
nodes_found = core.utils.product(nodes_found, concepts_nodes)
i += 1
continue
if unrecognized_tokens:
unrecognized_tokens.fix_source()
source += unrecognized_tokens.source
if unrecognized_tokens.not_whitespace():
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
source_code_node = self.get_source_code_node(context, token_index, node.tokens[i:])
if source_code_node:
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
nodes_found = core.utils.product(nodes_found, [source_code_node])
i += len(source_code_node.tokens)
continue
# neither a concept nor source code
unrecognized_tokens = self.create_or_add(unrecognized_tokens, token, token_index)
concepts_only &= token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
i += 1
# finish processing if needed
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
else:
nodes_found = core.utils.product(nodes_found, [node])
source += node.source
ret = []
for choice in nodes_found:
@@ -83,14 +97,68 @@ class MultipleConceptsParser(BaseParser):
sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=source,
source=text.source,
body=choice,
try_parsed=None))
)
if len(ret) == 1:
self.log_result(context, source, ret[0])
self.log_result(context, text.source, ret[0])
return ret[0]
else:
self.log_multiple_results(context, source, ret)
self.log_multiple_results(context, text.source, ret)
return ret
@staticmethod
def get_concepts_nodes(context, index, token):
"""
Tries to recognize a concept
from the universe of all known concepts
"""
if token.type != TokenKind.IDENTIFIER:
return None
concept = context.new_concept(token.value)
if hasattr(concept, "__iter__") or context.sheerka.is_known(concept):
concepts = concept if hasattr(concept, "__iter__") else [concept]
concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
return concepts_nodes
return None
@staticmethod
def get_source_code_node(context, index, tokens):
"""
Tries to recognize source code.
For the time being, only Python is supported
:param context:
:param tokens:
:param index:
:return:
"""
if len(tokens) == 0 or (len(tokens) == 1 and tokens[0].type == TokenKind.EOF):
return None
end_index = len(tokens)
while end_index > 0:
parser = PythonParser()
tokens_to_parse = tokens[:end_index]
res = parser.parse(context, tokens_to_parse)
if res.status:
# only expressions are accepted
ast_ = res.value.value.ast_
if not isinstance(ast_, ast.Expression):
return None
try:
compiled = compile(ast_, "<string>", "eval")
eval(compiled, {}, {})
except Exception:
return None
source = BaseParser.get_text_from_tokens(tokens_to_parse)
return SourceCodeNode(res.value.value, index, index + end_index - 1, tokens_to_parse, source)
end_index -= 1
return None
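get_source_code_node above scans for the longest prefix of the remaining tokens that parses as a Python expression, compiles it and checks that it evaluates cleanly, shrinking the window from the right until something works. A simplified, self-contained sketch of that longest-prefix strategy using plain strings and the standard library instead of the repo's token stream and PythonParser:

import ast

# Simplified sketch: find the longest leading run of words that parses and
# evaluates as a Python expression (only expressions are accepted), shrinking
# from the right on failure.
def longest_python_expression(words):
    end = len(words)
    while end > 0:
        candidate = " ".join(words[:end])
        try:
            tree = ast.parse(candidate, mode="eval")
            value = eval(compile(tree, "<string>", "eval"), {}, {})
            return candidate, value
        except Exception:
            end -= 1
    return None, None

print(longest_python_expression(["1", "+", "2", "then", "stop"]))  # ('1 + 2', 3)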
+80 -1
View File
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
import ast
import logging
from parsers.ConceptLexerParser import ConceptNode
log = logging.getLogger(__name__)
@@ -22,7 +24,7 @@ class PythonNode(Node):
def __init__(self, source, ast_, concepts=None):
self.source = source
self.ast_ = ast_
self.concepts = concepts or {}
self.concepts = concepts or {} # concepts recognized in the expression, if any
# def __repr__(self):
# return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
@@ -133,3 +135,80 @@ class PythonGetNamesVisitor(ast.NodeVisitor):
def visit_Name(self, node):
self.names.add(node.id)
class LexerNodeParserHelperForPython:
"""Helper class to parse mix of concepts and Python"""
def __init__(self):
self.identifiers = {} # cache of already created identifiers (keyed by id(concept))
self.identifiers_key = {} # count of identifiers sharing the same root (prefix)
def _get_identifier(self, concept):
"""
Get an identifier for a concept.
Make sure to return the same identifier for the same concept.
Make sure to return a different identifier for concepts that share a name but are different.
Internal function because I don't want identifiers, identifiers_key and python_ids_mappings
to be instance variables; I would like to keep this parser as stateless as possible.
:param concept:
:return:
"""
if id(concept) in self.identifiers:
return self.identifiers[id(concept)]
identifier = "__C__" + self._sanitize(concept.key or concept.name)
if concept.id:
identifier += "__" + concept.id
if identifier in self.identifiers_key:
self.identifiers_key[identifier] += 1
identifier += f"_{self.identifiers_key[identifier]}"
else:
self.identifiers_key[identifier] = 0
identifier += "__C__"
self.identifiers[id(concept)] = identifier
return identifier
@staticmethod
def _sanitize(identifier):
res = ""
for c in identifier:
res += c if c.isalnum() else "0"
return res
def parse(self, context, nodes):
source = ""
to_parse = ""
concepts = {} # the key is the Python identifier
for node in nodes:
if isinstance(node, ConceptNode):
source += node.source
if to_parse:
to_parse += " "
concept = node.concept
python_id = self._get_identifier(concept)
to_parse += python_id
concepts[python_id] = concept
else:
source += node.source
to_parse += node.source
with context.push(self, desc="Trying Python for '" + to_parse + "'") as sub_context:
sub_context.add_inputs(to_parse=to_parse)
python_parser = PythonParser()
result = python_parser.parse(sub_context, to_parse)
sub_context.add_values(return_values=result)
if result.status:
python_node = result.body.body
python_node.source = source
python_node.concepts = concepts
return python_node
return result.body # the error
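The _get_identifier method of LexerNodeParserHelperForPython turns each concept into a synthetic Python identifier wrapped in "__C__" so that the mix of concepts and Python becomes a string an ordinary Python parser accepts; a per-root counter keeps distinct concepts with the same key apart, and the cache guarantees the same concept always maps to the same name. A minimal, self-contained sketch of that scheme (class and helper names below are illustrative, not from the repo):

# Illustrative reimplementation of the identifier scheme; names are hypothetical.
class IdentifierFactory:
    def __init__(self):
        self.identifiers = {}       # id(concept) -> identifier
        self.identifiers_key = {}   # root -> number of collisions seen

    @staticmethod
    def _sanitize(name):
        # non-alphanumeric characters are replaced so the result stays a valid identifier
        return "".join(c if c.isalnum() else "0" for c in name)

    def get(self, concept_key, concept):
        if id(concept) in self.identifiers:
            return self.identifiers[id(concept)]
        identifier = "__C__" + self._sanitize(concept_key)
        if identifier in self.identifiers_key:
            self.identifiers_key[identifier] += 1
            identifier += f"_{self.identifiers_key[identifier]}"
        else:
            self.identifiers_key[identifier] = 0
        identifier += "__C__"
        self.identifiers[id(concept)] = identifier
        return identifier

factory = IdentifierFactory()
a, b = object(), object()
print(factory.get("my concept", a))  # __C__my0concept__C__
print(factory.get("my concept", b))  # __C__my0concept_1__C__
print(factory.get("my concept", a))  # cached: __C__my0concept__C__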
+4 -11
View File
@@ -37,6 +37,10 @@ class PythonWithConceptsParser(BaseParser):
def _get_identifier(c):
"""
Get an identifier for a concept.
Make sure to return the same identifier for the same concept.
Make sure to return a different identifier for concepts that share a name but are different.
Internal function because I don't want identifiers, identifiers_key and python_ids_mappings
to be instance variables; I would like to keep this parser as stateless as possible.
@@ -99,14 +103,3 @@ class PythonWithConceptsParser(BaseParser):
self.name,
False,
result.body)
def concept_identifier(self, concept):
if id(concept) in self.identifiers:
return self.identifiers[id(concept)]
identifier = "__C__" + (concept.key or concept.name)
if concept.id:
identifier += "__" + concept.id
identifier += "__C__"
return identifier