Added simple form of concept composition
This commit is contained in:
@@ -6,7 +6,8 @@
|
||||
# Arpeggio: A flexible PEG parser for Python,
|
||||
# Knowledge-Based Systems, 2016, 95, 71 - 74, doi:10.1016/j.knosys.2015.12.004
|
||||
#####################################################################################################
|
||||
from dataclasses import field, dataclass
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.concept import Concept, ConceptParts, DoNotResolve
|
||||
@@ -15,23 +16,6 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode
|
||||
import core.utils
|
||||
|
||||
|
||||
def flatten(iterable):
    """Flatten a tree of parse nodes, discarding anonymous intermediate nodes.

    A node whose parsing expression carries a non-empty rule name is kept
    (with its own subtree flattened in place); an anonymous node is replaced
    by its flattened children.  ``None`` flattens to an empty list.
    """
    if iterable is None:
        return []

    flattened = []
    for node in iterable:
        rule_name = node.parsing_expression.rule_name
        named = rule_name is not None and rule_name != ""
        has_children = hasattr(node, "children")

        if not named and has_children:
            # Anonymous container: splice its flattened children in.
            flattened.extend(flatten(node.children))
            continue

        if named and has_children:
            # Named container: flatten the subtree but keep the node itself.
            node.children = flatten(node.children)
        flattened.append(node)

    return flattened
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LexerNode(Node):
|
||||
start: int # starting index in the tokens list
|
||||
@@ -68,10 +52,10 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
return not (len(self.tokens) == 1 and self.tokens[0].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE))
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) != 3:
|
||||
return False
|
||||
return self.start == other[0] and self.end == other[1] and self.source == other[2]
|
||||
if isinstance(other, utnode):
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if not isinstance(other, UnrecognizedTokensNode):
|
||||
return False
|
||||
@@ -80,6 +64,9 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
    # Hash on the same (start, end, source) triple that __eq__ compares,
    # so equal nodes hash identically.
    return hash((self.start, self.end, self.source))
|
||||
|
||||
def __repr__(self):
    # Debug representation showing the token span and the raw source text.
    return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
@@ -99,17 +86,14 @@ class ConceptNode(LexerNode):
|
||||
self.source = BaseParser.get_text_from_tokens(self.tokens)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) == 2:
|
||||
return self.concept.key == other[0] and self.source == other[1]
|
||||
else:
|
||||
return self.concept.key == other[0] and \
|
||||
self.start == other[1] and \
|
||||
self.end == other[2] and \
|
||||
self.source == other[3]
|
||||
if isinstance(other, cnode):
|
||||
return self.concept.key == other.concept_key and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
if isinstance(other, short_cnode):
|
||||
return self.concept.key == other.concept_key and self.source == other.source
|
||||
|
||||
if not isinstance(other, ConceptNode):
|
||||
return False
|
||||
@@ -127,6 +111,42 @@ class ConceptNode(LexerNode):
|
||||
return f"ConceptNode(concept='{self.concept}', start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
|
||||
class SourceCodeNode(LexerNode):
    """
    Returned when some source code (like Python source code) is recognized.
    """

    def __init__(self, node, start, end, tokens=None, source=None):
        super().__init__(start, end, tokens, source)
        # The PythonNode (or whatever language node) that was recognized.
        self.node = node

    def __eq__(self, other):
        # Allow comparison against the lightweight `scnode` namedtuple.
        if isinstance(other, scnode):
            return (self.start, self.end, self.source) == \
                   (other.start, other.end, other.source)

        if isinstance(other, SourceCodeNode):
            return self.node == other.node and \
                   (self.start, self.end, self.source) == \
                   (other.start, other.end, other.source)

        return False

    def __hash__(self):
        # Mirrors the positional fields compared in __eq__.
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
|
||||
# Lightweight tuple shapes accepted by the node classes' __eq__ overloads,
# so plain tuples/namedtuples can be compared against full node objects.
cnode = namedtuple("ConceptNode", ["concept_key", "start", "end", "source"])
short_cnode = namedtuple("ConceptNode", ["concept_key", "source"])
utnode = namedtuple("UnrecognizedTokensNode", ["start", "end", "source"])
scnode = namedtuple("SourceCodeNode", ["start", "end", "source"])
|
||||
|
||||
|
||||
class NonTerminalNode(LexerNode):
|
||||
"""
|
||||
Returned by the ConceptLexerParser
|
||||
@@ -146,9 +166,6 @@ class NonTerminalNode(LexerNode):
|
||||
return name + sub_names
|
||||
|
||||
def __eq__(self, other):
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, NonTerminalNode):
|
||||
return False
|
||||
|
||||
@@ -176,9 +193,6 @@ class TerminalNode(LexerNode):
|
||||
return name + f"'{self.value}'"
|
||||
|
||||
def __eq__(self, other):
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, TerminalNode):
|
||||
return False
|
||||
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import TokenKind, Token
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import ConceptNode, UnrecognizedTokensNode, SourceCodeNode
|
||||
from parsers.MultipleConceptsParser import MultipleConceptsParser
|
||||
from core.concept import VARIABLE_PREFIX
|
||||
import logging
|
||||
|
||||
# Module-level shared instance: ConceptsWithConceptsParser.parse() only
# accepts PARSER_RESULTs whose .parser is exactly this object.
multiple_concepts_parser = MultipleConceptsParser()
|
||||
|
||||
|
||||
class ConceptsWithConceptsParser(BaseParser):
    """Recognize a "concept composition".

    Consumes a MultipleConceptsParser result: the unrecognized text parts
    form the key of a composite concept, and the recognized concept /
    source-code parts become the composite concept's properties.
    """

    def __init__(self, **kwargs):
        super().__init__("ConceptsWithConcepts", 25)

    @staticmethod
    def get_tokens(nodes):
        """Flatten lexer nodes back into a token list.

        Each ConceptNode contributes a single CONCEPT token positioned at
        its first original token; every other node contributes its raw
        tokens, skipping whitespace/newlines and stopping at EOF.

        :param nodes: sequence of lexer nodes
        :return: flat list of Token objects
        """
        tokens = []

        for node in nodes:
            if isinstance(node, ConceptNode):
                first = node.tokens[0]
                tokens.append(Token(TokenKind.CONCEPT, node.concept,
                                    first.index, first.line, first.column))
            else:
                for token in node.tokens:
                    if token.type == TokenKind.EOF:
                        break
                    if token.type in (TokenKind.NEWLINE, TokenKind.WHITESPACE):
                        continue
                    tokens.append(token)

        return tokens

    @staticmethod
    def get_key(nodes):
        """Build the composite concept key.

        Unrecognized text is kept verbatim (stripped); every recognized part
        is replaced by a numbered variable placeholder
        (``{VARIABLE_PREFIX}0``, ``{VARIABLE_PREFIX}1``, ...).
        """
        key = ""
        index = 0
        for node in nodes:
            if key:
                key += " "

            if isinstance(node, UnrecognizedTokensNode):
                key += node.source.strip()
            else:
                key += f"{VARIABLE_PREFIX}{index}"
                index += 1

        return key

    def finalize_concept(self, context, concept, nodes):
        """Fill the composite concept's properties, in declaration order,
        from the recognized parts of *nodes*.

        :param context: parsing context (provides logging and the sheerka)
        :param concept: the composite concept to mutate
        :param nodes: the MultipleConcepts node list
        :return: the same *concept*, mutated in place
        """
        # Hoisted out of the loop: the property order is stable per concept.
        prop_names = list(concept.props.keys())
        index = 0
        for node in nodes:
            if isinstance(node, ConceptNode):
                prop_name = prop_names[index]
                concept.cached_asts[prop_name] = node.concept
                context.log(
                    self.verbose_log,
                    # Fixed: the message previously had unbalanced quotes
                    # ("'{prop_name}=" was missing its closing quote).
                    f"Setting property '{prop_name}'='{node.concept}'.",
                    self.name)
                index += 1
            elif isinstance(node, SourceCodeNode):
                prop_name = prop_names[index]
                sheerka = context.sheerka
                value = sheerka.new(BuiltinConcepts.PARSER_RESULT,
                                    parser=self, source=node.source, body=node.node)
                concept.cached_asts[prop_name] = [sheerka.ret(self.name, True, value)]
                context.log(
                    self.verbose_log,
                    f"Setting property '{prop_name}'='Python({node.source})'.",
                    self.name)
                index += 1

        return concept

    def parse(self, context, text):
        """Parse a MultipleConcepts result into composite concept(s).

        :param context: parsing context
        :param text: candidate input; must be a PARSER_RESULT produced by
            the shared ``multiple_concepts_parser`` instance
        :return: ``None`` when not applicable, a NOT_FOR_ME result when the
            composite key is unknown, otherwise one PARSER_RESULT (or a
            list of them when the key resolves to several concepts)
        """
        sheerka = context.sheerka
        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
            return None

        # Idiomatic `!=` instead of `not ... ==`.
        if text.parser != multiple_concepts_parser:
            return None

        nodes = text.body

        concept_key = self.get_key(nodes)
        concept = sheerka.new(concept_key)
        if sheerka.isinstance(concept, BuiltinConcepts.UNKNOWN_CONCEPT):
            return sheerka.ret(
                self.name,
                False,
                sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=text.body))

        concepts = concept if hasattr(concept, "__iter__") else [concept]
        # Renamed the loop variable so it no longer shadows `concept`.
        for candidate in concepts:
            self.finalize_concept(context, candidate, nodes)

        results = [
            sheerka.ret(
                self.name,
                True,
                sheerka.new(
                    BuiltinConcepts.PARSER_RESULT,
                    parser=self,
                    source=text.source,
                    body=candidate,
                    try_parsed=None))
            for candidate in concepts
        ]

        return results[0] if len(results) == 1 else results
|
||||
@@ -1,8 +1,11 @@
|
||||
import ast
|
||||
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import TokenKind
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode, SourceCodeNode
|
||||
import core.utils
|
||||
from parsers.PythonParser import PythonParser
|
||||
|
||||
concept_lexer_parser = ConceptLexerParser()
|
||||
|
||||
@@ -18,6 +21,25 @@ class MultipleConceptsParser(BaseParser):
|
||||
def __init__(self, **kwargs):
|
||||
BaseParser.__init__(self, "MultipleConcepts", 45)
|
||||
|
||||
@staticmethod
def finalize(nodes_found, unrecognized_tokens):
    """Flush a pending UnrecognizedTokensNode into the result choices.

    When a pending node exists, its source is fixed up and — unless it is
    pure whitespace — it is multiplied into every choice.  Returns the
    (possibly extended) choices plus ``None`` to signal the pending node
    was consumed; with no pending node, both arguments come back untouched.
    """
    if unrecognized_tokens:
        unrecognized_tokens.fix_source()
        if unrecognized_tokens.not_whitespace():
            nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
        return nodes_found, None

    return nodes_found, unrecognized_tokens
|
||||
|
||||
@staticmethod
def create_or_add(unrecognized_tokens, token, index):
    """Append *token* to the running UnrecognizedTokensNode, creating the
    node on first use.

    :return: the (possibly freshly created) UnrecognizedTokensNode
    """
    if not unrecognized_tokens:
        return UnrecognizedTokensNode(index, index, [token])

    unrecognized_tokens.add_token(token, index)
    return unrecognized_tokens
|
||||
|
||||
def parse(self, context, text):
|
||||
sheerka = context.sheerka
|
||||
if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
|
||||
@@ -29,50 +51,42 @@ class MultipleConceptsParser(BaseParser):
|
||||
sheerka = context.sheerka
|
||||
nodes = text.value
|
||||
nodes_found = [[]]
|
||||
source = ""
|
||||
concepts_only = True
|
||||
|
||||
for node in nodes:
|
||||
if isinstance(node, UnrecognizedTokensNode):
|
||||
unrecognized_tokens = None
|
||||
for i, token in enumerate(node.tokens):
|
||||
index = node.start + i
|
||||
i = 0
|
||||
|
||||
if token.type == TokenKind.IDENTIFIER:
|
||||
# it may be a concept
|
||||
concept = context.new_concept(token.value)
|
||||
if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
|
||||
# finish processing unrecognized_tokens
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
source += unrecognized_tokens.source
|
||||
if unrecognized_tokens.not_whitespace():
|
||||
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
|
||||
unrecognized_tokens = None
|
||||
while i < len(node.tokens):
|
||||
|
||||
source += token.value
|
||||
concepts = concept if hasattr(concept, "__iter__") else [concept]
|
||||
concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
|
||||
nodes_found = core.utils.product(nodes_found, concepts_nodes)
|
||||
continue
|
||||
else:
|
||||
# it cannot be a concept
|
||||
concepts_only &= token.type == TokenKind.WHITESPACE or token.type == TokenKind.NEWLINE
|
||||
token_index = node.start + i
|
||||
token = node.tokens[i]
|
||||
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.add_token(token, index)
|
||||
else:
|
||||
unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
|
||||
concepts_nodes = self.get_concepts_nodes(context, token_index, token)
|
||||
if concepts_nodes is not None:
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
nodes_found = core.utils.product(nodes_found, concepts_nodes)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
source += unrecognized_tokens.source
|
||||
if unrecognized_tokens.not_whitespace():
|
||||
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
|
||||
source_code_node = self.get_source_code_node(context, token_index, node.tokens[i:])
|
||||
if source_code_node:
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
nodes_found = core.utils.product(nodes_found, [source_code_node])
|
||||
i += len(source_code_node.tokens)
|
||||
continue
|
||||
|
||||
# not a concept nor some source code
|
||||
unrecognized_tokens = self.create_or_add(unrecognized_tokens, token, token_index)
|
||||
concepts_only &= token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
|
||||
i += 1
|
||||
|
||||
# finish processing if needed
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
|
||||
else:
|
||||
nodes_found = core.utils.product(nodes_found, [node])
|
||||
source += node.source
|
||||
|
||||
ret = []
|
||||
for choice in nodes_found:
|
||||
@@ -83,14 +97,68 @@ class MultipleConceptsParser(BaseParser):
|
||||
sheerka.new(
|
||||
BuiltinConcepts.PARSER_RESULT,
|
||||
parser=self,
|
||||
source=source,
|
||||
source=text.source,
|
||||
body=choice,
|
||||
try_parsed=None))
|
||||
)
|
||||
|
||||
if len(ret) == 1:
|
||||
self.log_result(context, source, ret[0])
|
||||
self.log_result(context, text.source, ret[0])
|
||||
return ret[0]
|
||||
else:
|
||||
self.log_multiple_results(context, source, ret)
|
||||
self.log_multiple_results(context, text.source, ret)
|
||||
return ret
|
||||
|
||||
@staticmethod
def get_concepts_nodes(context, index, token):
    """
    Try to recognize *token* as a concept from the universe of all known
    concepts.

    :param context: parsing context (provides concept lookup)
    :param index: token position, used as both start and end of the node
    :param token: candidate token; only IDENTIFIER tokens qualify
    :return: a list of ConceptNode choices, or None when not a known concept
    """
    if token.type != TokenKind.IDENTIFIER:
        return None

    concept = context.new_concept(token.value)
    is_multiple = hasattr(concept, "__iter__")
    if not is_multiple and not context.sheerka.is_known(concept):
        return None

    candidates = concept if is_multiple else [concept]
    return [ConceptNode(c, index, index, [token], token.value) for c in candidates]
|
||||
|
||||
@staticmethod
def get_source_code_node(context, index, tokens):
    """
    Try to recognize a run of tokens as source code.
    For the time being, only Python is supported.

    Tries shrinking prefixes of ``tokens`` until one parses as a Python
    *expression* and evaluates without raising.

    :param context: parsing context handed to the PythonParser
    :param index: absolute index of ``tokens[0]`` in the full token stream
    :param tokens: candidate token run (may consist solely of an EOF token)
    :return: a SourceCodeNode covering the recognized prefix, or None
    """

    if len(tokens) == 0 or (len(tokens) == 1 and tokens[0].type == TokenKind.EOF):
        return None

    end_index = len(tokens)
    while end_index > 0:
        # A fresh parser per attempt — presumably PythonParser keeps state
        # between parses; TODO confirm whether this could be hoisted.
        parser = PythonParser()
        tokens_to_parse = tokens[:end_index]
        res = parser.parse(context, tokens_to_parse)
        if res.status:
            # only expression are accepted — a successful parse of a
            # non-expression aborts entirely (shorter prefixes not tried).
            ast_ = res.value.value.ast_
            if not isinstance(ast_, ast.Expression):
                return None
            try:
                # NOTE(review): eval of recognized source with empty
                # globals/locals — confirm this can never see untrusted
                # input, or sandbox it.
                compiled = compile(ast_, "<string>", "eval")
                eval(compiled, {}, {})
            except Exception:
                return None

            source = BaseParser.get_text_from_tokens(tokens_to_parse)
            # end is inclusive: index + prefix length - 1.
            return SourceCodeNode(res.value.value, index, index + end_index - 1, tokens_to_parse, source)
        end_index -= 1

    return None
|
||||
|
||||
+80
-1
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
|
||||
import ast
|
||||
import logging
|
||||
|
||||
from parsers.ConceptLexerParser import ConceptNode
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -22,7 +24,7 @@ class PythonNode(Node):
|
||||
def __init__(self, source, ast_, concepts=None):
|
||||
self.source = source
|
||||
self.ast_ = ast_
|
||||
self.concepts = concepts or {}
|
||||
self.concepts = concepts or {} # when concepts are recognized in the expression
|
||||
|
||||
# def __repr__(self):
|
||||
# return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
|
||||
@@ -133,3 +135,80 @@ class PythonGetNamesVisitor(ast.NodeVisitor):
|
||||
|
||||
def visit_Name(self, node):
    # Record every identifier encountered while walking the expression tree.
    self.names.add(node.id)
|
||||
|
||||
class LexerNodeParserHelperForPython:
    """Helper class to parse a mix of concept nodes and raw Python source.

    Concept nodes are substituted by synthetic ``__C__...__C__`` Python
    identifiers so the combined text can be fed to the PythonParser; the
    mapping from identifier back to concept is attached to the resulting
    PythonNode.
    """

    def __init__(self):
        # Cache of already created identifiers, keyed by id(concept) so the
        # same concept object always maps to the same identifier.
        self.identifiers = {}
        # Collision counter per identifier root (prefix), used to suffix
        # distinct concepts that happen to share a name.
        self.identifiers_key = {}

    def _get_identifier(self, concept):
        """
        Get a Python identifier for a concept.

        Guarantees: the same concept object always yields the same
        identifier; distinct concepts with the same name yield different
        identifiers (via a ``_N`` suffix).

        Internal function because I don't want identifiers, identifiers_key
        and python_ids_mappings to be instance variables of the parser —
        I would like to keep this parser as stateless as possible.

        :param concept: concept to encode (uses ``.key``, ``.name``, ``.id``)
        :return: a valid Python identifier wrapped in ``__C__`` markers
        """
        if id(concept) in self.identifiers:
            return self.identifiers[id(concept)]

        identifier = "__C__" + self._sanitize(concept.key or concept.name)
        if concept.id:
            identifier += "__" + concept.id

        if identifier in self.identifiers_key:
            # Same root already used by a different concept: disambiguate.
            self.identifiers_key[identifier] += 1
            identifier += f"_{self.identifiers_key[identifier]}"
        else:
            self.identifiers_key[identifier] = 0

        identifier += "__C__"

        self.identifiers[id(concept)] = identifier
        return identifier

    @staticmethod
    def _sanitize(identifier):
        """Replace every non-alphanumeric character with ``'0'`` so the
        result is usable inside a Python identifier."""
        # str.join instead of the original quadratic `res += c` loop.
        return "".join(c if c.isalnum() else "0" for c in identifier)

    def parse(self, context, nodes):
        """Parse a node sequence as Python, substituting concept nodes.

        :param context: parsing context (used for the logging sub-context)
        :param nodes: sequence of lexer nodes; ConceptNodes become synthetic
            identifiers, all other nodes contribute their raw source
        :return: a PythonNode (with ``.source`` restored to the original
            text and ``.concepts`` mapping identifiers back to concepts) on
            success, otherwise the error body of the failed parse
        """
        source = ""
        to_parse = ""
        concepts = {}  # the key is the synthetic Python identifier

        for node in nodes:
            source += node.source  # hoisted: both branches accumulated this
            if isinstance(node, ConceptNode):
                if to_parse:
                    to_parse += " "
                python_id = self._get_identifier(node.concept)
                to_parse += python_id
                concepts[python_id] = node.concept
            else:
                to_parse += node.source

        with context.push(self, desc="Trying Python for '" + to_parse + "'") as sub_context:
            sub_context.add_inputs(to_parse=to_parse)
            python_parser = PythonParser()
            result = python_parser.parse(sub_context, to_parse)
            sub_context.add_values(return_values=result)

        if result.status:
            python_node = result.body.body
            # Restore the human-readable source and the identifier mapping.
            python_node.source = source
            python_node.concepts = concepts
            return python_node

        return result.body  # the error
|
||||
|
||||
@@ -37,6 +37,10 @@ class PythonWithConceptsParser(BaseParser):
|
||||
|
||||
def _get_identifier(c):
|
||||
"""
|
||||
Get an identifier for a concept.
|
||||
Make sure to return the same identifier if the same concept
|
||||
Make sure to return a different identifier if same name but different concept
|
||||
|
||||
Internal function because I don't want identifiers, identifiers_key and python_ids_mappings
|
||||
to be instance variables
|
||||
I would like to keep this parser as stateless as possible
|
||||
@@ -99,14 +103,3 @@ class PythonWithConceptsParser(BaseParser):
|
||||
self.name,
|
||||
False,
|
||||
result.body)
|
||||
|
||||
def concept_identifier(self, concept):
    """Return a ``__C__...__C__`` identifier for *concept*.

    NOTE(review): the computed identifier is never stored into
    ``self.identifiers``, so the cache check can only hit if callers
    populate the cache elsewhere — confirm.  Also, unlike
    ``LexerNodeParserHelperForPython._get_identifier``, this variant
    neither sanitizes the key nor disambiguates distinct concepts sharing
    a name, so collisions are possible.
    """
    if id(concept) in self.identifiers:
        return self.identifiers[id(concept)]

    identifier = "__C__" + (concept.key or concept.name)
    if concept.id:
        identifier += "__" + concept.id
    identifier += "__C__"

    return identifier
|
||||
|
||||
Reference in New Issue
Block a user