Added simple form of concept composition
This commit is contained in:
@@ -6,7 +6,8 @@
|
||||
# Arpeggio: A flexible PEG parser for Python,
|
||||
# Knowledge-Based Systems, 2016, 95, 71 - 74, doi:10.1016/j.knosys.2015.12.004
|
||||
#####################################################################################################
|
||||
from dataclasses import field, dataclass
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.concept import Concept, ConceptParts, DoNotResolve
|
||||
@@ -15,23 +16,6 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode
|
||||
import core.utils
|
||||
|
||||
|
||||
def flatten(iterable):
    """Flatten a tree of parse nodes, discarding anonymous intermediate nodes.

    A node whose parsing expression carries a non-empty rule name is kept
    (with its own subtree flattened in place); an anonymous node is replaced
    by its flattened children.  ``None`` flattens to an empty list.
    """
    if iterable is None:
        return []

    flattened = []
    for node in iterable:
        rule_name = node.parsing_expression.rule_name
        named = rule_name is not None and rule_name != ""
        has_children = hasattr(node, "children")

        if not named and has_children:
            # Anonymous container: splice its flattened children in.
            flattened.extend(flatten(node.children))
            continue

        if named and has_children:
            # Named container: flatten the subtree but keep the node itself.
            node.children = flatten(node.children)
        flattened.append(node)

    return flattened
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LexerNode(Node):
|
||||
start: int # starting index in the tokens list
|
||||
@@ -68,10 +52,10 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
return not (len(self.tokens) == 1 and self.tokens[0].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE))
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) != 3:
|
||||
return False
|
||||
return self.start == other[0] and self.end == other[1] and self.source == other[2]
|
||||
if isinstance(other, utnode):
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if not isinstance(other, UnrecognizedTokensNode):
|
||||
return False
|
||||
@@ -80,6 +64,9 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
    # Hash on the same (start, end, source) triple that __eq__ compares,
    # so equal nodes hash identically.
    return hash((self.start, self.end, self.source))
|
||||
|
||||
def __repr__(self):
    # Debug representation showing the token span and the raw source text.
    return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
@@ -99,17 +86,14 @@ class ConceptNode(LexerNode):
|
||||
self.source = BaseParser.get_text_from_tokens(self.tokens)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) == 2:
|
||||
return self.concept.key == other[0] and self.source == other[1]
|
||||
else:
|
||||
return self.concept.key == other[0] and \
|
||||
self.start == other[1] and \
|
||||
self.end == other[2] and \
|
||||
self.source == other[3]
|
||||
if isinstance(other, cnode):
|
||||
return self.concept.key == other.concept_key and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
if isinstance(other, short_cnode):
|
||||
return self.concept.key == other.concept_key and self.source == other.source
|
||||
|
||||
if not isinstance(other, ConceptNode):
|
||||
return False
|
||||
@@ -127,6 +111,42 @@ class ConceptNode(LexerNode):
|
||||
return f"ConceptNode(concept='{self.concept}', start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
|
||||
class SourceCodeNode(LexerNode):
    """
    Returned when some source code (like Python source code) is recognized.
    """

    def __init__(self, node, start, end, tokens=None, source=None):
        super().__init__(start, end, tokens, source)
        # The PythonNode (or whatever language node) that was recognized.
        self.node = node

    def __eq__(self, other):
        # Allow comparison against the lightweight `scnode` namedtuple.
        if isinstance(other, scnode):
            return (self.start, self.end, self.source) == \
                   (other.start, other.end, other.source)

        if isinstance(other, SourceCodeNode):
            return self.node == other.node and \
                   (self.start, self.end, self.source) == \
                   (other.start, other.end, other.source)

        return False

    def __hash__(self):
        # Mirrors the positional fields compared in __eq__.
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
|
||||
# Lightweight tuple shapes accepted by the node classes' __eq__ overloads,
# so plain tuples/namedtuples can be compared against full node objects.
cnode = namedtuple("ConceptNode", ["concept_key", "start", "end", "source"])
short_cnode = namedtuple("ConceptNode", ["concept_key", "source"])
utnode = namedtuple("UnrecognizedTokensNode", ["start", "end", "source"])
scnode = namedtuple("SourceCodeNode", ["start", "end", "source"])
|
||||
|
||||
|
||||
class NonTerminalNode(LexerNode):
|
||||
"""
|
||||
Returned by the ConceptLexerParser
|
||||
@@ -146,9 +166,6 @@ class NonTerminalNode(LexerNode):
|
||||
return name + sub_names
|
||||
|
||||
def __eq__(self, other):
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, NonTerminalNode):
|
||||
return False
|
||||
|
||||
@@ -176,9 +193,6 @@ class TerminalNode(LexerNode):
|
||||
return name + f"'{self.value}'"
|
||||
|
||||
def __eq__(self, other):
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, TerminalNode):
|
||||
return False
|
||||
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import TokenKind, Token
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import ConceptNode, UnrecognizedTokensNode, SourceCodeNode
|
||||
from parsers.MultipleConceptsParser import MultipleConceptsParser
|
||||
from core.concept import VARIABLE_PREFIX
|
||||
import logging
|
||||
|
||||
# Module-level shared instance: ConceptsWithConceptsParser.parse() only
# accepts PARSER_RESULTs whose .parser is exactly this object.
multiple_concepts_parser = MultipleConceptsParser()
|
||||
|
||||
|
||||
class ConceptsWithConceptsParser(BaseParser):
    """Recognize a "concept composition".

    Consumes a MultipleConceptsParser result: the unrecognized text parts
    form the key of a composite concept, and the recognized concept /
    source-code parts become the composite concept's properties.
    """

    def __init__(self, **kwargs):
        super().__init__("ConceptsWithConcepts", 25)

    @staticmethod
    def get_tokens(nodes):
        """Flatten lexer nodes back into a token list.

        Each ConceptNode contributes a single CONCEPT token positioned at
        its first original token; every other node contributes its raw
        tokens, skipping whitespace/newlines and stopping at EOF.

        :param nodes: sequence of lexer nodes
        :return: flat list of Token objects
        """
        tokens = []

        for node in nodes:
            if isinstance(node, ConceptNode):
                first = node.tokens[0]
                tokens.append(Token(TokenKind.CONCEPT, node.concept,
                                    first.index, first.line, first.column))
            else:
                for token in node.tokens:
                    if token.type == TokenKind.EOF:
                        break
                    if token.type in (TokenKind.NEWLINE, TokenKind.WHITESPACE):
                        continue
                    tokens.append(token)

        return tokens

    @staticmethod
    def get_key(nodes):
        """Build the composite concept key.

        Unrecognized text is kept verbatim (stripped); every recognized part
        is replaced by a numbered variable placeholder
        (``{VARIABLE_PREFIX}0``, ``{VARIABLE_PREFIX}1``, ...).
        """
        key = ""
        index = 0
        for node in nodes:
            if key:
                key += " "

            if isinstance(node, UnrecognizedTokensNode):
                key += node.source.strip()
            else:
                key += f"{VARIABLE_PREFIX}{index}"
                index += 1

        return key

    def finalize_concept(self, context, concept, nodes):
        """Fill the composite concept's properties, in declaration order,
        from the recognized parts of *nodes*.

        :param context: parsing context (provides logging and the sheerka)
        :param concept: the composite concept to mutate
        :param nodes: the MultipleConcepts node list
        :return: the same *concept*, mutated in place
        """
        # Hoisted out of the loop: the property order is stable per concept.
        prop_names = list(concept.props.keys())
        index = 0
        for node in nodes:
            if isinstance(node, ConceptNode):
                prop_name = prop_names[index]
                concept.cached_asts[prop_name] = node.concept
                context.log(
                    self.verbose_log,
                    # Fixed: the message previously had unbalanced quotes
                    # ("'{prop_name}=" was missing its closing quote).
                    f"Setting property '{prop_name}'='{node.concept}'.",
                    self.name)
                index += 1
            elif isinstance(node, SourceCodeNode):
                prop_name = prop_names[index]
                sheerka = context.sheerka
                value = sheerka.new(BuiltinConcepts.PARSER_RESULT,
                                    parser=self, source=node.source, body=node.node)
                concept.cached_asts[prop_name] = [sheerka.ret(self.name, True, value)]
                context.log(
                    self.verbose_log,
                    f"Setting property '{prop_name}'='Python({node.source})'.",
                    self.name)
                index += 1

        return concept

    def parse(self, context, text):
        """Parse a MultipleConcepts result into composite concept(s).

        :param context: parsing context
        :param text: candidate input; must be a PARSER_RESULT produced by
            the shared ``multiple_concepts_parser`` instance
        :return: ``None`` when not applicable, a NOT_FOR_ME result when the
            composite key is unknown, otherwise one PARSER_RESULT (or a
            list of them when the key resolves to several concepts)
        """
        sheerka = context.sheerka
        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
            return None

        # Idiomatic `!=` instead of `not ... ==`.
        if text.parser != multiple_concepts_parser:
            return None

        nodes = text.body

        concept_key = self.get_key(nodes)
        concept = sheerka.new(concept_key)
        if sheerka.isinstance(concept, BuiltinConcepts.UNKNOWN_CONCEPT):
            return sheerka.ret(
                self.name,
                False,
                sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=text.body))

        concepts = concept if hasattr(concept, "__iter__") else [concept]
        # Renamed the loop variable so it no longer shadows `concept`.
        for candidate in concepts:
            self.finalize_concept(context, candidate, nodes)

        results = [
            sheerka.ret(
                self.name,
                True,
                sheerka.new(
                    BuiltinConcepts.PARSER_RESULT,
                    parser=self,
                    source=text.source,
                    body=candidate,
                    try_parsed=None))
            for candidate in concepts
        ]

        return results[0] if len(results) == 1 else results
|
||||
@@ -1,8 +1,11 @@
|
||||
import ast
|
||||
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import TokenKind
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode, SourceCodeNode
|
||||
import core.utils
|
||||
from parsers.PythonParser import PythonParser
|
||||
|
||||
concept_lexer_parser = ConceptLexerParser()
|
||||
|
||||
@@ -18,6 +21,25 @@ class MultipleConceptsParser(BaseParser):
|
||||
def __init__(self, **kwargs):
|
||||
BaseParser.__init__(self, "MultipleConcepts", 45)
|
||||
|
||||
@staticmethod
def finalize(nodes_found, unrecognized_tokens):
    """Flush a pending UnrecognizedTokensNode into the result choices.

    When a pending node exists, its source is fixed up and — unless it is
    pure whitespace — it is multiplied into every choice.  Returns the
    (possibly extended) choices plus ``None`` to signal the pending node
    was consumed; with no pending node, both arguments come back untouched.
    """
    if unrecognized_tokens:
        unrecognized_tokens.fix_source()
        if unrecognized_tokens.not_whitespace():
            nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
        return nodes_found, None

    return nodes_found, unrecognized_tokens
|
||||
|
||||
@staticmethod
def create_or_add(unrecognized_tokens, token, index):
    """Append *token* to the running UnrecognizedTokensNode, creating the
    node on first use.

    :return: the (possibly freshly created) UnrecognizedTokensNode
    """
    if not unrecognized_tokens:
        return UnrecognizedTokensNode(index, index, [token])

    unrecognized_tokens.add_token(token, index)
    return unrecognized_tokens
|
||||
|
||||
def parse(self, context, text):
|
||||
sheerka = context.sheerka
|
||||
if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
|
||||
@@ -29,50 +51,42 @@ class MultipleConceptsParser(BaseParser):
|
||||
sheerka = context.sheerka
|
||||
nodes = text.value
|
||||
nodes_found = [[]]
|
||||
source = ""
|
||||
concepts_only = True
|
||||
|
||||
for node in nodes:
|
||||
if isinstance(node, UnrecognizedTokensNode):
|
||||
unrecognized_tokens = None
|
||||
for i, token in enumerate(node.tokens):
|
||||
index = node.start + i
|
||||
i = 0
|
||||
|
||||
if token.type == TokenKind.IDENTIFIER:
|
||||
# it may be a concept
|
||||
concept = context.new_concept(token.value)
|
||||
if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
|
||||
# finish processing unrecognized_tokens
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
source += unrecognized_tokens.source
|
||||
if unrecognized_tokens.not_whitespace():
|
||||
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
|
||||
unrecognized_tokens = None
|
||||
while i < len(node.tokens):
|
||||
|
||||
source += token.value
|
||||
concepts = concept if hasattr(concept, "__iter__") else [concept]
|
||||
concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
|
||||
nodes_found = core.utils.product(nodes_found, concepts_nodes)
|
||||
continue
|
||||
else:
|
||||
# it cannot be a concept
|
||||
concepts_only &= token.type == TokenKind.WHITESPACE or token.type == TokenKind.NEWLINE
|
||||
token_index = node.start + i
|
||||
token = node.tokens[i]
|
||||
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.add_token(token, index)
|
||||
else:
|
||||
unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
|
||||
concepts_nodes = self.get_concepts_nodes(context, token_index, token)
|
||||
if concepts_nodes is not None:
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
nodes_found = core.utils.product(nodes_found, concepts_nodes)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
source += unrecognized_tokens.source
|
||||
if unrecognized_tokens.not_whitespace():
|
||||
nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
|
||||
source_code_node = self.get_source_code_node(context, token_index, node.tokens[i:])
|
||||
if source_code_node:
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
nodes_found = core.utils.product(nodes_found, [source_code_node])
|
||||
i += len(source_code_node.tokens)
|
||||
continue
|
||||
|
||||
# not a concept nor some source code
|
||||
unrecognized_tokens = self.create_or_add(unrecognized_tokens, token, token_index)
|
||||
concepts_only &= token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
|
||||
i += 1
|
||||
|
||||
# finish processing if needed
|
||||
nodes_found, unrecognized_tokens = self.finalize(nodes_found, unrecognized_tokens)
|
||||
|
||||
else:
|
||||
nodes_found = core.utils.product(nodes_found, [node])
|
||||
source += node.source
|
||||
|
||||
ret = []
|
||||
for choice in nodes_found:
|
||||
@@ -83,14 +97,68 @@ class MultipleConceptsParser(BaseParser):
|
||||
sheerka.new(
|
||||
BuiltinConcepts.PARSER_RESULT,
|
||||
parser=self,
|
||||
source=source,
|
||||
source=text.source,
|
||||
body=choice,
|
||||
try_parsed=None))
|
||||
)
|
||||
|
||||
if len(ret) == 1:
|
||||
self.log_result(context, source, ret[0])
|
||||
self.log_result(context, text.source, ret[0])
|
||||
return ret[0]
|
||||
else:
|
||||
self.log_multiple_results(context, source, ret)
|
||||
self.log_multiple_results(context, text.source, ret)
|
||||
return ret
|
||||
|
||||
@staticmethod
def get_concepts_nodes(context, index, token):
    """
    Try to recognize *token* as a concept from the universe of all known
    concepts.

    :param context: parsing context (provides concept lookup)
    :param index: token position, used as both start and end of the node
    :param token: candidate token; only IDENTIFIER tokens qualify
    :return: a list of ConceptNode choices, or None when not a known concept
    """
    if token.type != TokenKind.IDENTIFIER:
        return None

    concept = context.new_concept(token.value)
    is_multiple = hasattr(concept, "__iter__")
    if not is_multiple and not context.sheerka.is_known(concept):
        return None

    candidates = concept if is_multiple else [concept]
    return [ConceptNode(c, index, index, [token], token.value) for c in candidates]
|
||||
|
||||
@staticmethod
def get_source_code_node(context, index, tokens):
    """
    Try to recognize a run of tokens as source code.
    For the time being, only Python is supported.

    Tries shrinking prefixes of ``tokens`` until one parses as a Python
    *expression* and evaluates without raising.

    :param context: parsing context handed to the PythonParser
    :param index: absolute index of ``tokens[0]`` in the full token stream
    :param tokens: candidate token run (may consist solely of an EOF token)
    :return: a SourceCodeNode covering the recognized prefix, or None
    """

    if len(tokens) == 0 or (len(tokens) == 1 and tokens[0].type == TokenKind.EOF):
        return None

    end_index = len(tokens)
    while end_index > 0:
        # A fresh parser per attempt — presumably PythonParser keeps state
        # between parses; TODO confirm whether this could be hoisted.
        parser = PythonParser()
        tokens_to_parse = tokens[:end_index]
        res = parser.parse(context, tokens_to_parse)
        if res.status:
            # only expression are accepted — a successful parse of a
            # non-expression aborts entirely (shorter prefixes not tried).
            ast_ = res.value.value.ast_
            if not isinstance(ast_, ast.Expression):
                return None
            try:
                # NOTE(review): eval of recognized source with empty
                # globals/locals — confirm this can never see untrusted
                # input, or sandbox it.
                compiled = compile(ast_, "<string>", "eval")
                eval(compiled, {}, {})
            except Exception:
                return None

            source = BaseParser.get_text_from_tokens(tokens_to_parse)
            # end is inclusive: index + prefix length - 1.
            return SourceCodeNode(res.value.value, index, index + end_index - 1, tokens_to_parse, source)
        end_index -= 1

    return None
|
||||
|
||||
+80
-1
@@ -5,6 +5,8 @@ from dataclasses import dataclass, field
|
||||
import ast
|
||||
import logging
|
||||
|
||||
from parsers.ConceptLexerParser import ConceptNode
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -22,7 +24,7 @@ class PythonNode(Node):
|
||||
def __init__(self, source, ast_, concepts=None):
|
||||
self.source = source
|
||||
self.ast_ = ast_
|
||||
self.concepts = concepts or {}
|
||||
self.concepts = concepts or {} # when concepts are recognized in the expression
|
||||
|
||||
# def __repr__(self):
|
||||
# return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
|
||||
@@ -133,3 +135,80 @@ class PythonGetNamesVisitor(ast.NodeVisitor):
|
||||
|
||||
def visit_Name(self, node):
    # Record every identifier encountered while walking the expression tree.
    self.names.add(node.id)
|
||||
|
||||
class LexerNodeParserHelperForPython:
    """Helper class to parse a mix of concept nodes and raw Python source.

    Concept nodes are substituted by synthetic ``__C__...__C__`` Python
    identifiers so the combined text can be fed to the PythonParser; the
    mapping from identifier back to concept is attached to the resulting
    PythonNode.
    """

    def __init__(self):
        # Cache of already created identifiers, keyed by id(concept) so the
        # same concept object always maps to the same identifier.
        self.identifiers = {}
        # Collision counter per identifier root (prefix), used to suffix
        # distinct concepts that happen to share a name.
        self.identifiers_key = {}

    def _get_identifier(self, concept):
        """
        Get a Python identifier for a concept.

        Guarantees: the same concept object always yields the same
        identifier; distinct concepts with the same name yield different
        identifiers (via a ``_N`` suffix).

        Internal function because I don't want identifiers, identifiers_key
        and python_ids_mappings to be instance variables of the parser —
        I would like to keep this parser as stateless as possible.

        :param concept: concept to encode (uses ``.key``, ``.name``, ``.id``)
        :return: a valid Python identifier wrapped in ``__C__`` markers
        """
        if id(concept) in self.identifiers:
            return self.identifiers[id(concept)]

        identifier = "__C__" + self._sanitize(concept.key or concept.name)
        if concept.id:
            identifier += "__" + concept.id

        if identifier in self.identifiers_key:
            # Same root already used by a different concept: disambiguate.
            self.identifiers_key[identifier] += 1
            identifier += f"_{self.identifiers_key[identifier]}"
        else:
            self.identifiers_key[identifier] = 0

        identifier += "__C__"

        self.identifiers[id(concept)] = identifier
        return identifier

    @staticmethod
    def _sanitize(identifier):
        """Replace every non-alphanumeric character with ``'0'`` so the
        result is usable inside a Python identifier."""
        # str.join instead of the original quadratic `res += c` loop.
        return "".join(c if c.isalnum() else "0" for c in identifier)

    def parse(self, context, nodes):
        """Parse a node sequence as Python, substituting concept nodes.

        :param context: parsing context (used for the logging sub-context)
        :param nodes: sequence of lexer nodes; ConceptNodes become synthetic
            identifiers, all other nodes contribute their raw source
        :return: a PythonNode (with ``.source`` restored to the original
            text and ``.concepts`` mapping identifiers back to concepts) on
            success, otherwise the error body of the failed parse
        """
        source = ""
        to_parse = ""
        concepts = {}  # the key is the synthetic Python identifier

        for node in nodes:
            source += node.source  # hoisted: both branches accumulated this
            if isinstance(node, ConceptNode):
                if to_parse:
                    to_parse += " "
                python_id = self._get_identifier(node.concept)
                to_parse += python_id
                concepts[python_id] = node.concept
            else:
                to_parse += node.source

        with context.push(self, desc="Trying Python for '" + to_parse + "'") as sub_context:
            sub_context.add_inputs(to_parse=to_parse)
            python_parser = PythonParser()
            result = python_parser.parse(sub_context, to_parse)
            sub_context.add_values(return_values=result)

        if result.status:
            python_node = result.body.body
            # Restore the human-readable source and the identifier mapping.
            python_node.source = source
            python_node.concepts = concepts
            return python_node

        return result.body  # the error
|
||||
|
||||
@@ -37,6 +37,10 @@ class PythonWithConceptsParser(BaseParser):
|
||||
|
||||
def _get_identifier(c):
|
||||
"""
|
||||
Get an identifier for a concept.
|
||||
Make sure to return the same identifier if the same concept
|
||||
Make sure to return a different identifier if same name but different concept
|
||||
|
||||
Internal function because I don't want identifiers, identifiers_key and python_ids_mappings
|
||||
to be instance variables
|
||||
I would like to keep this parser as stateless as possible
|
||||
@@ -99,14 +103,3 @@ class PythonWithConceptsParser(BaseParser):
|
||||
self.name,
|
||||
False,
|
||||
result.body)
|
||||
|
||||
def concept_identifier(self, concept):
    """Return a ``__C__...__C__`` identifier for *concept*.

    NOTE(review): the computed identifier is never stored into
    ``self.identifiers``, so the cache check can only hit if callers
    populate the cache elsewhere — confirm.  Also, unlike
    ``LexerNodeParserHelperForPython._get_identifier``, this variant
    neither sanitizes the key nor disambiguates distinct concepts sharing
    a name, so collisions are possible.
    """
    if id(concept) in self.identifiers:
        return self.identifiers[id(concept)]

    identifier = "__C__" + (concept.key or concept.name)
    if concept.id:
        identifier += "__" + concept.id
    identifier += "__C__"

    return identifier
|
||||
|
||||
Reference in New Issue
Block a user