Enhanced complex concepts handling

2020-01-11 08:03:35 +01:00
parent a62c1f0f13
commit 40416ac337
24 changed files with 1647 additions and 961 deletions
@@ -34,10 +34,10 @@ def flatten(iterable):

@dataclass()
 class LexerNode(Node):
-    start: int
-    end: int
-    tokens: list = None
-    source: str = None
+    start: int  # starting index in the tokens list
+    end: int  # ending index in the tokens list
+    tokens: list = None  # tokens
+    source: str = None  # string representation of what was parsed

    def __post_init__(self):
        if self.source is None:
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
    def fix_source(self):
        self.source = BaseParser.get_text_from_tokens(self.tokens)

+    def not_whitespace(self):
+        """
+        Tells whether this node holds something other than a single blank token.
+
+        Returns False only when the node contains exactly one token and that token
+        is a WHITESPACE or a NEWLINE. Returns True in every other case, which
+        includes a node made of several whitespace tokens.
+        """
+        return not (len(self.tokens) == 1 and self.tokens[0].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE))
+
    def __eq__(self, other):
+        if isinstance(other, tuple):
+            if len(other) != 3:
+                return False
+            return self.start == other[0] and self.end == other[1] and self.source == other[2]
+
        if not isinstance(other, UnrecognizedTokensNode):
            return False

@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
    def __eq__(self, other):
        if isinstance(other, tuple):
            if len(other) == 2:
-                return self.concept == other[0] and self.source == other[1]
+                return self.concept.key == other[0] and self.source == other[1]
            else:
-                return self.concept == other[0] and \
+                return self.concept.key == other[0] and \
                       self.start == other[1] and \
                       self.end == other[2] and \
                       self.source == other[3]
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):

        self.token = None
        self.pos = -1
-        self.next_token()
+        self.next_token(False)
        return True

    def get_token(self) -> Token:
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
                self.seek(init_pos)
                node = grammar.parse(self)  # a node is TerminalNode or NonTerminalNode
                if node is not None and node.end != -1:
+                    updated_concept = self.finalize_concept(context.sheerka, concept, node)
                    concept_node = ConceptNode(
-                        concept,
+                        updated_concept,
                        node.start,
                        node.end,
                        self.tokens[node.start: node.end + 1],
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
                    unrecognized_tokens.add_token(self.get_token(), init_pos)
                else:
                    unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
-                    concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
-                    has_unrecognized = True

                if not self.next_token(False):
                    break

            else:  # some concepts are recognized
-                if unrecognized_tokens:
+                if unrecognized_tokens and unrecognized_tokens.not_whitespace():
                    unrecognized_tokens.fix_source()
-                    unrecognized_tokens = None
+                    concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
+                    has_unrecognized = True
+                unrecognized_tokens = None
+
                res = self.get_bests(res)  # only keep the concepts that eat the more tokens
                concepts_found = core.utils.product(concepts_found, res)

                # loop
                self.seek(res[0].end)
-                if not self.next_token():
+                if not self.next_token(False):
                    break

        # Fix the source for unrecognized tokens
-        if unrecognized_tokens:
+        if unrecognized_tokens and unrecognized_tokens.not_whitespace():
            unrecognized_tokens.fix_source()
+            concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
+            has_unrecognized = True

        # else
        # returns as many ReturnValue than choices found
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
            self.log_multiple_results(context, text, ret)
            return ret

+    def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
+        """
+        Creates a new concept from a template and fills it from a parse node.
+
+        The concept is instantiated with sheerka.new(), using the template key
+        (together with the template id when there is one).
+        When init_empty_body is true and the new concept has no body, the body is set
+        to the value of `underlying` and the concept is flagged as evaluated.
+        Then every node found under `underlying` whose parsing expression has a
+        rule_name is recorded as a property of the concept, under that name.
+        Goes in recursion if the property is a concept
+
+        :param sheerka: object used to instantiate the concept (only sheerka.new is called)
+        :param template: concept whose key / id designate the concept to create
+        :param underlying: parse node matched for the concept
+        :param init_empty_body: when True, an empty body is initialised from the underlying value
+        :return: the newly created concept
+        """
+
+        # this cache is to make sure that we return the same concept for the same ConceptMatch
+        # It is keyed by id(node) and is local to this call: a recursive call to
+        # finalize_concept starts with its own empty cache
+        _underlying_value_cache = {}
+
+        def _add_prop(_concept, prop_name, value):
+            """
+            Adds a new entry,
+            makes a list if the property already exists
+            """
+            if prop_name not in _concept.props or _concept.props[prop_name].value is None:
+                # new entry
+                _concept.set_prop(prop_name, value)
+            else:
+                # make a list if there was a value
+                previous_value = _concept.props[prop_name].value
+                if isinstance(previous_value, list):
+                    # already a list: it is extended in place
+                    previous_value.append(value)
+                else:
+                    new_value = [previous_value, value]
+                    _concept.set_prop(prop_name, new_value)
+
+        def _look_for_concept_match(_underlying):
+            """
+            Follows a chain of single-child NonTerminalNode starting at _underlying and
+            returns the first node whose parsing expression is a ConceptMatch.
+            Returns None when the chain ends or branches before such a node is found
+            """
+            if isinstance(_underlying.parsing_expression, ConceptMatch):
+                return _underlying
+
+            if not isinstance(_underlying, NonTerminalNode):
+                return None
+
+            # more (or less) than one child: the node is not a simple wrapper
+            if len(_underlying.children) != 1:
+                return None
+
+            return _look_for_concept_match(_underlying.children[0])
+
+        def _get_underlying_value(_underlying):
+            """
+            Returns the value represented by a node: a finalized concept when the node
+            wraps a ConceptMatch, the source text of the node otherwise
+            """
+            concept_match_node = _look_for_concept_match(_underlying)
+            if concept_match_node:
+                if id(concept_match_node) in _underlying_value_cache:
+                    result = _underlying_value_cache[id(concept_match_node)]
+                else:
+                    # the referenced concept is built from the first child of the ConceptMatch node
+                    ref_tpl = concept_match_node.parsing_expression.concept
+                    result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
+                    _underlying_value_cache[id(concept_match_node)] = result
+            else:
+                result = _underlying.source
+
+            return result
+
+        def _process_rule_name(_concept, _underlying):
+            """
+            Records the value of _underlying as a property when its parsing expression
+            has a rule_name, then does the same for all its descendants
+            """
+            if _underlying.parsing_expression.rule_name:
+                value = _get_underlying_value(_underlying)
+                _add_prop(_concept, _underlying.parsing_expression.rule_name, value)
+
+            if isinstance(_underlying, NonTerminalNode):
+                for child in _underlying.children:
+                    _process_rule_name(_concept, child)
+
+        # a template with an id is looked up by (key, id), otherwise by key only
+        key = (template.key, template.id) if template.id else template.key
+        concept = sheerka.new(key)
+        if init_empty_body and concept.body is None:
+            value = _get_underlying_value(underlying)
+            concept.metadata.body = value
+            concept.metadata.is_evaluated = True
+            # the rule name of the root node is only recorded in this branch;
+            # the loop below starts at the children of the root
+            if underlying.parsing_expression.rule_name:
+                _add_prop(concept, underlying.parsing_expression.rule_name, value)
+
+        if isinstance(underlying, NonTerminalNode):
+            for node in underlying.children:
+                _process_rule_name(concept, node)
+
+        return concept
+
    @staticmethod
    def get_bests(results):
        """
@@ -92,7 +92,8 @@ class DefConceptNode(DefaultParserNode):
            if isinstance(prop_value, ReturnValueConcept) and isinstance(prop_value.body,
                                                                         ParserResultConcept) and hasattr(
                prop_value.body.body, "ast_"):
-                asts[part_key] = prop_value.body.body.ast_
+                asts[part_key] = prop_value
+                #asts[part_key] = prop_value.body.body.ast_
        return asts


@@ -46,7 +46,8 @@ class ExactConceptParser(BaseParser):
            if sheerka.isinstance(result, BuiltinConcepts.UNKNOWN_CONCEPT):
                continue

-            concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
+            # concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
+            concepts = result if isinstance(result, list) else [result]

            for concept in concepts:
                context.log(self.verbose_log, f"Recognized concept {concept}.", self.name)
@@ -0,0 +1,96 @@
+from core.builtin_concepts import BuiltinConcepts
+from core.tokenizer import TokenKind
+from parsers.BaseParser import BaseParser
+from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
+import core.utils
+
+concept_lexer_parser = ConceptLexerParser()
+
+
+class MultipleConceptsParser(BaseParser):
+    """
+    Parser that takes the result of ConceptLexerParser and
+    tries to resolve the unrecognized tokens token by token
+
+    It is a success when it returns a list of ConceptNode exclusively
+    """
+
+    def __init__(self, **kwargs):
+        BaseParser.__init__(self, "MultipleConcepts", 45)
+
+    @staticmethod
+    def _flush_unrecognized(unrecognized_tokens, source, nodes_found):
+        """
+        Closes a pending run of unrecognized tokens (if any).
+
+        The text of the run is always appended to the source; the node itself is only
+        added to the choices when it is more than a single whitespace / newline token.
+
+        :param unrecognized_tokens: pending UnrecognizedTokensNode, or None
+        :param source: source text rebuilt so far
+        :param nodes_found: choices built so far
+        :return: the updated (source, nodes_found)
+        """
+        if unrecognized_tokens:
+            unrecognized_tokens.fix_source()
+            source += unrecognized_tokens.source
+            if unrecognized_tokens.not_whitespace():
+                nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
+        return source, nodes_found
+
+    def parse(self, context, text):
+        """
+        Tries to turn the unrecognized tokens of a ConceptLexerParser result into concepts.
+
+        :param context: execution context (gives access to sheerka and to new_concept)
+        :param text: expected to be a PARSER_RESULT produced by ConceptLexerParser
+        :return: None when the input is not a result of ConceptLexerParser, otherwise
+                 one return value when there is a single choice, or the list of
+                 return values when there are several. The status of each return value
+                 is True only when every token was either a concept or whitespace
+        """
+        sheerka = context.sheerka
+        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
+            return None
+
+        if not text.parser == concept_lexer_parser:
+            return None
+
+        nodes_found = [[]]
+        source = ""
+        concepts_only = True
+
+        for node in text.value:
+            if not isinstance(node, UnrecognizedTokensNode):
+                # already recognized by ConceptLexerParser: kept as is
+                nodes_found = core.utils.product(nodes_found, [node])
+                source += node.source
+                continue
+
+            unrecognized_tokens = None
+            for i, token in enumerate(node.tokens):
+                index = node.start + i
+
+                if token.type == TokenKind.IDENTIFIER:
+                    # it may be a concept
+                    concept = context.new_concept(token.value)
+                    if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
+                        # finish processing unrecognized_tokens
+                        source, nodes_found = self._flush_unrecognized(unrecognized_tokens, source, nodes_found)
+                        unrecognized_tokens = None
+
+                        source += token.value
+                        concepts = concept if hasattr(concept, "__iter__") else [concept]
+                        concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
+                        nodes_found = core.utils.product(nodes_found, concepts_nodes)
+                        continue
+
+                # it cannot be a concept, or it is an identifier that no concept matches.
+                # Such an identifier must stay in the unrecognized tokens: dropping it would
+                # lose it from the source and wrongly report a concepts-only result
+                concepts_only &= token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
+
+                if unrecognized_tokens:
+                    unrecognized_tokens.add_token(token, index)
+                else:
+                    unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
+
+            # tokens still pending at the end of the node
+            source, nodes_found = self._flush_unrecognized(unrecognized_tokens, source, nodes_found)
+
+        # one return value per choice
+        ret = []
+        for choice in nodes_found:
+            ret.append(
+                sheerka.ret(
+                    self.name,
+                    concepts_only,
+                    sheerka.new(
+                        BuiltinConcepts.PARSER_RESULT,
+                        parser=self,
+                        source=source,
+                        body=choice,
+                        try_parsed=None))
+            )
+
+        if len(ret) == 1:
+            self.log_result(context, source, ret[0])
+            return ret[0]
+        else:
+            self.log_multiple_results(context, source, ret)
+            return ret
@@ -1,7 +1,7 @@
 from core.builtin_concepts import BuiltinConcepts
 from core.tokenizer import Tokenizer, LexerError, TokenKind
 from parsers.BaseParser import BaseParser, Node, ErrorNode
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import ast
 import logging

@@ -17,10 +17,12 @@ class PythonErrorNode(ErrorNode):
    #     self.log.debug("-> PythonErrorNode: " + str(self.exception))


-@dataclass()
 class PythonNode(Node):
-    source: str
-    ast_: ast.AST
+
+    def __init__(self, source, ast_, concepts=None):
+        self.source = source
+        self.ast_ = ast_
+        self.concepts = concepts or {}

    # def __repr__(self):
    #     return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
@@ -67,7 +69,7 @@ class PythonParser(BaseParser):
        tree = None

        python_switcher = {
-            TokenKind.CONCEPT: lambda t: f"__C__{t.value}__C__"
+            TokenKind.CONCEPT: lambda t: f"__C__USE_CONCEPT__{t.value}__C__"
        }

        try:
@@ -0,0 +1,116 @@
+from core.builtin_concepts import BuiltinConcepts
+from parsers.BaseParser import BaseParser
+from parsers.ConceptLexerParser import UnrecognizedTokensNode, ConceptNode
+from parsers.PythonParser import PythonParser
+
+
+class PythonWithConceptsParser(BaseParser):
+    """
+    Parser that takes a list of ConceptNode / UnrecognizedTokensNode, replaces every
+    concept by a generated Python identifier and hands the resulting text to PythonParser
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__("PythonWithConcepts", 20)
+        self.identifiers = None
+        self.identifiers_key = None
+
+    @staticmethod
+    def sanitize(identifier):
+        """
+        Returns the identifier where every non alphanumeric character is replaced by "0"
+        """
+        return "".join(c if c.isalnum() else "0" for c in identifier)
+
+    def parse(self, context, text):
+        """
+        Tries to parse a list of nodes as Python code.
+
+        :param context: execution context (gives access to sheerka and to push)
+        :param text: expected to be a PARSER_RESULT whose body is a non empty list
+                     starting with a ConceptNode or an UnrecognizedTokensNode
+        :return: None when the input does not have the expected shape, otherwise a
+                 return value: successful with the python node (its source and concepts
+                 mapping updated) when PythonParser succeeds, failed with the body of
+                 the PythonParser result otherwise
+        """
+        sheerka = context.sheerka
+        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
+            return None
+
+        nodes = text.body
+        if not isinstance(nodes, list):
+            return None
+
+        if len(nodes) == 0:
+            return None
+
+        if not isinstance(nodes[0], (ConceptNode, UnrecognizedTokensNode)):
+            return None
+
+        source = ""
+        to_parse = ""
+        identifiers = {}
+        identifiers_key = {}
+        python_ids_mappings = {}
+
+        def _get_identifier(c):
+            """
+            Internal function because I don't want identifiers, identifiers_key and python_ids_mappings
+            to be instance variables
+            I would like to keep this parser as stateless as possible
+            :param c: concept to name
+            :return: the Python identifier of the concept (the same one for the same object)
+            """
+            if id(c) in identifiers:
+                return identifiers[id(c)]
+
+            identifier = "__C__" + self.sanitize(c.key or c.name)
+            if c.id:
+                # formatted rather than concatenated, so that a non string id does not raise
+                identifier += f"__{c.id}"
+
+            if identifier in identifiers_key:
+                # same base name already given to another object: add a counter
+                identifiers_key[identifier] += 1
+                identifier += f"_{identifiers_key[identifier]}"
+            else:
+                identifiers_key[identifier] = 0
+
+            identifier += "__C__"
+
+            identifiers[id(c)] = identifier
+            return identifier
+
+        for node in nodes:
+            source += node.source
+            if isinstance(node, ConceptNode):
+                if to_parse:
+                    to_parse += " "
+                concept = node.concept
+                python_id = _get_identifier(concept)
+                to_parse += python_id
+                python_ids_mappings[python_id] = concept
+            else:
+                to_parse += node.source
+
+        with context.push(self, "Trying Python for '" + to_parse + "'") as sub_context:
+            python_parser = PythonParser()
+            result = python_parser.parse(sub_context, to_parse)
+
+        if result.status:
+            # the python node must show the original source, not the generated identifiers
+            python_node = result.body.body
+            python_node.source = source
+            python_node.concepts = python_ids_mappings
+
+            return sheerka.ret(
+                self.name,
+                True,
+                sheerka.new(
+                    BuiltinConcepts.PARSER_RESULT,
+                    parser=self,
+                    source=source,
+                    body=result.body.body,
+                    try_parsed=None))
+
+        else:
+
+            return sheerka.ret(
+                self.name,
+                False,
+                result.body)
+
+    def concept_identifier(self, concept):
+        """
+        Returns the identifier of a concept: the one stored in self.identifiers when
+        there is one, otherwise "__C__<key or name>[__<id>]__C__"
+
+        :param concept: concept to name
+        :return: the identifier, as a string
+        """
+        # self.identifiers is None until someone fills it: the membership test
+        # must not be attempted in that case (it would raise a TypeError)
+        if self.identifiers is not None and id(concept) in self.identifiers:
+            return self.identifiers[id(concept)]
+
+        identifier = "__C__" + (concept.key or concept.name)
+        if concept.id:
+            identifier += f"__{concept.id}"
+        identifier += "__C__"
+
+        return identifier