Enhanced handling of complex concepts

This commit is contained in:
2020-01-11 08:03:35 +01:00
parent a62c1f0f13
commit 40416ac337
24 changed files with 1647 additions and 961 deletions
+102 -14
View File
@@ -34,10 +34,10 @@ def flatten(iterable):
@dataclass()
class LexerNode(Node):
start: int
end: int
tokens: list = None
source: str = None
start: int # starting index in the tokens list
end: int # ending index in the tokens list
tokens: list = None # tokens
source: str = None # string representation of what was parsed
def __post_init__(self):
if self.source is None:
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
def fix_source(self):
    """Recompute this node's textual source from its token list."""
    text = BaseParser.get_text_from_tokens(self.tokens)
    self.source = text
def not_whitespace(self):
    """Return True unless the node holds exactly one whitespace/newline token."""
    if len(self.tokens) != 1:
        return True
    return self.tokens[0].type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) != 3:
return False
return self.start == other[0] and self.end == other[1] and self.source == other[2]
if not isinstance(other, UnrecognizedTokensNode):
return False
@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) == 2:
return self.concept == other[0] and self.source == other[1]
return self.concept.key == other[0] and self.source == other[1]
else:
return self.concept == other[0] and \
return self.concept.key == other[0] and \
self.start == other[1] and \
self.end == other[2] and \
self.source == other[3]
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):
self.token = None
self.pos = -1
self.next_token()
self.next_token(False)
return True
def get_token(self) -> Token:
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
if node is not None and node.end != -1:
updated_concept = self.finalize_concept(context.sheerka, concept, node)
concept_node = ConceptNode(
concept,
updated_concept,
node.start,
node.end,
self.tokens[node.start: node.end + 1],
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
if not self.next_token(False):
break
else: # some concepts are recognized
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
unrecognized_tokens = None
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# loop
self.seek(res[0].end)
if not self.next_token():
if not self.next_token(False):
break
# Fix the source for unrecognized tokens
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
# else
# returns as many ReturnValue than choices found
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
self.log_multiple_results(context, text, ret)
return ret
def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
    """
    Updates the properties of the concept.
    Goes in recursion if the property is a concept.

    Instantiates a new concept from *template* via ``sheerka.new(key)`` and
    fills its properties from the parse tree *underlying*: every node whose
    parsing expression carries a ``rule_name`` becomes a property, and a node
    that wraps a ConceptMatch is itself finalized recursively into a concept.

    :param sheerka: object providing ``new(key)`` to create concepts —
        presumably the concept store/registry; verify against caller
    :param template: concept template; ``template.key`` (and optional
        ``template.id``) identify what to instantiate
    :param underlying: parse node (TerminalNode or NonTerminalNode)
    :param init_empty_body: when True and the fresh concept has no body,
        the parsed value is stored as its metadata body
    :return: the newly created, populated concept
    """
    # this cache is to make sure that we return the same concept for the same ConceptMatch
    _underlying_value_cache = {}

    def _add_prop(_concept, prop_name, value):
        """
        Adds a new entry,
        makes a list if the property already exists
        """
        if prop_name not in _concept.props or _concept.props[prop_name].value is None:
            # new entry
            _concept.set_prop(prop_name, value)
        else:
            # make a list if there was a value
            previous_value = _concept.props[prop_name].value
            if isinstance(previous_value, list):
                previous_value.append(value)
            else:
                new_value = [previous_value, value]
                _concept.set_prop(prop_name, new_value)

    def _look_for_concept_match(_underlying):
        # Walk down single-child chains until a node whose parsing_expression
        # is a ConceptMatch is found; any branch point or terminal ends the
        # search with None.
        if isinstance(_underlying.parsing_expression, ConceptMatch):
            return _underlying
        if not isinstance(_underlying, NonTerminalNode):
            return None
        if len(_underlying.children) != 1:
            return None
        return _look_for_concept_match(_underlying.children[0])

    def _get_underlying_value(_underlying):
        # A ConceptMatch node yields a recursively finalized concept
        # (memoized by node identity); any other node yields its raw source text.
        concept_match_node = _look_for_concept_match(_underlying)
        if concept_match_node:
            if id(concept_match_node) in _underlying_value_cache:
                result = _underlying_value_cache[id(concept_match_node)]
            else:
                ref_tpl = concept_match_node.parsing_expression.concept
                result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
                _underlying_value_cache[id(concept_match_node)] = result
        else:
            result = _underlying.source
        return result

    def _process_rule_name(_concept, _underlying):
        # Named rules become concept properties; always recurse into children
        # so nested named rules are collected too.
        if _underlying.parsing_expression.rule_name:
            value = _get_underlying_value(_underlying)
            _add_prop(_concept, _underlying.parsing_expression.rule_name, value)
        if isinstance(_underlying, NonTerminalNode):
            for child in _underlying.children:
                _process_rule_name(_concept, child)

    # An id-qualified template instantiates under a (key, id) pair.
    key = (template.key, template.id) if template.id else template.key
    concept = sheerka.new(key)
    if init_empty_body and concept.body is None:
        value = _get_underlying_value(underlying)
        concept.metadata.body = value
        concept.metadata.is_evaluated = True
        # The root node's own rule_name (if any) is recorded here, reusing the
        # value already computed for the body.
        if underlying.parsing_expression.rule_name:
            _add_prop(concept, underlying.parsing_expression.rule_name, value)
    # NOTE(review): indentation reconstructed from a diff rendering — the block
    # below is assumed to run regardless of init_empty_body; confirm nesting.
    if isinstance(underlying, NonTerminalNode):
        for node in underlying.children:
            _process_rule_name(concept, node)
    return concept
@staticmethod
def get_bests(results):
"""