Fixed SyaNodeParser false positive recognition issue

2020-05-15 10:36:05 +02:00
parent 6e343ba996
commit 5489ef00b9
24 changed files with 484 additions and 5741 deletions
+193 -78
@@ -1,15 +1,16 @@
from collections import namedtuple
from dataclasses import dataclass, field
from operator import attrgetter
from typing import List
from core import builtin_helpers
from core.builtin_concepts import BuiltinConcepts
from core.concept import VARIABLE_PREFIX, Concept, DEFINITION_TYPE_BNF
from core.concept import Concept, DEFINITION_TYPE_BNF
from core.sheerka.ExecutionContext import ExecutionContext
from core.tokenizer import Token, TokenKind
from core.tokenizer import Token, TokenKind, Tokenizer
from parsers.BaseNodeParser import UnrecognizedTokensNode, ConceptNode, SourceCodeNode, SyaAssociativity, \
SourceCodeWithConceptNode, BaseNodeParser
from parsers.BaseParser import ErrorNode, UnexpectedTokenErrorNode
from parsers.BaseParser import ErrorNode
PARSERS = ["BnfNode", "AtomNode", "Python"]
@@ -88,10 +89,13 @@ class SyaConceptParserHelper:
concept: Concept
start: int # position of the token in the tokenizer (Caution, it is not token.index)
end: int = field(default=-1, repr=False, compare=False, hash=None)
expected: List[str] = field(default_factory=list, repr=False, compare=False, hash=None)
expected: List[Token] = field(default_factory=list, repr=False, compare=False, hash=None)
expected_parameters_before_first_token: int = field(default=0, repr=False, compare=False, hash=None)
last_token_before_first_token: Token = field(default=None, repr=False, compare=False, hash=None)
potential_pos: int = field(default=-1, repr=False, compare=False, hash=None)
parameters_list_at_init: list = field(default_factory=list, repr=False, compare=False, hash=None)
tokens: List[Token] = field(default_factory=list, repr=False, compare=False, hash=None) # tokens eaten
remember_whitespace: Token = field(default=None, repr=False, compare=False, hash=None)
error: str = None
def __post_init__(self):
@@ -99,17 +103,20 @@ class SyaConceptParserHelper:
if self.end == -1:
self.end = self.start
first_keyword_found = False
for name in concept.key.split():
if not name.startswith(VARIABLE_PREFIX) and not first_keyword_found:
first_keyword_found = True
first_keyword_found = None
for token in Tokenizer(concept.key, yield_eof=False):
if not first_keyword_found and token.type != TokenKind.WHITESPACE and token.type != TokenKind.VAR_DEF:
first_keyword_found = token
if first_keyword_found:
self.expected.append(name)
self.expected.append(token)
else:
self.expected_parameters_before_first_token += 1
self.last_token_before_first_token = token
if token.type != TokenKind.WHITESPACE:
self.expected_parameters_before_first_token += 1
self.eat_token() # remove the fist token
self.eat_token(first_keyword_found) # remove the first token
self.tokens.append(first_keyword_found)
def is_matched(self):
return len(self.expected) == 0
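A minimal standalone sketch (not the project's API) of what the new __post_init__ does: tokenize the concept key, count the parameters that come before the first real keyword, and keep every token from that keyword onwards as the 'expected' sequence (the real helper then immediately eats that first keyword). Tok and tokenize below are simplified stand-ins for core.tokenizer.

from dataclasses import dataclass

WHITESPACE, VAR_DEF, KEYWORD = "whitespace", "var_def", "keyword"

@dataclass
class Tok:
    type: str
    str_value: str

def tokenize(key):
    words = key.split(" ")
    for i, word in enumerate(words):
        yield Tok(VAR_DEF if word.startswith("<") else KEYWORD, word)
        if i < len(words) - 1:
            yield Tok(WHITESPACE, " ")

def split_key(key):
    prefix_params, expected, first = 0, [], None
    for token in tokenize(key):
        if first is None and token.type not in (WHITESPACE, VAR_DEF):
            first = token  # first real keyword of the concept
        if first is not None:
            expected.append(token)  # keywords/vars still to be matched
        elif token.type != WHITESPACE:
            prefix_params += 1  # parameters expected before the first keyword
    return prefix_params, expected

params, expected = split_key("<var0> plus <var1>")
print(params, [t.str_value for t in expected])  # 1 ['plus', ' ', '<var1>']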
@@ -117,23 +124,38 @@ class SyaConceptParserHelper:
def is_atom(self):
return len(self.concept.concept.metadata.variables) == 0 and len(self.expected) == 0
def is_expected(self, token):
if self.is_matched():
def is_next(self, token):
if self.is_matched() or len(self.expected) == 0:
return False
token_value = BaseNodeParser.get_token_value(token)
# True if the next token is the one that is expected
# Or if the next token is a whitespace and the expected one is the one after
# (whitespace is sometimes not mandatory)
return token.str_value == self.expected[0].str_value or \
self.expected[0].type == TokenKind.WHITESPACE and token.str_value == self.expected[1].str_value
def is_expected(self, token):
if self.is_matched() or token.type == TokenKind.WHITESPACE:
return False
for expected in self.expected:
if not expected.startswith(VARIABLE_PREFIX) and expected == token_value:
if expected.type != TokenKind.VAR_DEF and expected.str_value == token.str_value:
return True
return False
def expected_parameters(self):
return sum(map(lambda e: e.startswith(VARIABLE_PREFIX), self.expected))
return sum(map(lambda e: e.type == TokenKind.VAR_DEF, self.expected))
def eat_token(self):
# No check, as it is used only after is_expected
def eat_token(self, until_token):
"""
Eat the expected tokens up to and including 'until_token'
:param until_token: the last expected token to remove
:return:
"""
# No check, as it is used only after is_expected() or is_next()
while self.expected[0].str_value != until_token.str_value:
del self.expected[0]
del self.expected[0]
# return True if a whole sequence of keywords is eaten
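An illustrative sketch (simplified stand-ins, not the real Token class) of the difference between the two checks: is_next() only accepts the head of the expected sequence, optionally skipping one leading whitespace, while is_expected() accepts a keyword appearing anywhere in the remainder.

from collections import namedtuple

Tok = namedtuple("Tok", "type str_value")
WS, KW, VAR = "whitespace", "keyword", "var_def"

def is_next(expected, token):
    if not expected:
        return False
    if token.str_value == expected[0].str_value:
        return True
    # whitespace is sometimes optional: allow matching the token after it
    return expected[0].type == WS and len(expected) > 1 \
        and token.str_value == expected[1].str_value

def is_expected(expected, token):
    if not expected or token.type == WS:
        return False
    return any(e.type != VAR and e.str_value == token.str_value for e in expected)

expected = [Tok(WS, " "), Tok(KW, "then"), Tok(WS, " "), Tok(VAR, "<var1>")]
then, other = Tok(KW, "then"), Tok(KW, "else")
print(is_next(expected, then))      # True: 'then' follows the optional whitespace
print(is_expected(expected, then))  # True: 'then' is somewhere in the remainder
print(is_next(expected, other))     # False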
@@ -143,7 +165,10 @@ class SyaConceptParserHelper:
if len(self.expected) == 0:
return True
return self.expected[0].startswith(VARIABLE_PREFIX)
# also return True at the end of a name sequence
# ... <var0> bar baz qux <var1>
# return True after 'qux', to indicate that all the parameters from <var0> must be processed
return self.expected[0].type == TokenKind.VAR_DEF
def eat_parameter(self, parameter):
if self.is_matched() and parameter == self:
@@ -153,7 +178,7 @@ class SyaConceptParserHelper:
self.error = "No more parameter expected"
return
if not self.expected[0].startswith(VARIABLE_PREFIX):
if self.expected[0].type != TokenKind.VAR_DEF:
self.error = "Parameter was not expected"
return
@@ -202,6 +227,7 @@ class InFixToPostFix:
self.errors = [] # Not quite sure that I can handle more than one error
self.debug = []
self.false_positives = [] # concepts that look like known ones, but are not (for debug purposes)
self.forked = [] # used to fork InFixToPostFix when multiple parsers recognize the unrecognized_tokens
def __repr__(self):
@@ -245,7 +271,6 @@ class InFixToPostFix:
Note that when we are parsing unrecognized tokens,
we consider the parentheses to be part of the unrecognized tokens
:param token:
:param stack:
:return:
"""
return isinstance(token, Token) and token.type == TokenKind.RPAR
@@ -268,10 +293,10 @@ class InFixToPostFix:
:return:
"""
if isinstance(item, SyaConceptParserHelper) and len(item.expected) > 0 and not item.error:
if item.expected[0].startswith(VARIABLE_PREFIX):
if item.expected[0].type == TokenKind.VAR_DEF:
item.error = "Not enough suffix parameters"
else:
item.error = f"token '{item.expected[0]}' not found"
item.error = f"token '{item.expected[0].str_value}' not found"
if isinstance(item, SyaConceptParserHelper) and item.potential_pos != -1:
self.out.insert(item.potential_pos, item)
@@ -328,6 +353,16 @@ class InFixToPostFix:
).pseudo_fix_source()
return source_code
def _transform_to_unrecognized(self, parser_helper):
# an UnrecognizedTokensNode may have been sent to out prematurely; take it back
if len(self.out) > 0 and isinstance(self.out[-1], UnrecognizedTokensNode):
self.unrecognized_tokens = self.out.pop()
if parser_helper.remember_whitespace:
self.unrecognized_tokens.add_token(parser_helper.remember_whitespace, parser_helper.start - 1)
for i, token in enumerate(parser_helper.tokens):
self.unrecognized_tokens.add_token(token, parser_helper.start + i)
def get_errors(self):
res = []
res.extend(self.errors)
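A hedged sketch of the recovery path added here (illustrative names, not the project's API): when a helper turns out to be a false positive, the tokens it already consumed go back into the unrecognized buffer in source order, restoring the remembered leading whitespace first.

class UnrecognizedBuffer:
    def __init__(self):
        self.tokens = []  # (position, token) pairs kept in source order

    def add_token(self, token, pos):
        self.tokens.append((pos, token))
        self.tokens.sort(key=lambda pair: pair[0])

def transform_to_unrecognized(buffer, eaten_tokens, start, remembered_ws=None):
    if remembered_ws is not None:
        # the whitespace swallowed just before the match comes back too
        buffer.add_token(remembered_ws, start - 1)
    for i, token in enumerate(eaten_tokens):
        buffer.add_token(token, start + i)

buf = UnrecognizedBuffer()
transform_to_unrecognized(buf, ["foo", "bar"], start=10, remembered_ws=" ")
print(buf.tokens)  # [(9, ' '), (10, 'foo'), (11, 'bar')]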
@@ -343,28 +378,28 @@ class InFixToPostFix:
self.is_locked = False
def manage_parameters_when_new_concept(self, temp_concept_node):
def manage_parameters_when_new_concept(self, parser_helper):
"""
When a new concept is created, we need to check what to do with the parameters
that were queued
:param temp_concept_node: new concept
:param parser_helper: new concept
:return:
"""
if len(self.parameters_list) < temp_concept_node.expected_parameters_before_first_token:
if len(self.parameters_list) < parser_helper.expected_parameters_before_first_token:
# The new concept expects some prefix parameters, but there are not enough
temp_concept_node.error = "Not enough prefix parameters"
parser_helper.error = "Not enough prefix parameters"
return
if len(self.parameters_list) > temp_concept_node.expected_parameters_before_first_token:
if len(self.parameters_list) > parser_helper.expected_parameters_before_first_token:
# There are more parameters than needed by the new concept
# The others are either
# - parameters for the previous concept (if any)
# - concepts on their own
# - syntax error
# In all cases, the only thing that matters is to pop what is expected by the new concept
for i in range(temp_concept_node.expected_parameters_before_first_token):
for i in range(parser_helper.expected_parameters_before_first_token):
self.parameters_list.pop()
temp_concept_node.parameters_list_at_init.extend(self.parameters_list)
parser_helper.parameters_list_at_init.extend(self.parameters_list)
return
# len(self.parameters_list) == parser_helper.expected_parameters_before_first_token
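A simplified model (illustrative, not the real method) of the three cases handled above: fewer queued parameters than needed is an error; more than needed means the new concept takes what it expects and the leftovers are recorded; an exact match consumes the whole queue.

def take_prefix_parameters(queued, needed):
    if len(queued) < needed:
        return None, None, "Not enough prefix parameters"
    # pop the last 'needed' parameters for the new concept; the leftovers
    # may belong to an enclosing concept, be concepts on their own,
    # or be a syntax error
    taken = queued[len(queued) - needed:]
    leftover = queued[:len(queued) - needed]
    return taken, leftover, None

print(take_prefix_parameters(["a", "b", "c"], 1))  # (['c'], ['a', 'b'], None)
print(take_prefix_parameters(["a"], 2))            # error case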
@@ -385,14 +420,18 @@ class InFixToPostFix:
:return:
"""
# manage parentheses that didn't find any match
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
# The parameter must be part of the current concept being parsed
assert len(self._concepts()) != 0 # sanity check
current_concept = self._concepts()[-1]
while len(current_concept.expected) > 0 and current_concept.expected[0].startswith(VARIABLE_PREFIX):
while len(current_concept.expected) > 0 and current_concept.expected[0].type == TokenKind.VAR_DEF:
# eat everything that was expected
if len(self.parameters_list) == 0:
# current_concept.error = f"Failed to match parameter '{current_concept.expected[0]}'"
current_concept.error = f"Failed to match parameter '{current_concept.expected[0].str_value}'"
return
del self.parameters_list[0]
del current_concept.expected[0]
@@ -506,6 +545,11 @@ class InFixToPostFix:
if stack.associativity == SyaAssociativity.No and current.associativity == SyaAssociativity.No:
self._add_error(NoneAssociativeSequenceErrorNode(current.concept, stack_head.start, concept_node.start))
if not current.precedence:
# precedence is not set (None or zero)
# Do not apply any rule
return False
if current.associativity == SyaAssociativity.Left and current.precedence <= stack.precedence:
return True
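The early return added above means "no precedence, no rule". A sketch of the classic shunting-yard decision it belongs to, using plain (precedence, associativity) tuples instead of SyaConceptParserHelper:

LEFT, RIGHT = "left", "right"

def i_can_pop(stack_top, current):
    # each operator is a (precedence, associativity) pair
    if not current[0]:  # precedence is None or zero: do not apply any rule
        return False
    if current[1] == LEFT:
        return current[0] <= stack_top[0]
    return current[0] < stack_top[0]  # right-associative case

plus, times = (1, LEFT), (2, LEFT)
print(i_can_pop(times, plus))       # True: '*' on the stack binds tighter than '+'
print(i_can_pop(plus, times))       # False: incoming '*' binds tighter, keep it stacked
print(i_can_pop(times, (0, LEFT)))  # False: no precedence set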
@@ -528,9 +572,55 @@ class InFixToPostFix:
:return:
"""
def _pop_stack(c):
while self.stack[-1] != c and not self._is_lpar(c):
self.pop_stack_to_out()
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return False
# Manage concepts ending with long names
if self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
for current_concept in reversed(self._concepts()):
# As I may lose memory again ;-)
# it's a reversed loop to manage cases like
# if a plus b then ...
# The current concept is 'plus', but the token is 'then'
# It means that I have finished parsing the 'plus' and started the second part of the 'if'
if current_concept.is_next(token):
current_concept.end = pos
current_concept.tokens.append(token)
if current_concept.eat_token(token):
_pop_stack(current_concept)
return True
if len(current_concept.expected) > 0 and current_concept.expected[0].type != TokenKind.VAR_DEF:
if current_concept.expected[0].type == TokenKind.WHITESPACE:
# drop it. It's the case where an optional whitespace is missing
del (current_concept.expected[0])
else:
# error
# We are not parsing the concept we thought we were parsing.
# Transform the eaten tokens into unrecognized
# and discard the current SyaConceptParserHelper
# TODO: manage the pending LPAR, RPAR ?
self._transform_to_unrecognized(current_concept)
self.false_positives.append(current_concept)
self.stack.pop()
return False
if current_concept.is_expected(token):
# Fix the whitespace between var and expected if needed
# current_concept.expected[0] is '<var>'
# current_concept.expected[1] is what separates var from expected (normally a whitespace)
if current_concept.expected[1].type == TokenKind.WHITESPACE:
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
current_concept.end = pos
self.manage_unrecognized()
# manage that some clones may have been forked
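A toy version (illustrative only) of the reversed scan described in the comments above: for "if a plus b then ...", when 'then' arrives, the innermost helper on the stack is 'plus', but 'then' belongs to the enclosing 'if <c> then <e>' concept, so the scan starts from the innermost helper and walks outwards.

def find_owner(stack, token):
    # stack entries: (concept name, next expected keyword or None)
    for name, next_keyword in reversed(stack):
        if next_keyword == token:
            return name
    return None

stack = [("if <c> then <e>", "then"), ("<a> plus <b>", None)]
print(find_owner(stack, "then"))  # 'if <c> then <e>': the outer concept resumes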
@@ -550,36 +640,33 @@ class InFixToPostFix:
self.parameters_list[:]))
return True # no need to continue
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1] != current_concept:
current = self.stack[-1]
if current.error:
self._transform_to_unrecognized(current)
self.false_positives.append(current)
self.stack.pop()
if current_concept.expected[1].type == TokenKind.WHITESPACE:
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
self.manage_unrecognized()
# manage that some clones may have been forked
for forked in self.forked:
forked.handle_expected_token(token, pos)
else:
self.pop_stack_to_out()
self.manage_parameters()
if current_concept.eat_token():
while self.stack[-1] != current_concept and not self._is_lpar(current_concept):
self.pop_stack_to_out()
# maybe eat whitespace that was between <var> and expected token
if current_concept.expected[0].type == TokenKind.WHITESPACE:
del current_concept.expected[0]
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return False
# Manage concepts ending with long names
if self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
if current_concept.eat_token(token):
_pop_stack(current_concept)
return True
# else:
# if token.type != TokenKind.WHITESPACE:
# # hack, because whitespaces are not correctly parsed in self.expected
# # KSI 2020/04/25
# # I no longer understand why we are in a loop (the reverse one)
# # if we are parsing a concept and the expected token does not match
# # The whole class should be in error
# self._add_error(UnexpectedTokenErrorNode(
# f"Failed to parse '{current_concept.concept.concept}'",
# token, current_concept.expected))
# return False
return False
def eat_token(self, token, pos):
@@ -692,10 +779,11 @@ class InFixToPostFix:
return False
def eat_concept(self, sya_concept_def, pos):
def eat_concept(self, sya_concept_def, token, pos):
"""
a concept is found
:param sya_concept_def:
:param token:
:param pos:
:return:
"""
@@ -704,37 +792,43 @@ class InFixToPostFix:
return
self.debug.append(sya_concept_def)
temp_concept_node = SyaConceptParserHelper(sya_concept_def, pos)
parser_helper = SyaConceptParserHelper(sya_concept_def, pos)
if self.unrecognized_tokens.last_token_type() == TokenKind.WHITESPACE:
parser_helper.remember_whitespace = self.unrecognized_tokens.tokens[-1]
if Token.is_whitespace(parser_helper.last_token_before_first_token):
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
# First, try to recognize the tokens that are waiting
self.manage_unrecognized()
for forked in self.forked:
# manage the fact that some clones may have been forked
forked.eat_concept(sya_concept_def, pos)
forked.eat_concept(sya_concept_def, token, pos)
# then, check if this new concept is linked to the previous ones
# ie, is the previous concept fully matched ?
if temp_concept_node.expected_parameters_before_first_token == 0:
if parser_helper.expected_parameters_before_first_token == 0:
# => does not expect pending parameters (it's a suffixed concept)
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].potential_pos != -1:
# => previous seems to have everything it needs in the parameter list
self.pop_stack_to_out()
if temp_concept_node.is_atom():
self._put_to_out(temp_concept_node.fix_concept())
if parser_helper.is_atom():
self._put_to_out(parser_helper.fix_concept())
else:
# call shunting yard algorithm
while self.i_can_pop(temp_concept_node):
while self.i_can_pop(parser_helper):
self.pop_stack_to_out()
if temp_concept_node.is_matched():
if parser_helper.is_matched():
# case of a prefix concept which has found happiness with self.parameters_list
# directly put it in out
self.manage_parameters_when_new_concept(temp_concept_node)
self._put_to_out(temp_concept_node.fix_concept())
self.manage_parameters_when_new_concept(parser_helper)
self._put_to_out(parser_helper.fix_concept())
else:
self.stack.append(temp_concept_node)
self.manage_parameters_when_new_concept(temp_concept_node)
self.stack.append(parser_helper)
self.manage_parameters_when_new_concept(parser_helper)
def eat_unrecognized(self, token, pos):
"""
@@ -762,18 +856,34 @@ class InFixToPostFix:
if len(self.stack) == 0 and len(self.out) == 0:
return # no need to pop the buffer, as no concept is found
while len(self.stack) > 0:
parser_helper = self.stack[-1]
# validate parenthesis
if self._is_lpar(parser_helper) or self._is_rpar(parser_helper):
self._add_error(ParenthesisMismatchErrorNode(parser_helper))
return None
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
failed_to_match = sum(map(lambda e: e.type != TokenKind.VAR_DEF, parser_helper.expected))
if failed_to_match > 0:
# didn't manage to read all tokens.
# Transform them into unrecognized
self._transform_to_unrecognized(parser_helper)
self.false_positives.append(parser_helper)
self.stack.pop() # discard the parser helper
else:
self.pop_stack_to_out() # process it
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
while len(self.stack) > 0:
if self._is_lpar(self.stack[-1]) or self._is_rpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return None
self.pop_stack_to_out()
def clone(self):
clone = InFixToPostFix(self.context)
clone.is_locked = self.is_locked
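A sketch of the new finalize() filter (simplified data, illustrative names): a helper that still expects real keywords at end of input was a false positive and is discarded; a helper missing only variables is processed normally.

VAR_DEF = "var_def"

def finalize(stack):
    out, false_positives = [], []
    while stack:
        helper = stack.pop()
        unmatched_keywords = sum(
            1 for kind in helper["expected"] if kind != VAR_DEF)
        if unmatched_keywords:
            false_positives.append(helper)  # its tokens go back to unrecognized
        else:
            out.append(helper)
    return out, false_positives

stack = [{"expected": ["keyword"]}, {"expected": [VAR_DEF]}]
out, fps = finalize(stack)
print(len(out), len(fps))  # 1 1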
@@ -975,7 +1085,7 @@ class SyaNodeParser(BaseNodeParser):
try:
if token.type in (TokenKind.LPAR, TokenKind.RPAR):
# little optim, no need to get the concept when parenthesis
# little optim, no need to lock, unlock or get the concept when the token is a parenthesis
for infix_to_postfix in res:
infix_to_postfix.eat_token(token, self.pos)
continue
@@ -992,7 +1102,7 @@ class SyaNodeParser(BaseNodeParser):
if len(concepts) == 1:
for infix_to_postfix in res:
infix_to_postfix.eat_concept(concepts[0], self.pos)
infix_to_postfix.eat_concept(concepts[0], token, self.pos)
continue
# make the cartesian product
@@ -1001,7 +1111,7 @@ class SyaNodeParser(BaseNodeParser):
for concept in concepts:
clone = infix_to_postfix.clone()
temp_res.append(clone)
clone.eat_concept(concept, self.pos)
clone.eat_concept(concept, token, self.pos)
res = temp_res
finally:
@@ -1100,6 +1210,11 @@ class SyaNodeParser(BaseNodeParser):
to_insert = item
sequence.insert(0, to_insert)
if has_unrecognized:
# Manage some sick cases where missing parentheses mess the order of the sequence
# example "foo bar(one plus two"
sequence.sort(key=attrgetter("start"))
ret.append(
self.sheerka.ret(
self.name,
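A minimal reproduction of the reordering fix above: nodes gathered out of order because of a missing parenthesis are put back in source order by their 'start' position.

from dataclasses import dataclass
from operator import attrgetter

@dataclass
class Node:
    name: str
    start: int

sequence = [Node("plus", 12), Node("foo bar", 0), Node("one", 8)]
sequence.sort(key=attrgetter("start"))
print([node.name for node in sequence])  # ['foo bar', 'one', 'plus']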