Fixed SyaNodeParser false positive recognition issue

2020-05-15 10:36:05 +02:00
parent 6e343ba996
commit 5489ef00b9
24 changed files with 484 additions and 5741 deletions
+193 -78
@@ -1,15 +1,16 @@
from collections import namedtuple
from dataclasses import dataclass, field
from operator import attrgetter
from typing import List
from core import builtin_helpers
from core.builtin_concepts import BuiltinConcepts
from core.concept import VARIABLE_PREFIX, Concept, DEFINITION_TYPE_BNF
from core.concept import Concept, DEFINITION_TYPE_BNF
from core.sheerka.ExecutionContext import ExecutionContext
from core.tokenizer import Token, TokenKind
from core.tokenizer import Token, TokenKind, Tokenizer
from parsers.BaseNodeParser import UnrecognizedTokensNode, ConceptNode, SourceCodeNode, SyaAssociativity, \
SourceCodeWithConceptNode, BaseNodeParser
from parsers.BaseParser import ErrorNode, UnexpectedTokenErrorNode
from parsers.BaseParser import ErrorNode
PARSERS = ["BnfNode", "AtomNode", "Python"]
@@ -88,10 +89,13 @@ class SyaConceptParserHelper:
concept: Concept
start: int # position of the token in the tokenizer (Caution, it is not token.index)
end: int = field(default=-1, repr=False, compare=False, hash=None)
expected: List[str] = field(default_factory=list, repr=False, compare=False, hash=None)
expected: List[Token] = field(default_factory=list, repr=False, compare=False, hash=None)
expected_parameters_before_first_token: int = field(default=0, repr=False, compare=False, hash=None)
last_token_before_first_token: Token = field(default=None, repr=False, compare=False, hash=None)
potential_pos: int = field(default=-1, repr=False, compare=False, hash=None)
parameters_list_at_init: list = field(default_factory=list, repr=False, compare=False, hash=None)
tokens: List[Token] = field(default_factory=list, repr=False, compare=False, hash=None) # tokens eaten
remember_whitespace: Token = field(default=None, repr=False, compare=False, hash=None)
error: str = None
def __post_init__(self):
@@ -99,17 +103,20 @@ class SyaConceptParserHelper:
if self.end == -1:
self.end = self.start
first_keyword_found = False
for name in concept.key.split():
if not name.startswith(VARIABLE_PREFIX) and not first_keyword_found:
first_keyword_found = True
first_keyword_found = None
for token in Tokenizer(concept.key, yield_eof=False):
if not first_keyword_found and token.type != TokenKind.WHITESPACE and token.type != TokenKind.VAR_DEF:
first_keyword_found = token
if first_keyword_found:
self.expected.append(name)
self.expected.append(token)
else:
self.expected_parameters_before_first_token += 1
self.last_token_before_first_token = token
if token.type != TokenKind.WHITESPACE:
self.expected_parameters_before_first_token += 1
self.eat_token() # remove the fist token
self.eat_token(first_keyword_found) # remove the first token
self.tokens.append(first_keyword_found)
def is_matched(self):
return len(self.expected) == 0
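A minimal standalone sketch (not the project's API) of what the new __post_init__ does: tokenize the concept key, count the parameters that come before the first real keyword, and keep every token from that keyword onwards as the 'expected' sequence (the real helper then immediately eats that first keyword). Tok and tokenize below are simplified stand-ins for core.tokenizer.

from dataclasses import dataclass

WHITESPACE, VAR_DEF, KEYWORD = "whitespace", "var_def", "keyword"

@dataclass
class Tok:
    type: str
    str_value: str

def tokenize(key):
    words = key.split(" ")
    for i, word in enumerate(words):
        yield Tok(VAR_DEF if word.startswith("<") else KEYWORD, word)
        if i < len(words) - 1:
            yield Tok(WHITESPACE, " ")

def split_key(key):
    prefix_params, expected, first = 0, [], None
    for token in tokenize(key):
        if first is None and token.type not in (WHITESPACE, VAR_DEF):
            first = token  # first real keyword of the concept
        if first is not None:
            expected.append(token)  # keywords/vars still to be matched
        elif token.type != WHITESPACE:
            prefix_params += 1  # parameters expected before the first keyword
    return prefix_params, expected

params, expected = split_key("<var0> plus <var1>")
print(params, [t.str_value for t in expected])  # 1 ['plus', ' ', '<var1>']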
@@ -117,23 +124,38 @@ class SyaConceptParserHelper:
def is_atom(self):
return len(self.concept.concept.metadata.variables) == 0 and len(self.expected) == 0
def is_expected(self, token):
if self.is_matched():
def is_next(self, token):
if self.is_matched() or len(self.expected) == 0:
return False
token_value = BaseNodeParser.get_token_value(token)
# True if the next token is the one that is expected
# Or if the next token is a whitespace and the expected one is the one after
# (whitespace is sometimes not mandatory)
return token.str_value == self.expected[0].str_value or \
self.expected[0].type == TokenKind.WHITESPACE and token.str_value == self.expected[1].str_value
def is_expected(self, token):
if self.is_matched() or token.type == TokenKind.WHITESPACE:
return False
for expected in self.expected:
if not expected.startswith(VARIABLE_PREFIX) and expected == token_value:
if expected.type != TokenKind.VAR_DEF and expected.str_value == token.str_value:
return True
return False
def expected_parameters(self):
return sum(map(lambda e: e.startswith(VARIABLE_PREFIX), self.expected))
return sum(map(lambda e: e.type == TokenKind.VAR_DEF, self.expected))
def eat_token(self):
# No check, as it is used only after is_expected
def eat_token(self, until_token):
"""
Eat the expected tokens up to and including 'until_token'
:param until_token: the last expected token to remove
:return:
"""
# No check, as it is used only after is_expected() or is_next()
while self.expected[0].str_value != until_token.str_value:
del self.expected[0]
del self.expected[0]
# return True if a whole sequence of keywords is eaten
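An illustrative sketch (simplified stand-ins, not the real Token class) of the difference between the two checks: is_next() only accepts the head of the expected sequence, optionally skipping one leading whitespace, while is_expected() accepts a keyword appearing anywhere in the remainder.

from collections import namedtuple

Tok = namedtuple("Tok", "type str_value")
WS, KW, VAR = "whitespace", "keyword", "var_def"

def is_next(expected, token):
    if not expected:
        return False
    if token.str_value == expected[0].str_value:
        return True
    # whitespace is sometimes optional: allow matching the token after it
    return expected[0].type == WS and len(expected) > 1 \
        and token.str_value == expected[1].str_value

def is_expected(expected, token):
    if not expected or token.type == WS:
        return False
    return any(e.type != VAR and e.str_value == token.str_value for e in expected)

expected = [Tok(WS, " "), Tok(KW, "then"), Tok(WS, " "), Tok(VAR, "<var1>")]
then, other = Tok(KW, "then"), Tok(KW, "else")
print(is_next(expected, then))      # True: 'then' follows the optional whitespace
print(is_expected(expected, then))  # True: 'then' is somewhere in the remainder
print(is_next(expected, other))     # False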
@@ -143,7 +165,10 @@ class SyaConceptParserHelper:
if len(self.expected) == 0:
return True
return self.expected[0].startswith(VARIABLE_PREFIX)
# also return True at the end of a name sequence
# ... <var0> bar baz qux <var1>
# return True after 'qux', to indicate that all the parameters from <var0> must be processed
return self.expected[0].type == TokenKind.VAR_DEF
def eat_parameter(self, parameter):
if self.is_matched() and parameter == self:
@@ -153,7 +178,7 @@ class SyaConceptParserHelper:
self.error = "No more parameter expected"
return
if not self.expected[0].startswith(VARIABLE_PREFIX):
if self.expected[0].type != TokenKind.VAR_DEF:
self.error = "Parameter was not expected"
return
@@ -202,6 +227,7 @@ class InFixToPostFix:
self.errors = [] # Not quite sure that I can handle more than one error
self.debug = []
self.false_positives = [] # concepts that look like known ones, but are not (for debug purposes)
self.forked = [] # used to fork InFixToPostFix when multiple parsers recognize the unrecognized_tokens
def __repr__(self):
@@ -245,7 +271,6 @@ class InFixToPostFix:
Note that when we are parsing unrecognized tokens,
we consider the parentheses to be part of the unrecognized tokens
:param token:
:param stack:
:return:
"""
return isinstance(token, Token) and token.type == TokenKind.RPAR
@@ -268,10 +293,10 @@ class InFixToPostFix:
:return:
"""
if isinstance(item, SyaConceptParserHelper) and len(item.expected) > 0 and not item.error:
if item.expected[0].startswith(VARIABLE_PREFIX):
if item.expected[0].type == TokenKind.VAR_DEF:
item.error = "Not enough suffix parameters"
else:
item.error = f"token '{item.expected[0]}' not found"
item.error = f"token '{item.expected[0].str_value}' not found"
if isinstance(item, SyaConceptParserHelper) and item.potential_pos != -1:
self.out.insert(item.potential_pos, item)
@@ -328,6 +353,16 @@ class InFixToPostFix:
).pseudo_fix_source()
return source_code
def _transform_to_unrecognized(self, parser_helper):
# an UnrecognizedTokensNode may have been sent to out prematurely; take it back
if len(self.out) > 0 and isinstance(self.out[-1], UnrecognizedTokensNode):
self.unrecognized_tokens = self.out.pop()
if parser_helper.remember_whitespace:
self.unrecognized_tokens.add_token(parser_helper.remember_whitespace, parser_helper.start - 1)
for i, token in enumerate(parser_helper.tokens):
self.unrecognized_tokens.add_token(token, parser_helper.start + i)
def get_errors(self):
res = []
res.extend(self.errors)
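A hedged sketch of the recovery path added here (illustrative names, not the project's API): when a helper turns out to be a false positive, the tokens it already consumed go back into the unrecognized buffer in source order, restoring the remembered leading whitespace first.

class UnrecognizedBuffer:
    def __init__(self):
        self.tokens = []  # (position, token) pairs kept in source order

    def add_token(self, token, pos):
        self.tokens.append((pos, token))
        self.tokens.sort(key=lambda pair: pair[0])

def transform_to_unrecognized(buffer, eaten_tokens, start, remembered_ws=None):
    if remembered_ws is not None:
        # the whitespace swallowed just before the match comes back too
        buffer.add_token(remembered_ws, start - 1)
    for i, token in enumerate(eaten_tokens):
        buffer.add_token(token, start + i)

buf = UnrecognizedBuffer()
transform_to_unrecognized(buf, ["foo", "bar"], start=10, remembered_ws=" ")
print(buf.tokens)  # [(9, ' '), (10, 'foo'), (11, 'bar')]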
@@ -343,28 +378,28 @@ class InFixToPostFix:
self.is_locked = False
def manage_parameters_when_new_concept(self, temp_concept_node):
def manage_parameters_when_new_concept(self, parser_helper):
"""
When a new concept is created, we need to check what to do with the parameters
that were queued
:param temp_concept_node: new concept
:param parser_helper: new concept
:return:
"""
if len(self.parameters_list) < temp_concept_node.expected_parameters_before_first_token:
if len(self.parameters_list) < parser_helper.expected_parameters_before_first_token:
# The new concept expects some prefix parameters, but there are not enough
temp_concept_node.error = "Not enough prefix parameters"
parser_helper.error = "Not enough prefix parameters"
return
if len(self.parameters_list) > temp_concept_node.expected_parameters_before_first_token:
if len(self.parameters_list) > parser_helper.expected_parameters_before_first_token:
# There are more parameters than needed by the new concept
# The others are either
# - parameters for the previous concept (if any)
# - concepts on their own
# - syntax error
# In all cases, the only thing that matters is to pop what is expected by the new concept
for i in range(temp_concept_node.expected_parameters_before_first_token):
for i in range(parser_helper.expected_parameters_before_first_token):
self.parameters_list.pop()
temp_concept_node.parameters_list_at_init.extend(self.parameters_list)
parser_helper.parameters_list_at_init.extend(self.parameters_list)
return
# len(self.parameters_list) == parser_helper.expected_parameters_before_first_token
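A simplified model (illustrative, not the real method) of the three cases handled above: fewer queued parameters than needed is an error; more than needed means the new concept takes what it expects and the leftovers are recorded; an exact match consumes the whole queue.

def take_prefix_parameters(queued, needed):
    if len(queued) < needed:
        return None, None, "Not enough prefix parameters"
    # pop the last 'needed' parameters for the new concept; the leftovers
    # may belong to an enclosing concept, be concepts on their own,
    # or be a syntax error
    taken = queued[len(queued) - needed:]
    leftover = queued[:len(queued) - needed]
    return taken, leftover, None

print(take_prefix_parameters(["a", "b", "c"], 1))  # (['c'], ['a', 'b'], None)
print(take_prefix_parameters(["a"], 2))            # error case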
@@ -385,14 +420,18 @@ class InFixToPostFix:
:return:
"""
# manage parentheses that didn't find any match
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
# The parameter must be part of the current concept being parsed
assert len(self._concepts()) != 0 # sanity check
current_concept = self._concepts()[-1]
while len(current_concept.expected) > 0 and current_concept.expected[0].startswith(VARIABLE_PREFIX):
while len(current_concept.expected) > 0 and current_concept.expected[0].type == TokenKind.VAR_DEF:
# eat everything that was expected
if len(self.parameters_list) == 0:
# current_concept.error = f"Failed to match parameter '{current_concept.expected[0]}'"
current_concept.error = f"Failed to match parameter '{current_concept.expected[0].str_value}'"
return
del self.parameters_list[0]
del current_concept.expected[0]
@@ -506,6 +545,11 @@ class InFixToPostFix:
if stack.associativity == SyaAssociativity.No and current.associativity == SyaAssociativity.No:
self._add_error(NoneAssociativeSequenceErrorNode(current.concept, stack_head.start, concept_node.start))
if not current.precedence:
# precedence is not set (None or zero)
# Do not apply any rule
return False
if current.associativity == SyaAssociativity.Left and current.precedence <= stack.precedence:
return True
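The early return added above means "no precedence, no rule". A sketch of the classic shunting-yard decision it belongs to, using plain (precedence, associativity) tuples instead of SyaConceptParserHelper:

LEFT, RIGHT = "left", "right"

def i_can_pop(stack_top, current):
    # each operator is a (precedence, associativity) pair
    if not current[0]:  # precedence is None or zero: do not apply any rule
        return False
    if current[1] == LEFT:
        return current[0] <= stack_top[0]
    return current[0] < stack_top[0]  # right-associative case

plus, times = (1, LEFT), (2, LEFT)
print(i_can_pop(times, plus))       # True: '*' on the stack binds tighter than '+'
print(i_can_pop(plus, times))       # False: incoming '*' binds tighter, keep it stacked
print(i_can_pop(times, (0, LEFT)))  # False: no precedence set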
@@ -528,9 +572,55 @@ class InFixToPostFix:
:return:
"""
def _pop_stack(c):
while self.stack[-1] != c and not self._is_lpar(c):
self.pop_stack_to_out()
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return False
# Manage concepts ending with long names
if self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
for current_concept in reversed(self._concepts()):
# As I may lose memory again ;-)
# it's a reversed loop to manage cases like
# if a plus b then ...
# The current concept is 'plus', but the token is 'then'
# It means that I have finished parsing the 'plus' and started the second part of the 'if'
if current_concept.is_next(token):
current_concept.end = pos
current_concept.tokens.append(token)
if current_concept.eat_token(token):
_pop_stack(current_concept)
return True
if len(current_concept.expected) > 0 and current_concept.expected[0].type != TokenKind.VAR_DEF:
if current_concept.expected[0].type == TokenKind.WHITESPACE:
# drop it. It's the case where an optional whitespace is missing
del (current_concept.expected[0])
else:
# error
# We are not parsing the concept we thought we were parsing.
# Transform the eaten tokens into unrecognized
# and discard the current SyaConceptParserHelper
# TODO: manage the pending LPAR, RPAR ?
self._transform_to_unrecognized(current_concept)
self.false_positives.append(current_concept)
self.stack.pop()
return False
if current_concept.is_expected(token):
# Fix the whitespace between var and expected if needed
# current_concept.expected[0] is '<var>'
# current_concept.expected[1] is what separates var from expected (normally a whitespace)
if current_concept.expected[1].type == TokenKind.WHITESPACE:
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
current_concept.end = pos
self.manage_unrecognized()
# manage that some clones may have been forked
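A toy version (illustrative only) of the reversed scan described in the comments above: for "if a plus b then ...", when 'then' arrives, the innermost helper on the stack is 'plus', but 'then' belongs to the enclosing 'if <c> then <e>' concept, so the scan starts from the innermost helper and walks outwards.

def find_owner(stack, token):
    # stack entries: (concept name, next expected keyword or None)
    for name, next_keyword in reversed(stack):
        if next_keyword == token:
            return name
    return None

stack = [("if <c> then <e>", "then"), ("<a> plus <b>", None)]
print(find_owner(stack, "then"))  # 'if <c> then <e>': the outer concept resumes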
@@ -550,36 +640,33 @@ class InFixToPostFix:
self.parameters_list[:]))
return True # no need to continue
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1] != current_concept:
current = self.stack[-1]
if current.error:
self._transform_to_unrecognized(current)
self.false_positives.append(current)
self.stack.pop()
if current_concept.expected[1].type == TokenKind.WHITESPACE:
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
self.manage_unrecognized()
# manage that some clones may have been forked
for forked in self.forked:
forked.handle_expected_token(token, pos)
else:
self.pop_stack_to_out()
self.manage_parameters()
if current_concept.eat_token():
while self.stack[-1] != current_concept and not self._is_lpar(current_concept):
self.pop_stack_to_out()
# maybe eat whitespace that was between <var> and expected token
if current_concept.expected[0].type == TokenKind.WHITESPACE:
del current_concept.expected[0]
if self._is_lpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return False
# Manage concepts ending with long names
if self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].is_matched():
self.pop_stack_to_out()
if current_concept.eat_token(token):
_pop_stack(current_concept)
return True
# else:
# if token.type != TokenKind.WHITESPACE:
# # hack, because whitespaces are not correctly parsed in self.expected
# # KSI 2020/04/25
# # I no longer understand why we are in a loop (the reverse one)
# # if we are parsing a concept and the expected token does not match
# # The whole class should be in error
# self._add_error(UnexpectedTokenErrorNode(
# f"Failed to parse '{current_concept.concept.concept}'",
# token, current_concept.expected))
# return False
return False
def eat_token(self, token, pos):
@@ -692,10 +779,11 @@ class InFixToPostFix:
return False
def eat_concept(self, sya_concept_def, pos):
def eat_concept(self, sya_concept_def, token, pos):
"""
a concept is found
:param sya_concept_def:
:param token:
:param pos:
:return:
"""
@@ -704,37 +792,43 @@ class InFixToPostFix:
return
self.debug.append(sya_concept_def)
temp_concept_node = SyaConceptParserHelper(sya_concept_def, pos)
parser_helper = SyaConceptParserHelper(sya_concept_def, pos)
if self.unrecognized_tokens.last_token_type() == TokenKind.WHITESPACE:
parser_helper.remember_whitespace = self.unrecognized_tokens.tokens[-1]
if Token.is_whitespace(parser_helper.last_token_before_first_token):
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
# First, try to recognize the tokens that are waiting
self.manage_unrecognized()
for forked in self.forked:
# manage the fact that some clones may have been forked
forked.eat_concept(sya_concept_def, pos)
forked.eat_concept(sya_concept_def, token, pos)
# then, check if this new concept is linked to the previous ones
# ie, is the previous concept fully matched ?
if temp_concept_node.expected_parameters_before_first_token == 0:
if parser_helper.expected_parameters_before_first_token == 0:
# => does not expect pending parameters (it's a suffixed concept)
while self._stack_isinstance(SyaConceptParserHelper) and self.stack[-1].potential_pos != -1:
# => previous seems to have everything it needs in the parameter list
self.pop_stack_to_out()
if temp_concept_node.is_atom():
self._put_to_out(temp_concept_node.fix_concept())
if parser_helper.is_atom():
self._put_to_out(parser_helper.fix_concept())
else:
# call shunting yard algorithm
while self.i_can_pop(temp_concept_node):
while self.i_can_pop(parser_helper):
self.pop_stack_to_out()
if temp_concept_node.is_matched():
if parser_helper.is_matched():
# case of a prefix concept which has found happiness with self.parameters_list
# directly put it in out
self.manage_parameters_when_new_concept(temp_concept_node)
self._put_to_out(temp_concept_node.fix_concept())
self.manage_parameters_when_new_concept(parser_helper)
self._put_to_out(parser_helper.fix_concept())
else:
self.stack.append(temp_concept_node)
self.manage_parameters_when_new_concept(temp_concept_node)
self.stack.append(parser_helper)
self.manage_parameters_when_new_concept(parser_helper)
def eat_unrecognized(self, token, pos):
"""
@@ -762,18 +856,34 @@ class InFixToPostFix:
if len(self.stack) == 0 and len(self.out) == 0:
return # no need to pop the buffer, as no concept is found
while len(self.stack) > 0:
parser_helper = self.stack[-1]
# validate parenthesis
if self._is_lpar(parser_helper) or self._is_rpar(parser_helper):
self._add_error(ParenthesisMismatchErrorNode(parser_helper))
return None
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
failed_to_match = sum(map(lambda e: e.type != TokenKind.VAR_DEF, parser_helper.expected))
if failed_to_match > 0:
# didn't manage to read all tokens.
# Transform them into unrecognized
self._transform_to_unrecognized(parser_helper)
self.false_positives.append(parser_helper)
self.stack.pop() # discard the parser helper
else:
self.pop_stack_to_out() # process it
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
while len(self.stack) > 0:
if self._is_lpar(self.stack[-1]) or self._is_rpar(self.stack[-1]):
self._add_error(ParenthesisMismatchErrorNode(self.stack[-1]))
return None
self.pop_stack_to_out()
def clone(self):
clone = InFixToPostFix(self.context)
clone.is_locked = self.is_locked
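A sketch of the new finalize() filter (simplified data, illustrative names): a helper that still expects real keywords at end of input was a false positive and is discarded; a helper missing only variables is processed normally.

VAR_DEF = "var_def"

def finalize(stack):
    out, false_positives = [], []
    while stack:
        helper = stack.pop()
        unmatched_keywords = sum(
            1 for kind in helper["expected"] if kind != VAR_DEF)
        if unmatched_keywords:
            false_positives.append(helper)  # its tokens go back to unrecognized
        else:
            out.append(helper)
    return out, false_positives

stack = [{"expected": ["keyword"]}, {"expected": [VAR_DEF]}]
out, fps = finalize(stack)
print(len(out), len(fps))  # 1 1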
@@ -975,7 +1085,7 @@ class SyaNodeParser(BaseNodeParser):
try:
if token.type in (TokenKind.LPAR, TokenKind.RPAR):
# little optim, no need to get the concept when parenthesis
# little optim, no need to lock, unlock or get the concept when the token is a parenthesis
for infix_to_postfix in res:
infix_to_postfix.eat_token(token, self.pos)
continue
@@ -992,7 +1102,7 @@ class SyaNodeParser(BaseNodeParser):
if len(concepts) == 1:
for infix_to_postfix in res:
infix_to_postfix.eat_concept(concepts[0], self.pos)
infix_to_postfix.eat_concept(concepts[0], token, self.pos)
continue
# make the cartesian product
@@ -1001,7 +1111,7 @@ class SyaNodeParser(BaseNodeParser):
for concept in concepts:
clone = infix_to_postfix.clone()
temp_res.append(clone)
clone.eat_concept(concept, self.pos)
clone.eat_concept(concept, token, self.pos)
res = temp_res
finally:
@@ -1100,6 +1210,11 @@ class SyaNodeParser(BaseNodeParser):
to_insert = item
sequence.insert(0, to_insert)
if has_unrecognized:
# Manage some sick cases where missing parentheses mess the order of the sequence
# example "foo bar(one plus two"
sequence.sort(key=attrgetter("start"))
ret.append(
self.sheerka.ret(
self.name,
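A minimal reproduction of the reordering fix above: nodes gathered out of order because of a missing parenthesis are put back in source order by their 'start' position.

from dataclasses import dataclass
from operator import attrgetter

@dataclass
class Node:
    name: str
    start: int

sequence = [Node("plus", 12), Node("foo bar", 0), Node("one", 8)]
sequence.sort(key=attrgetter("start"))
print([node.name for node in sequence])  # ['foo bar', 'one', 'plus']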