Implemented FunctionParser

This commit is contained in:
2020-09-17 14:11:09 +02:00
parent 8a866880bc
commit 177a6b1d5f
40 changed files with 1752 additions and 561 deletions
+242 -87
View File
@@ -5,10 +5,12 @@ from typing import List
from core import builtin_helpers
from core.builtin_concepts import BuiltinConcepts
from core.builtin_helpers import parse_function
from core.concept import Concept, DEFINITION_TYPE_BNF
from core.sheerka.services.SheerkaComparisonManager import SheerkaComparisonManager
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import Token, TokenKind, Tokenizer
from core.utils import get_n_clones
from parsers.BaseNodeParser import UnrecognizedTokensNode, ConceptNode, SourceCodeNode, SyaAssociativity, \
SourceCodeWithConceptNode, BaseNodeParser
from parsers.BaseParser import ErrorNode
@@ -17,39 +19,73 @@ PARSERS = ["BnfNode", "AtomNode", "Python"]
function_parser_res = namedtuple("FunctionParserRes", 'to_out function')
DEBUG_PUSH = "PUSH"
DEBUG_PUSH_UNREC = "PUSH_UNREC"
DEBUG_POP = "POP"
DEBUG_EAT = "EAT"
DEBUG_RECOG = "RECOG"
@dataclass()
class DebugInfo:
    """
    One trace record describing a single step taken by the sya parser.

    Possible actions:
        PUSH: push the token or the concept to the stack
        PUSH_UNREC: push the token to the UnrecognizedTokensNode
        POP: pop item to out
        EAT: eat the current token (it means that it was part of the concept currently being parsed)
        RECOG: when tokens from UnrecognizedTokensNode are parsed and recognized
    """
    pos: int = -1  # position of the parser input
    token: Token = None  # current token
    concept: Concept = None  # current concept, if any
    action: str = None  # action taken

    def __repr__(self):
        # Real tokens expose a dedicated short representation; anything else
        # (e.g. the "<EOF>" marker string) is rendered as-is.
        shown = self.token.repr_value if isinstance(self.token, Token) else self.token
        text = f"{self.pos:3}:{shown}" if self.pos != -1 else " _:"
        if self.concept:
            text += f"({self.concept})"
        return f"{text} => {self.action}"
class ParenthesisMismatchErrorNode(ErrorNode):
    """
    Error node reporting an unbalanced parenthesis.

    ``error_int`` may be one of:
      * a ``(token_or_value, pos)`` tuple,
      * a ``Token`` (position unknown, ``pos`` stays -1),
      * an ``UnrecognizedTokensNode`` — the last '(' it holds is reported.
    """

    def __init__(self, error_int):
        # Default every attribute up front so the node is well-formed even
        # when the UnrecognizedTokensNode branch finds no '(' at all.
        self.token = None
        self.token_value = None
        self.pos = -1
        if isinstance(error_int, tuple):
            if isinstance(error_int[0], Token):
                self.token_value = error_int[0].value
                self.token = error_int[0]
            else:
                # A bare value (e.g. the raw character) instead of a Token.
                self.token_value = error_int[0]
                self.token = None
            self.pos = error_int[1]
        elif isinstance(error_int, Token):
            self.token = error_int
            self.token_value = error_int.value
            self.pos = -1
        else:  # isinstance(error_int, UnrecognizedTokensNode)
            # Report the LAST opening parenthesis: scan from the end and stop
            # at the first match.  The original loop had no break, so the
            # reversed() iteration ended up reporting the FIRST '(' instead.
            for i, t in reversed(list(enumerate(error_int.tokens))):
                if t.type == TokenKind.LPAR:
                    self.token = t
                    self.token_value = t.value
                    self.pos = i + error_int.start
                    break

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, tuple):
            # Allow comparison against a plain (value, pos) pair.
            return other[0] == self.token.value and other[1] == self.pos
        if not isinstance(other, ParenthesisMismatchErrorNode):
            return False
        # Compare on token_value (not the Token object) so nodes built from a
        # Token and from a raw value can still compare equal.
        return self.token_value == other.token_value and self.pos == other.pos

    def __hash__(self):
        # Hash only the position; consistent with __eq__ (equal nodes share pos).
        return hash(self.pos)

    def __repr__(self):
        # The closing parenthesis was missing from the format string.
        return f"ParenthesisMismatchErrorNode('{self.token_value}', {self.pos})"
@dataclass()
@@ -211,8 +247,9 @@ class SyaConceptParserHelper:
class InFixToPostFix:
def __init__(self, context):
def __init__(self, context, debug_enabled=False):
self.context = context
self.debug_enabled = debug_enabled
self.is_locked = False # when locked, cannot process input
@@ -227,6 +264,8 @@ class InFixToPostFix:
self.false_positives = [] # concepts that looks like known one, but not (for debug purpose)
self.forked = [] # use to fork InFixToPostFix when multiple parsers recognize the unrecognized_tokens
self.parsing_function = False # indicate that we are currently parsing a function
def __repr__(self):
    # The debug trace is the most informative one-line summary of this
    # parser's state; NOTE(review): this shows the full trace list, which can
    # be long — confirm that is intended for interactive debugging.
    return f"InFixToPostFix({self.debug})"
@@ -243,6 +282,8 @@ class InFixToPostFix:
return len(self.sequence) + len(self.errors)
def _add_error(self, error):
    """
    Record a parsing error on this parser instance.

    :param error: the error node (e.g. ParenthesisMismatchErrorNode) to store
    """
    if self.debug_enabled:
        # Mirror the error into the debug trace so the log shows where it arose.
        self.debug.append(DebugInfo(action=f"=> ERROR {error}"))
    self.errors.append(error)
def _is_lpar(self, token):
@@ -294,7 +335,11 @@ class InFixToPostFix:
item.error = "Not enough suffix parameters"
else:
item.error = f"token '{item.expected[0].strip_quote}' not found"
if self.debug_enabled:
self.debug.append(DebugInfo(action=f"ERROR {item.error}"))
if self.debug_enabled:
self.debug.append(DebugInfo(action=f"{DEBUG_POP} {item}"))
if isinstance(item, SyaConceptParserHelper) and item.potential_pos != -1:
self.out.insert(item.potential_pos, item)
else:
@@ -345,6 +390,26 @@ class InFixToPostFix:
for i, token in enumerate(parser_helper.tokens):
self.unrecognized_tokens.add_token(token, parser_helper.start + i)
def _remove_debug_info_if_needed(self):
"""
Before trying to manage the unrecognized, a line is added to explain the token which has triggered
the recognition try
This line is useless if self.unrecognized_tokens was irrelevant
:return:
"""
if len(self.debug) > 0 and self.debug[-1].action == "??":
self.debug.pop()
def _debug_nodes(self, nodes_sequences):
res = "["
first = True
for sequence in nodes_sequences:
if not first:
res += ", "
res += "[" + ", ".join([n.to_short_str() for n in sequence]) + "]"
first = False
return res + "]"
def get_errors(self):
def has_error(item):
if isinstance(item, SyaConceptParserHelper) and item.error:
@@ -439,41 +504,40 @@ class InFixToPostFix:
self.unrecognized_tokens.fix_source()
# try to recognize concepts
nodes_sequences = builtin_helpers.get_lexer_nodes_from_unrecognized(
self.context,
self.unrecognized_tokens,
PARSERS)
if nodes_sequences:
# There are more than one solution found
# In the case, we create a new InfixToPostfix for each new possibility
if len(nodes_sequences) > 1:
for node_sequence in nodes_sequences[1:]:
clone = self.clone()
for node in node_sequence:
clone._put_to_out(node)
clone.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, [])
self.forked.append(clone)
# Do not forget the first result that will go with the current InfixToPostfix
for node in nodes_sequences[0]:
self._put_to_out(node)
else:
if self.unrecognized_tokens.parenthesis_count > 0:
# parenthesis mismatch detected, do not try to resolve the unrecognized
self._add_error(ParenthesisMismatchErrorNode(self.unrecognized_tokens))
self._put_to_out(self.unrecognized_tokens)
else:
# try to recognize concepts
nodes_sequences = builtin_helpers.get_lexer_nodes_from_unrecognized(
self.context,
self.unrecognized_tokens,
PARSERS)
# # try to recognize concepts
# nodes = self._get_lexer_nodes_from_unrecognized()
# if nodes:
# for node in nodes:
# self._put_to_out(node)
# else:
# self._put_to_out(self.unrecognized_tokens)
if nodes_sequences:
# There are more than one solution found
# In the case, we create a new InfixToPostfix for each new possibility
if self.debug_enabled:
self.debug.append(DebugInfo(action=f"{DEBUG_RECOG} {self._debug_nodes(nodes_sequences)}"))
if len(nodes_sequences) > 1:
for node_sequence in nodes_sequences[1:]:
clone = self.clone()
for node in node_sequence:
clone._put_to_out(node)
clone.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, [])
self.forked.append(clone)
# Do not forget the first result that will go with the current InfixToPostfix
for node in nodes_sequences[0]:
self._put_to_out(node)
else:
self._put_to_out(self.unrecognized_tokens)
# create another instance
self.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, [])
def get_functions_from_unrecognized(self, token, pos):
def get_functions_names_from_unrecognized(self, token, pos):
"""
The unrecognized ends with an lpar '('
It means that its a function like foo(something)
@@ -489,19 +553,32 @@ class InFixToPostFix:
self.context,
self.unrecognized_tokens,
PARSERS)
if nodes_sequences is None:
return None
if not nodes_sequences:
nodes_sequences = [[self.unrecognized_tokens.clone()]]
res = []
for sequence in nodes_sequences:
if isinstance(sequence[-1], UnrecognizedTokensNode):
function = sequence[-1]
else:
function = UnrecognizedTokensNode(sequence[-1].start, sequence[-1].end, sequence[-1].tokens)
function.add_token(token, pos).fix_source()
last_node = sequence[-1]
res.append(function_parser_res(sequence[:-1], function))
if len(last_node.tokens) > 1:
if isinstance(last_node, UnrecognizedTokensNode):
to_out = [UnrecognizedTokensNode(last_node.start, pos - 2, last_node.tokens[:-1]).fix_source()]
function_name = UnrecognizedTokensNode(pos - 1, pos - 1, [last_node.tokens[-1]])
function_name.add_token(token, pos)
else:
to_out = [last_node.fix_source()]
function_name = None
else: # len(last_node.tokens) == 1
if not isinstance(last_node, UnrecognizedTokensNode):
function_name = UnrecognizedTokensNode(last_node.start, last_node.end, last_node.tokens)
else:
function_name = last_node
function_name.add_token(token, pos)
to_out = []
res.append(function_parser_res(sequence[:-1] + to_out, function_name))
return res
def pop_stack_to_out(self):
@@ -614,6 +691,8 @@ class InFixToPostFix:
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
current_concept.end = pos
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, None, "??"))
self.manage_unrecognized()
# manage that some clones may have been forked
for forked in self.forked:
@@ -673,17 +752,53 @@ class InFixToPostFix:
if self.is_locked:
return
if self.parsing_function:
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, None, DEBUG_PUSH_UNREC))
self.unrecognized_tokens.add_token(token, pos)
if self.unrecognized_tokens.parenthesis_count == 0:
self.unrecognized_tokens.fix_source()
res = parse_function(self.context,
self.unrecognized_tokens.source,
self.unrecognized_tokens.tokens[:],
self.unrecognized_tokens.start)
instances = get_n_clones(self, len(res))
self.forked.extend(instances[1:])
for instance, res_i in zip(instances, res):
if res_i.status or instance.context.sheerka.isinstance(res_i.body, BuiltinConcepts.PARSER_RESULT):
# 1. we manage to recognize a function
# 2. we almost manage, ex func(one two). It's not a function but almost
instance._put_to_out(res_i.body.body)
instance.unrecognized_tokens.reset()
else:
# it is not a function, try to recognize the token
# This situation is unlikely to occur
instance.manage_unrecognized()
instance.parsing_function = False
return True
if self.handle_expected_token(token, pos):
# a token is found, let's check if it's part of a concepts being parsed
# example Concept(name="foo", definition="foo a bar b").def_var("a").def_var("b")
# if the token 'bar' is found, it has to be considered as part of the concept foo
self.debug.append(token)
if self.debug_enabled:
self._remove_debug_info_if_needed()
self.debug.append(DebugInfo(pos, token, None, DEBUG_EAT))
return True
elif self._is_lpar(token):
self.debug.append(token)
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, None, DEBUG_PUSH_UNREC))
if self.unrecognized_tokens.is_empty() or self.unrecognized_tokens.is_whitespace():
# first, remove what was in the buffer
self.manage_unrecognized()
for forked in self.forked:
@@ -691,40 +806,65 @@ class InFixToPostFix:
forked.eat_token(token, pos)
self.stack.append((token, pos))
else:
# the parenthesis is part of the unrecognized
# So it's a function
# So it's maybe a function call
list_of_results = self.get_functions_from_unrecognized(token, pos)
if list_of_results:
instances = [self]
for i in range(len(list_of_results) - 1):
clone = self.clone()
self.forked.append(clone)
instances.append(clone)
list_of_results = self.get_functions_names_from_unrecognized(token, pos)
instances = [self]
for i in range(len(list_of_results) - 1):
clone = self.clone()
self.forked.append(clone)
instances.append(clone)
# Manage the result for self and its clones
for instance, parsing_res in zip(instances, list_of_results):
for to_out in parsing_res.to_out:
instance._put_to_out(to_out)
# Manage the result for self and its clones
for instance, parsing_res in zip(instances, list_of_results):
for to_out in parsing_res.to_out:
instance._put_to_out(to_out)
if parsing_res.function:
instance.unrecognized_tokens = parsing_res.function
instance.parsing_function = True
else:
# special case of "twenty two(". It's not considered as a function
# The manage_unrecognized() was somewhat done by get_functions_names_from_unrecognized()
# So we just put the unrecognized to out
instance.unrecognized_tokens.reset()
# make sure to pop the current concept
if self._stack_isinstance(SyaConceptParserHelper):
self.pop_stack_to_out()
instance._put_to_out(")") # mark where the function should end
instance.stack.append(parsing_res.function)
instance.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, []) # reset unrecognized
else:
self._put_to_out(")") # mark where the function should end
self.eat_unrecognized(token, pos) # add the '(' to the rest of the unknown
self.stack.append(self.unrecognized_tokens.fix_source())
self.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, [])
instance.stack.append((token, pos))
# # instance._put_to_out(")") # mark where the function should end
# # instance.stack.append(parsing_res.function)
# # instance.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, []) # reset unrecognized
# else:
# # handle when there are multiple pending tokens
# if len(self.unrecognized_tokens.tokens) > 1:
# unrecognized = UnrecognizedTokensNode(self.unrecognized_tokens.start,
# pos - 2,
# self.unrecognized_tokens.tokens[:-1])
# unrecognized.fix_source()
# self._put_to_out(unrecognized)
# last_token = self.unrecognized_tokens.tokens[-1]
# self.unrecognized_tokens.reset()
# self.unrecognized_tokens.add_token(last_token, pos - 1)
#
# self.eat_unrecognized(token, pos) # add the '(' to the rest of the unknown
# self.parsing_function = True
# # self.stack.append(self.unrecognized_tokens.fix_source())
# # self.unrecognized_tokens = UnrecognizedTokensNode(-1, -1, [])
return True
elif self._is_rpar(token):
self.debug.append(token)
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, None, DEBUG_EAT))
# first, remove what was in the buffer
self.manage_unrecognized()
@@ -775,32 +915,36 @@ class InFixToPostFix:
return False
def eat_concept(self, sya_concept_def, token, pos):
def eat_concept(self, sya_concept_def, token, pos, first_pass=True):
"""
a concept is found
:param sya_concept_def:
:param token:
:param pos:
:param first_pass: When not called from a fork after manage_unrecognized()
:return:
"""
if self.is_locked:
return
self.debug.append(sya_concept_def)
parser_helper = SyaConceptParserHelper(sya_concept_def, pos)
if self.unrecognized_tokens.last_token_type() == TokenKind.WHITESPACE:
parser_helper.remember_whitespace = self.unrecognized_tokens.tokens[-1]
if first_pass:
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, sya_concept_def, "??"))
if Token.is_whitespace(parser_helper.last_token_before_first_token):
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
if self.unrecognized_tokens.last_token_type() == TokenKind.WHITESPACE:
parser_helper.remember_whitespace = self.unrecognized_tokens.tokens[-1]
# First, try to recognize the tokens that are waiting
self.manage_unrecognized()
for forked in self.forked:
# manage the fact that some clone may have been forked
forked.eat_concept(sya_concept_def, token, pos)
if Token.is_whitespace(parser_helper.last_token_before_first_token):
self.unrecognized_tokens.pop(TokenKind.WHITESPACE)
# First, try to recognize the tokens that are waiting
self.manage_unrecognized()
for forked in self.forked:
# manage the fact that some clone may have been forked
forked.eat_concept(sya_concept_def, token, pos, first_pass=False)
# then, check if this new concept is linked to the previous ones
# ie, is the previous concept fully matched ?
@@ -823,6 +967,9 @@ class InFixToPostFix:
self.manage_parameters_when_new_concept(parser_helper)
self._put_to_out(parser_helper.fix_concept())
else:
if self.debug_enabled:
self._remove_debug_info_if_needed()
self.debug.append(DebugInfo(pos, token, sya_concept_def, DEBUG_PUSH))
self.stack.append(parser_helper)
self.manage_parameters_when_new_concept(parser_helper)
@@ -836,11 +983,12 @@ class InFixToPostFix:
if self.is_locked:
return
self.debug.append(token)
if self.debug_enabled:
self.debug.append(DebugInfo(pos, token, None, DEBUG_PUSH_UNREC))
self.unrecognized_tokens.add_token(token, pos)
def finalize(self):
def finalize(self, pos):
"""
Put the remaining items from the stack to out
:return:
@@ -850,8 +998,14 @@ class InFixToPostFix:
return
if len(self.stack) == 0 and len(self.out) == 0:
# check for parenthesis mismatch
if self.unrecognized_tokens.parenthesis_count > 0:
self._add_error(ParenthesisMismatchErrorNode(self.unrecognized_tokens))
return # no need to pop the buffer, as no concept is found
if self.debug_enabled:
self.debug.append(DebugInfo(pos, "<EOF>", None, "??"))
while len(self.stack) > 0:
parser_helper = self.stack[-1]
@@ -863,7 +1017,7 @@ class InFixToPostFix:
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
forked.finalize(pos)
failed_to_match = sum(map(lambda e: e.type != TokenKind.VAR_DEF, parser_helper.expected))
if failed_to_match > 0:
@@ -878,10 +1032,10 @@ class InFixToPostFix:
self.manage_unrecognized()
for forked in self.forked:
# manage that some clones may have been forked
forked.finalize()
forked.finalize(pos)
def clone(self):
clone = InFixToPostFix(self.context)
clone = InFixToPostFix(self.context, self.debug_enabled)
clone.is_locked = self.is_locked
clone.out = self.out[:]
clone.stack = [i.clone() if hasattr(i, "clone") else i for i in self.stack]
@@ -983,7 +1137,7 @@ class SyaNodeParser(BaseNodeParser):
res.extend(forked)
forked.clear()
res = [InFixToPostFix(context)]
res = [InFixToPostFix(context, context.in_context(BuiltinConcepts.DEBUG))]
while self.parser_input.next_token(False):
for infix_to_postfix in res:
infix_to_postfix.reset()
@@ -1027,7 +1181,7 @@ class SyaNodeParser(BaseNodeParser):
# make sure that remaining items in stack are moved to out
for infix_to_postfix in res:
infix_to_postfix.reset()
infix_to_postfix.finalize()
infix_to_postfix.finalize(self.parser_input.pos)
_add_forked_to_res()
return res
@@ -1058,14 +1212,14 @@ class SyaNodeParser(BaseNodeParser):
start = item.start
end = item.end
has_unrecognized = False
concept = sheerka.new_from_template(item.concept, item.concept.id)
concept = sheerka.new_from_template(item.concept, item.concept.key)
for param_index in reversed(range(len(concept.metadata.variables))):
inner_item = self.postfix_to_item(sheerka, postfixed)
if inner_item.start < start:
start = inner_item.start
if inner_item.end > end:
end = inner_item.end
has_unrecognized |= isinstance(inner_item, UnrecognizedTokensNode)
has_unrecognized |= isinstance(inner_item, (UnrecognizedTokensNode, SourceCodeWithConceptNode))
param_name = concept.metadata.variables[param_index][0]
param_value = inner_item.concept if hasattr(inner_item, "concept") else \
@@ -1128,6 +1282,7 @@ class SyaNodeParser(BaseNodeParser):
if has_unrecognized:
# Manage some sick cases where missing parenthesis mess the order or the sequence
# example "foo bar(one plus two"
# too lazy to fix the why...
sequence.sort(key=attrgetter("start"))
ret.append(