Fixed BnfNodeParser to allow expressions like 'number hundred' when number is a group

This commit is contained in:
2020-06-27 18:56:04 +02:00
parent d4468da8a3
commit 2c5840752a
14 changed files with 593 additions and 228 deletions
+185 -65
View File
@@ -38,7 +38,7 @@ class NonTerminalNode(LexerNode):
self.children = children
def __repr__(self):
name = self.parsing_expression.rule_name or self.parsing_expression.__class__.__name__
name = "Node:" + (self.parsing_expression.rule_name or self.parsing_expression.__class__.__name__)
if len(self.children) > 0:
sub_names = "(" + ",".join([repr(child) for child in self.children]) + ")"
else:
@@ -69,7 +69,7 @@ class TerminalNode(LexerNode):
self.value = value
def __repr__(self):
name = self.parsing_expression.rule_name or ""
name = "Node:" + (self.parsing_expression.rule_name or "")
return name + f"'{self.value}'"
def __eq__(self, other):
@@ -186,7 +186,7 @@ class Sequence(ParsingExpression):
class OrderedChoice(ParsingExpression):
"""
Will match one among multiple
Will match the first one among multiple
It will stop at the first match (so the order of definition is important)
"""
@@ -211,6 +211,42 @@ class OrderedChoice(ParsingExpression):
return self.add_rule_name_if_needed(f"({to_str})")
class LongestChoice(ParsingExpression):
    """
    Will match the longest one among multiple.
    All elements will be tested, so the order is not important.
    The behaviour when multiple candidates are found is not defined yet.
    """

    def _parse(self, parser_helper):
        # Try every alternative from the same starting position and keep the
        # candidate whose match extends the furthest (node.end).
        init_pos = parser_helper.pos
        longest_node = None
        end_pos = -1
        for e in self.nodes:
            node = e.parse(parser_helper)
            if node:
                # Strictly greater: on a tie the earlier alternative wins.
                if longest_node is None or node.end > longest_node.end:
                    longest_node = node
                    # Remember where the parser stopped for the best match so
                    # we can restore that position after trying the others.
                    end_pos = parser_helper.pos
            parser_helper.seek(init_pos)  # backtrack before trying the next alternative
        if longest_node is None:
            return None
        # Re-position the parser as if only the winning alternative had run.
        parser_helper.seek(end_pos)
        return NonTerminalNode(self,
                               init_pos,
                               longest_node.end,
                               parser_helper.parser.parser_input.tokens[init_pos: longest_node.end + 1],
                               [longest_node])

    def __repr__(self):
        # "#" is used as the separator to distinguish this from OrderedChoice.
        to_str = "# ".join(repr(n) for n in self.elements)
        return self.add_rule_name_if_needed(f"({to_str})")
class Optional(ParsingExpression):
"""
Will match or not the elements
@@ -386,7 +422,12 @@ class StrMatch(Match):
self.skip_white_space = skip_whitespace
def __repr__(self):
return self.add_rule_name_if_needed(f"'{self.to_match}'")
text = self.to_match
if not self.ignore_case:
text += "#!ic"
if not self.skip_white_space:
text += "#!sw"
return self.add_rule_name_if_needed(f"'{text}'")
def __eq__(self, other):
if not super().__eq__(other):
@@ -395,7 +436,9 @@ class StrMatch(Match):
if not isinstance(other, StrMatch):
return False
return self.to_match == other.to_match and self.ignore_case == other.ignore_case
return self.to_match == other.to_match and \
self.ignore_case == other.ignore_case and \
self.skip_white_space == other.skip_white_space
def _parse(self, parser_helper):
token = parser_helper.get_token()
@@ -766,7 +809,7 @@ class BnfConceptParserHelper:
_add_prop(_concept, _underlying.parsing_expression.rule_name, value)
_concept.metadata.need_validation = True
if isinstance(_underlying, NonTerminalNode):
elif isinstance(_underlying, NonTerminalNode):
for child in _underlying.children:
_process_rule_name(_concept, child)
@@ -789,6 +832,15 @@ class UnderConstruction:
concept_id: str
@dataclass()
class ToUpdate:
    """
    A parsing expression that still contains UnderConstruction placeholder
    nodes and must be patched once the referenced concept is resolved.
    """
    # id of the concept whose grammar owns this parsing expression
    parent_id: int
    parsing_expression: ParsingExpression

    def __hash__(self):
        # Hash only by parent_id; parsing_expression is ignored for hashing.
        # NOTE(review): the dataclass-generated __eq__ compares both fields
        # while __hash__ uses only parent_id — confirm this is intended.
        return hash(self.parent_id)
class BnfNodeParser(BaseNodeParser):
def __init__(self, **kwargs):
super().__init__("BnfNode", 50, **kwargs)
@@ -824,6 +876,34 @@ class BnfNodeParser(BaseNodeParser):
return valid_parser_helpers
    @staticmethod
    def get_expression_from_concept_name(name):
        """
        Create the parsing expression from the name.
        This function differs from BNFParser.parse() as it does not try to resolve identifiers into concepts
        >>> assert get_expression_from_concept_name('one hundred') == Sequence(StrMatch("one"), StrMatch("hundred"))
        while BNFParser.parse("one hundred") will look for concept 'one' and concept 'hundred'
        :param name: the concept name to tokenize into string matchers
        :return: a single StrMatch, a Sequence of StrMatch, or [] for an
                 empty/None name
        """
        if name is None or name.strip() == "":
            # NOTE(review): returns an empty list, not a ParsingExpression —
            # confirm that callers handle this sentinel value.
            return []
        res = []
        tokens = Tokenizer(name, yield_eof=False)
        for token in tokens:
            if token.type == TokenKind.WHITESPACE:
                continue
            elif token.type == TokenKind.STRING:
                # Quoted string: split its content into sub-words. All but the
                # last are built with skip_whitespace=False — presumably so the
                # quoted text is matched verbatim, without whitespace skipping
                # between its words (TODO confirm StrMatch semantics).
                sub_tokens = list(Tokenizer(token.strip_quote, yield_eof=False))
                for sub_token in sub_tokens[:-1]:
                    res.append(StrMatch(sub_token.str_value, skip_whitespace=False))
                res.append(StrMatch(sub_tokens[-1].str_value))
            else:
                res.append(StrMatch(token.str_value))
        # A single matcher is returned as-is; multiple are wrapped in Sequence.
        return res[0] if len(res) == 1 else Sequence(*res)
def get_concepts_sequences(self):
"""
Main method that parses the tokens and extract the concepts
@@ -900,10 +980,10 @@ class BnfNodeParser(BaseNodeParser):
def check_for_infinite_recursion(self, parsing_expression, already_found, only_first=False):
if isinstance(parsing_expression, ConceptExpression):
if parsing_expression.concept in already_found:
if parsing_expression.concept.id in already_found:
return True
already_found.add(parsing_expression.concept)
return self.check_for_infinite_recursion(parsing_expression.nodes[0], already_found, False)
already_found.add(parsing_expression.concept.id)
return self.check_for_infinite_recursion(parsing_expression.nodes[0], already_found, only_first)
if isinstance(parsing_expression, Sequence):
# for sequence, we need to check all nodes
@@ -930,43 +1010,93 @@ class BnfNodeParser(BaseNodeParser):
return False
return False
if isinstance(parsing_expression, LongestChoice):
for node in parsing_expression.nodes:
already_found_for_current_node = already_found.copy()
if self.check_for_infinite_recursion(node, already_found_for_current_node, True):
already_found.update(already_found_for_current_node)
return True
return False
if isinstance(parsing_expression, UnderConstruction):
if parsing_expression.concept_id in already_found:
return True
already_found.add(parsing_expression.concept_id)
return False
def get_parsing_expression(self, context, concept):
"""
Compute the parsing expression for a given concept
:param context:
:param concept:
:return:
"""
if concept.id in self.concepts_grammars:
return self.concepts_grammars.get(concept.id)
grammar = self.concepts_grammars.copy()
to_resolve = {} # the key is the instance id of the parsing expression
isa_concepts = set()
self.resolve_concept_parsing_expression(context, concept, grammar, to_resolve, isa_concepts)
# internal cache of already computed parsing expression to use during the recursion
grammar = {}
for _id, pe in to_resolve.items():
for i, node in enumerate(pe.nodes):
if isinstance(node, UnderConstruction):
pe.nodes[i] = grammar.get(node.concept_id)
# concept that are not totally resolved, because they reference parsing expression under construction
to_update = set() # the key is the instance id of the parsing expression
# during the parsing of concept, we will resolve other concepts
# keep the track of the concepts that can safely be added to self.concept_grammars
to_keep = {concept.id}
desc = f"Get parsing expression for concept {concept}"
with context.push(BuiltinConcepts.INIT_BNF, concept,
who=self.name,
obj=concept,
root_concept=concept,
desc=desc) as sub_context:
# get the parsing expression
ret = self.resolve_concept_parsing_expression(sub_context, concept, grammar, to_update, to_keep)
# check and update parsing expression that are still under construction
# Note that we only update the concept that will update concepts_grammars
# because pe.node may be large
for item in to_update:
if item.parent_id in to_keep:
pe = item.parsing_expression
for i, node in enumerate(pe.nodes):
if isinstance(node, UnderConstruction):
pe.nodes[i] = grammar.get(node.concept_id)
# check for infinite recursion.
# We are adding a new concept. Does it create an infinite recursion?
concepts_in_recursion = set()
if self.check_for_infinite_recursion(pe, concepts_in_recursion):
cycle = context.sheerka.new(BuiltinConcepts.CHICKEN_AND_EGG, body={c.id for c in concepts_in_recursion})
for concept in concepts_in_recursion:
grammar[concept.id] = cycle
if self.check_for_infinite_recursion(ret, concepts_in_recursion):
cycle = context.sheerka.new(BuiltinConcepts.CHICKEN_AND_EGG, body=concepts_in_recursion)
for concept_id in concepts_in_recursion:
grammar[concept_id] = cycle
# Make sure you do not put isa concepts in cache
# why :
# twenties = 'twenty' number where number < 10
# hundreds = number 'hundred' where number < 99
# the concept of number depends on its utilisation
for concept_id in [c for c in grammar if c not in isa_concepts]:
self.concepts_grammars.put(concept_id, grammar[concept_id])
# update, in case of infinite circular recursion
ret = grammar[concept.id]
return self.concepts_grammars.get(concept.id)
# finally, update concept grammar
for k, v in grammar.items():
if k in to_keep:
self.concepts_grammars.put(k, v)
def resolve_concept_parsing_expression(self, context, concept, grammar, to_resolve, isa_concepts):
if concept.id in grammar:
# not quite sure that it is a good idea.
# Why do we want to corrupt previous valid entries?
if context.sheerka.isinstance(v, BuiltinConcepts.CHICKEN_AND_EGG):
self.concepts_grammars.put(k, v)
sub_context.add_values(return_values=ret)
return ret
def resolve_concept_parsing_expression(self, context, concept, grammar, to_update, to_keep):
if concept.id in self.concepts_grammars: # validated entry
return self.concepts_grammars.get(concept.id)
if concept.id in grammar: # under construction entry
return grammar.get(concept.id)
desc = f"Get parsing expression for '{concept}'"
desc = f"Resolve concept parsing expression for '{concept}'"
with context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as sub_context:
if not concept.bnf: # to save a function call. Not sure it worth it.
BaseNodeParser.ensure_bnf(sub_context, concept, self.name)
@@ -979,52 +1109,41 @@ class BnfNodeParser(BaseNodeParser):
desc = f"Bnf concept detected. Resolving parsing expression '{expression}'"
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
ssc.add_inputs(expression=expression)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_resolve, isa_concepts)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_update, to_keep)
ssc.add_values(return_values=resolved)
elif sheerka.isaset(context, concept):
desc = f"Concept is a group. Resolving parsing expression using 'isa'"
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
ssc.add_inputs(concept=concept)
isa_concepts.add(concept.id)
concepts_in_group = self.sheerka.get_set_elements(ssc, concept)
# concepts_in_group comes from a set, so the order of its elements is not guaranteed
# to avoid random failure (ie random CHICKEN_AND_EGG), we need to rearrange
# We also remove the root concept (the one from get_parsing_expression())
root_concept_as_set = set(context.search(
predicate=lambda ec: ec.action == BuiltinConcepts.INIT_BNF,
get_obj=lambda ec: ec.obj,
stop=lambda ec: ec.action != BuiltinConcepts.INIT_BNF)) # there only one item in the set
root_concept = list(root_concept_as_set)[0]
reordered = []
valid_concepts = []
for c in concepts_in_group:
if c.id == root_concept.id:
if c.id == context.root_concept.id:
continue
# I do not guarantee the same order every time, but I minimize the ChickenAndEgg random issue
if c.metadata.definition_type == DEFINITION_TYPE_BNF or sheerka.isaset(ssc, c):
reordered.append(c)
else:
reordered.insert(0, c)
c_pe = self.resolve_concept_parsing_expression(context, c, grammar, to_update, to_keep)
if self.check_for_infinite_recursion(c_pe, {concept.id}, True):
continue
nodes = [ConceptExpression(c, rule_name=c.name) for c in reordered]
valid_concepts.append(c)
nodes = [ConceptExpression(c, rule_name=c.name) for c in valid_concepts]
resolved = self.resolve_parsing_expression(ssc,
OrderedChoice(*nodes),
LongestChoice(*nodes),
grammar,
to_resolve,
isa_concepts)
to_update,
to_keep)
ssc.add_values(concepts_in_group=concepts_in_group)
ssc.add_values(return_values=resolved)
else:
desc = f"Concept is a simple concept."
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
tokens = Tokenizer(concept.name, yield_eof=False)
nodes = [StrMatch(token.strip_quote) for token in tokens]
expression = nodes[0] if len(nodes) == 1 else Sequence(nodes)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_resolve, isa_concepts)
to_keep.add(concept.id)
expression = self.get_expression_from_concept_name(concept.name)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_update, to_keep)
grammar[concept.id] = resolved
@@ -1035,7 +1154,7 @@ class BnfNodeParser(BaseNodeParser):
sub_context.add_values(return_values=resolved)
return resolved
def resolve_parsing_expression(self, context, expression, grammar, to_resolve, isa_concepts):
def resolve_parsing_expression(self, context, expression, grammar, to_update, to_keep):
if isinstance(expression, str):
ret = StrMatch(expression, ignore_case=self.ignore_case)
@@ -1051,7 +1170,7 @@ class BnfNodeParser(BaseNodeParser):
unknown_concept = self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=concept)
return self.add_error(unknown_concept)
pe = self.resolve_concept_parsing_expression(context, concept, grammar, to_resolve, isa_concepts)
pe = self.resolve_concept_parsing_expression(context, concept, grammar, to_update, to_keep)
if not isinstance(pe, (ParsingExpression, UnderConstruction)):
return pe # an error is detected, escalate it
@@ -1060,7 +1179,7 @@ class BnfNodeParser(BaseNodeParser):
# return pe # we are looking for ourself, just return it
if isinstance(pe, UnderConstruction):
to_resolve[id(expression)] = expression
to_update.add(ToUpdate(context.obj.id, expression))
expression.nodes = [pe]
expression.rule_name = expression.rule_name or concept.name
@@ -1073,17 +1192,18 @@ class BnfNodeParser(BaseNodeParser):
elif isinstance(expression, Sequence) or \
isinstance(expression, OrderedChoice) or \
isinstance(expression, LongestChoice) or \
isinstance(expression, ZeroOrMore) or \
isinstance(expression, OneOrMore) or \
isinstance(expression, Optional):
ret = expression
ret.nodes = []
for e in ret.elements:
pe = self.resolve_parsing_expression(context, e, grammar, to_resolve, isa_concepts)
pe = self.resolve_parsing_expression(context, e, grammar, to_update, to_keep)
if not isinstance(pe, (ParsingExpression, UnderConstruction)):
return pe # an error is detected, escalate it
if isinstance(pe, UnderConstruction):
to_resolve[id(ret)] = ret # remember that there is an unresolved parsing expression
to_update.add(ToUpdate(context.obj.id, ret))
ret.nodes.append(pe)
else:
@@ -1094,8 +1214,8 @@ class BnfNodeParser(BaseNodeParser):
expression.sep = self.resolve_parsing_expression(context,
expression.sep,
grammar,
to_resolve,
isa_concepts)
to_update,
to_keep)
return ret