Fixed BnfNodeParser to allow expressions like 'number hundred' when number is a group

This commit is contained in:
2020-06-27 18:56:04 +02:00
parent d4468da8a3
commit 2c5840752a
14 changed files with 593 additions and 228 deletions
+8
View File
@@ -36,6 +36,7 @@ class Sheerka(Concept):
CONCEPTS_SYA_DEFINITION_ENTRY = "Concepts_Sya_Definitions"
RESOLVED_CONCEPTS_SYA_DEFINITION_ENTRY = "Resolved_Concepts_Sya_Definitions"
CONCEPTS_GRAMMARS_ENTRY = "Concepts_Grammars"
CHICKEN_AND_EGG_CONCEPTS_ENTRY = "Chicken_And_Egg_Concepts"
CONCEPTS_KEYS_ENTRY = "Concepts_Keys"
BUILTIN_CONCEPTS_KEYS = "Builtins_Concepts" # sequential key for builtin concepts
@@ -105,6 +106,10 @@ class Sheerka(Concept):
@property
def concepts_grammars(self):
    """Per-concept compiled grammar cache (keyed by concept id)."""
    # Bug fix: this property previously returned the chicken-and-egg cache;
    # it must read the CONCEPTS_GRAMMARS_ENTRY cache.
    return self.cache_manager.caches[self.CONCEPTS_GRAMMARS_ENTRY].cache

@property
def chicken_and_eggs(self):
    """Cache of concepts involved in circular (chicken-and-egg) references."""
    # Bug fix: this property previously returned the grammars cache;
    # it must read the CHICKEN_AND_EGG_CONCEPTS_ENTRY cache.
    return self.cache_manager.caches[self.CHICKEN_AND_EGG_CONCEPTS_ENTRY].cache
def bind_service_method(self, bound_method, as_name=None):
@@ -227,6 +232,9 @@ class Sheerka(Concept):
cache = Cache()
self.cache_manager.register_cache(self.CONCEPTS_GRAMMARS_ENTRY, cache, persist=False)
cache = Cache()
self.cache_manager.register_cache(self.CHICKEN_AND_EGG_CONCEPTS_ENTRY, cache, persist=False)
def initialize_services(self):
"""
Introspect to find services and bind them
@@ -26,7 +26,7 @@ class SheerkaEvaluateConcept(BaseService):
parent = context.get_parent()
while parent is not None:
if parent.who == context.who and parent.obj == concept:
if parent.who == context.who and parent.obj == concept and parent.obj.compiled == concept.compiled:
return True
parent = parent.get_parent()
@@ -147,6 +147,11 @@ class SheerkaEvaluateConcept(BaseService):
def resolve(self, context, to_resolve, current_prop, current_concept, force_evaluation):
def get_path(context_, prop_name):
    """Build a dotted debug path like '<context path>.<prop name>' for logging."""
    prefix = context_.path if hasattr(context_, "path") else "<N/A>"
    # Bug fix: test prop_name (the value actually dereferenced below), not the
    # enclosing current_prop. The only visible call site passes current_prop,
    # so behaviour is unchanged there, but the function is now self-contained.
    value = prop_name.name if isinstance(prop_name, ConceptParts) else prop_name
    return prefix + "." + value
if isinstance(to_resolve, DoNotResolve):
return to_resolve.value
@@ -161,12 +166,14 @@ class SheerkaEvaluateConcept(BaseService):
sub_context.add_values(return_values=ret_val)
return ret_val.body
desc = f"Evaluating {current_prop} (concept={current_concept})"
path = get_path(context, current_prop)
desc = f"Evaluating {path} (concept={current_concept})"
context.log(desc, self.NAME)
with context.push(BuiltinConcepts.EVALUATING_CONCEPT,
current_prop,
desc=desc,
obj=current_concept) as sub_context:
obj=current_concept,
path=path) as sub_context:
if force_evaluation:
sub_context.local_hints.add(BuiltinConcepts.EVAL_BODY_REQUESTED)
@@ -76,6 +76,7 @@ class SheerkaSetsManager(BaseService):
self.sheerka.new(BuiltinConcepts.CONCEPT_ALREADY_IN_SET, body=concept, concept_set=concept_set))
self.sets.put(concept_set.id, concept.id)
self.concepts_in_set.delete(concept_set.id)
return self.sheerka.ret(self.NAME, True, self.sheerka.new(BuiltinConcepts.SUCCESS))
def add_concepts_to_set(self, context, concepts, concept_set):
@@ -95,6 +96,7 @@ class SheerkaSetsManager(BaseService):
concept_set=concept_set)
else:
body = self.sheerka.new(BuiltinConcepts.SUCCESS)
self.concepts_in_set.delete(concept_set.id)
return self.sheerka.ret(self.NAME, len(already_in_set) != len(concepts), body)
+3 -3
View File
@@ -205,9 +205,9 @@ def make_unique(lst, get_id=None):
yield x
else:
for x in seq:
x = get_id(x)
if x not in seen:
seen.add(x)
_id = get_id(x)
if _id not in seen:
seen.add(_id)
yield x
return list(_make_unique(lst, get_id))
+1
View File
@@ -264,6 +264,7 @@ class AtomNodeParser(BaseNodeParser):
concept_parser.lock()
concepts = self.get_concepts(token, self._is_eligible, custom=_get_concepts_by_name)
#self.context.log(f"concepts found for {token=}: {concepts}", who=self.name)
if not concepts:
for concept_parser in concept_parser_helpers:
concept_parser.eat_unrecognized(token, pos)
+79 -12
View File
@@ -525,7 +525,7 @@ class CNC(CN):
to_compare = {k: v for k, v in other.concept.compiled.items() if k != ConceptParts.BODY}
else:
to_compare = other.concept.compiled
if self.compiled == to_compare:
if self.compiled == to_compare: # expanded form to ease the debug
return True
else:
return False
@@ -673,7 +673,8 @@ class BaseNodeParser(BaseParser):
concept = to_map(self, concept) if to_map else concept
result.append(concept)
return result + custom_concepts
return core.utils.make_unique(result + custom_concepts,
lambda c: c.concept.id if hasattr(c, "concept") else c.id)
return custom_concepts if custom else None
@@ -707,16 +708,20 @@ class BaseNodeParser(BaseParser):
@staticmethod
def resolve_concepts_by_first_keyword(context, concepts_by_first_keyword):
sheerka = context.sheerka
res = {}
def resolve_concepts(concept_str):
c_key, c_id = core.utils.unstr_concept(concept_str)
if c_id in already_seen:
return ChickenAndEggError(already_seen)
already_seen.add(c_id)
resolved = set()
to_resolve = set()
concept = sheerka.get_by_id(core.utils.unstr_concept(concept_str)[1])
chicken_and_egg = set()
if concept.id in already_seen:
raise ChickenAndEggError(already_seen)
else:
already_seen.add(concept.id)
concept = sheerka.get_by_id(c_id)
if sheerka.isaset(context, concept):
concepts = sheerka.get_set_elements(context, concept)
@@ -730,11 +735,18 @@ class BaseNodeParser(BaseParser):
(to_resolve if keyword.startswith("c:|") else resolved).add(keyword)
for concept_to_resolve_str in to_resolve:
resolved |= resolve_concepts(concept_to_resolve_str)
res = resolve_concepts(concept_to_resolve_str)
if isinstance(res, ChickenAndEggError):
chicken_and_egg |= res.concepts
else:
resolved |= res
to_resolve.clear()
return resolved
if len(resolved) == 0 and len(chicken_and_egg) > 0:
raise ChickenAndEggError(chicken_and_egg)
else:
return resolved
res = {}
for k, v in concepts_by_first_keyword.items():
if k.startswith("c:|"):
try:
@@ -744,8 +756,16 @@ class BaseNodeParser(BaseParser):
res.setdefault(resolved, []).extend(v)
except ChickenAndEggError as ex:
context.log(f"Chicken and egg detected for {k}, concepts={ex.concepts}")
# res[k] = sheerka.new(BuiltinConcepts.CHICKEN_AND_EGG,
# body=[sheerka.get_by_id(c) for c in ex.concepts])
concepts_in_recursion = ex.concepts
# make sure to have all the parents
for parent in v:
concepts_in_recursion.add(parent)
for concept_id in concepts_in_recursion:
# make sure we keep the longest chain
old = sheerka.chicken_and_eggs.get(concept_id)
if old is None or len(old) < len(ex.concepts):
sheerka.chicken_and_eggs.put(concept_id, concepts_in_recursion)
else:
res.setdefault(k, []).extend(v)
@@ -755,6 +775,53 @@ class BaseNodeParser(BaseParser):
return sheerka.ret("BaseNodeParser", True, res)
@staticmethod
def get_referenced_concepts(context, concept_id, already_seen):
    """
    Gets all the tokens that may allow to recognize concept concept_id.
    Basically, it returns all the starting tokens for concept concept_id.
    A ChickenAndEggError is involved when circular references are found:
    it is *returned* when concept_id was already visited, but *raised* when
    nothing at all could be resolved (see the notes below).
    :param context: evaluation context (provides the sheerka instance)
    :param concept_id: id of the concept whose first tokens are wanted
    :param already_seen: set of concept ids already visited in this walk;
        mutated in place to detect cycles across recursive calls
    :return: set of first-token keywords, or a ChickenAndEggError instance
        when concept_id is part of a cycle
    :raises ChickenAndEggError: when no token resolved and only cycles were found
    """
    # Cycle detected: return (not raise) so the direct caller can merge the
    # cycle members via the isinstance() check below.
    if concept_id in already_seen:
        return ChickenAndEggError(already_seen)
    already_seen.add(concept_id)
    resolved = set()
    to_resolve = set()
    chicken_and_egg = set()
    sheerka = context.sheerka
    concept = sheerka.get_by_id(concept_id)
    if sheerka.isaset(context, concept):
        # a group/set concept is recognized through its elements
        concepts = sheerka.get_set_elements(context, concept)
    else:
        concepts = [concept]
    for concept in concepts:
        BaseNodeParser.ensure_bnf(context, concept)  # need to make sure that it cannot fail
        keywords = BaseNodeParser.get_first_tokens(sheerka, concept)
        for keyword in keywords:
            # "c:|"-prefixed keywords reference other concepts and need a
            # recursive resolution; plain keywords are terminal tokens
            (to_resolve if keyword.startswith("c:|") else resolved).add(keyword)
    for concept_to_resolve_str in to_resolve:
        c_key, c_id = core.utils.unstr_concept(concept_to_resolve_str)
        res = BaseNodeParser.get_referenced_concepts(context, c_id, already_seen)
        if isinstance(res, ChickenAndEggError):
            # NOTE(review): this only catches the *returned* error; an error
            # raised deeper in the recursion propagates uncaught -- confirm
            # that is the intended behaviour.
            chicken_and_egg |= res.concepts
        else:
            resolved |= res
    to_resolve.clear()
    # Escalate only when *every* path ended in a cycle; partial resolution
    # (some tokens found) is considered a success.
    if len(resolved) == 0 and len(chicken_and_egg) > 0:
        raise ChickenAndEggError(chicken_and_egg)
    else:
        return resolved
@staticmethod
def resolve_sya_associativity_and_precedence(context, sya):
    # Placeholder: associativity/precedence resolution for `sya` (presumably
    # a shunting-yard structure -- TODO confirm) is not implemented yet.
    pass
+185 -65
View File
@@ -38,7 +38,7 @@ class NonTerminalNode(LexerNode):
self.children = children
def __repr__(self):
name = self.parsing_expression.rule_name or self.parsing_expression.__class__.__name__
name = "Node:" + (self.parsing_expression.rule_name or self.parsing_expression.__class__.__name__)
if len(self.children) > 0:
sub_names = "(" + ",".join([repr(child) for child in self.children]) + ")"
else:
@@ -69,7 +69,7 @@ class TerminalNode(LexerNode):
self.value = value
def __repr__(self):
    """Render as 'Node:<rule name>' followed by the quoted matched value."""
    # Removed a dead store: `name` was assigned twice, the first (un-prefixed)
    # value being immediately overwritten by the "Node:"-prefixed one.
    name = "Node:" + (self.parsing_expression.rule_name or "")
    return name + f"'{self.value}'"
def __eq__(self, other):
@@ -186,7 +186,7 @@ class Sequence(ParsingExpression):
class OrderedChoice(ParsingExpression):
"""
Will match one among multiple
Will match the first one among multiple
It will stop at the first match (so the order of definition is important)
"""
@@ -211,6 +211,42 @@ class OrderedChoice(ParsingExpression):
return self.add_rule_name_if_needed(f"({to_str})")
class LongestChoice(ParsingExpression):
    """
    Will match the longest one among multiple alternatives.
    All elements will be tested, so the order is not important.
    The behaviour when multiple candidates of the same length are found is
    not defined yet (currently the first such candidate tried is kept).
    """

    def _parse(self, parser_helper):
        init_pos = parser_helper.pos
        longest_node = None
        end_pos = -1  # parser position recorded right after the longest match
        for e in self.nodes:
            node = e.parse(parser_helper)
            if node:
                # keep the candidate that consumed the most input (strict >,
                # so an equal-length later candidate does not replace it)
                if longest_node is None or node.end > longest_node.end:
                    longest_node = node
                    end_pos = parser_helper.pos
            parser_helper.seek(init_pos)  # backtrack before trying the next alternative
        if longest_node is None:
            return None
        # re-position the parser at the end of the winning alternative
        parser_helper.seek(end_pos)
        return NonTerminalNode(self,
                               init_pos,
                               longest_node.end,
                               parser_helper.parser.parser_input.tokens[init_pos: longest_node.end + 1],
                               [longest_node])

    def __repr__(self):
        # NOTE(review): repr walks self.elements while _parse walks self.nodes
        # -- presumably mirroring the other ParsingExpression subclasses;
        # confirm `elements` is always populated.
        to_str = "# ".join(repr(n) for n in self.elements)
        return self.add_rule_name_if_needed(f"({to_str})")
class Optional(ParsingExpression):
"""
Will match or not the elements
@@ -386,7 +422,12 @@ class StrMatch(Match):
self.skip_white_space = skip_whitespace
def __repr__(self):
    """Render the match target, flagging non-default matching options."""
    # Removed an unreachable early `return` that made the option-flag
    # rendering below dead code.
    text = self.to_match
    if not self.ignore_case:
        text += "#!ic"  # marker: case-sensitive match
    if not self.skip_white_space:
        text += "#!sw"  # marker: whitespace is significant
    return self.add_rule_name_if_needed(f"'{text}'")
def __eq__(self, other):
if not super().__eq__(other):
@@ -395,7 +436,9 @@ class StrMatch(Match):
if not isinstance(other, StrMatch):
return False
return self.to_match == other.to_match and self.ignore_case == other.ignore_case
return self.to_match == other.to_match and \
self.ignore_case == other.ignore_case and \
self.skip_white_space == other.skip_white_space
def _parse(self, parser_helper):
token = parser_helper.get_token()
@@ -766,7 +809,7 @@ class BnfConceptParserHelper:
_add_prop(_concept, _underlying.parsing_expression.rule_name, value)
_concept.metadata.need_validation = True
if isinstance(_underlying, NonTerminalNode):
elif isinstance(_underlying, NonTerminalNode):
for child in _underlying.children:
_process_rule_name(_concept, child)
@@ -789,6 +832,15 @@ class UnderConstruction:
concept_id: str
@dataclass()
class ToUpdate:
    # id of the concept whose parsing expression still contains an
    # UnderConstruction placeholder (presumably context.obj.id at the
    # creation sites -- TODO confirm)
    parent_id: int
    # the expression whose .nodes must be patched once resolution completes
    parsing_expression: ParsingExpression

    def __hash__(self):
        # Hash on parent_id only so instances are usable in a set; the
        # dataclass-generated __eq__ still compares both fields, and equal
        # instances therefore keep equal hashes (same parent_id).
        return hash(self.parent_id)
class BnfNodeParser(BaseNodeParser):
def __init__(self, **kwargs):
super().__init__("BnfNode", 50, **kwargs)
@@ -824,6 +876,34 @@ class BnfNodeParser(BaseNodeParser):
return valid_parser_helpers
@staticmethod
def get_expression_from_concept_name(name):
    """
    Create the parsing expression from the name.
    This function differs from BNFParser.parse() as it does not try to resolve identifiers into concepts
    >>> assert get_expression_from_concept_name('one hundred') == Sequence(StrMatch("one"), StrMatch("hundred"))
    while BNFParser.parse("one hundred") will look for concept 'one' and concept 'hundred'
    :param name: concept name to tokenize; may be None or blank
    :return: a single StrMatch when one token is produced, otherwise a
        Sequence of StrMatch; an empty list for a None/blank name
        (NOTE(review): the empty case returns a list, not a
        ParsingExpression -- confirm callers handle it)
    """
    if name is None or name.strip() == "":
        return []
    res = []
    tokens = Tokenizer(name, yield_eof=False)
    for token in tokens:
        if token.type == TokenKind.WHITESPACE:
            continue
        elif token.type == TokenKind.STRING:
            # a quoted token is re-tokenized; every sub-token but the last is
            # built with skip_whitespace=False (presumably to force adjacency
            # inside the quotes -- TODO confirm StrMatch's flag semantics)
            sub_tokens = list(Tokenizer(token.strip_quote, yield_eof=False))
            for sub_token in sub_tokens[:-1]:
                res.append(StrMatch(sub_token.str_value, skip_whitespace=False))
            res.append(StrMatch(sub_tokens[-1].str_value))
        else:
            res.append(StrMatch(token.str_value))
    return res[0] if len(res) == 1 else Sequence(*res)
def get_concepts_sequences(self):
"""
Main method that parses the tokens and extract the concepts
@@ -900,10 +980,10 @@ class BnfNodeParser(BaseNodeParser):
def check_for_infinite_recursion(self, parsing_expression, already_found, only_first=False):
if isinstance(parsing_expression, ConceptExpression):
if parsing_expression.concept in already_found:
if parsing_expression.concept.id in already_found:
return True
already_found.add(parsing_expression.concept)
return self.check_for_infinite_recursion(parsing_expression.nodes[0], already_found, False)
already_found.add(parsing_expression.concept.id)
return self.check_for_infinite_recursion(parsing_expression.nodes[0], already_found, only_first)
if isinstance(parsing_expression, Sequence):
# for sequence, we need to check all nodes
@@ -930,43 +1010,93 @@ class BnfNodeParser(BaseNodeParser):
return False
return False
if isinstance(parsing_expression, LongestChoice):
for node in parsing_expression.nodes:
already_found_for_current_node = already_found.copy()
if self.check_for_infinite_recursion(node, already_found_for_current_node, True):
already_found.update(already_found_for_current_node)
return True
return False
if isinstance(parsing_expression, UnderConstruction):
if parsing_expression.concept_id in already_found:
return True
already_found.add(parsing_expression.concept_id)
return False
def get_parsing_expression(self, context, concept):
"""
Compute the parsing expression for a given concept
:param context:
:param concept:
:return:
"""
if concept.id in self.concepts_grammars:
return self.concepts_grammars.get(concept.id)
grammar = self.concepts_grammars.copy()
to_resolve = {} # the key is the instance id of the parsing expression
isa_concepts = set()
self.resolve_concept_parsing_expression(context, concept, grammar, to_resolve, isa_concepts)
# internal cache of already computed parsing expression to use during the recursion
grammar = {}
for _id, pe in to_resolve.items():
for i, node in enumerate(pe.nodes):
if isinstance(node, UnderConstruction):
pe.nodes[i] = grammar.get(node.concept_id)
# concept that are not totally resolved, because they reference parsing expression under construction
to_update = set() # the key is the instance id of the parsing expression
# during the parsing of concept, we will resolve other concepts
# keep the track of the concepts that can safely be added to self.concept_grammars
to_keep = {concept.id}
desc = f"Get parsing expression for concept {concept}"
with context.push(BuiltinConcepts.INIT_BNF, concept,
who=self.name,
obj=concept,
root_concept=concept,
desc=desc) as sub_context:
# get the parsing expression
ret = self.resolve_concept_parsing_expression(sub_context, concept, grammar, to_update, to_keep)
# check and update parsing expression that are still under construction
# Note that we only update the concept that will update concepts_grammars
# because pe.node may be large
for item in to_update:
if item.parent_id in to_keep:
pe = item.parsing_expression
for i, node in enumerate(pe.nodes):
if isinstance(node, UnderConstruction):
pe.nodes[i] = grammar.get(node.concept_id)
# check for infinite recursion.
# We are adding a new concept. Does it create an infinite recursion ?
concepts_in_recursion = set()
if self.check_for_infinite_recursion(pe, concepts_in_recursion):
cycle = context.sheerka.new(BuiltinConcepts.CHICKEN_AND_EGG, body={c.id for c in concepts_in_recursion})
for concept in concepts_in_recursion:
grammar[concept.id] = cycle
if self.check_for_infinite_recursion(ret, concepts_in_recursion):
cycle = context.sheerka.new(BuiltinConcepts.CHICKEN_AND_EGG, body=concepts_in_recursion)
for concept_id in concepts_in_recursion:
grammar[concept_id] = cycle
# Make sure you do not put isa concepts in cache
# why :
# twenties = 'twenty' number where number < 10
# hundreds = number 'hundred' where number < 99
# the concept of number depends on its utilisation
for concept_id in [c for c in grammar if c not in isa_concepts]:
self.concepts_grammars.put(concept_id, grammar[concept_id])
# update, in case of infinite circular recursion
ret = grammar[concept.id]
return self.concepts_grammars.get(concept.id)
# finally, update concept grammar
for k, v in grammar.items():
if k in to_keep:
self.concepts_grammars.put(k, v)
def resolve_concept_parsing_expression(self, context, concept, grammar, to_resolve, isa_concepts):
if concept.id in grammar:
# not quite sure that it is a good idea.
# Why do we want to corrupt previous valid entries ?
if context.sheerka.isinstance(v, BuiltinConcepts.CHICKEN_AND_EGG):
self.concepts_grammars.put(k, v)
sub_context.add_values(return_values=ret)
return ret
def resolve_concept_parsing_expression(self, context, concept, grammar, to_update, to_keep):
if concept.id in self.concepts_grammars: # validated entry
return self.concepts_grammars.get(concept.id)
if concept.id in grammar: # under construction entry
return grammar.get(concept.id)
desc = f"Get parsing expression for '{concept}'"
desc = f"Resolve concept parsing expression for '{concept}'"
with context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as sub_context:
if not concept.bnf: # to save a function call. Not sure it worth it.
BaseNodeParser.ensure_bnf(sub_context, concept, self.name)
@@ -979,52 +1109,41 @@ class BnfNodeParser(BaseNodeParser):
desc = f"Bnf concept detected. Resolving parsing expression '{expression}'"
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
ssc.add_inputs(expression=expression)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_resolve, isa_concepts)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_update, to_keep)
ssc.add_values(return_values=resolved)
elif sheerka.isaset(context, concept):
desc = f"Concept is a group. Resolving parsing expression using 'isa'"
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
ssc.add_inputs(concept=concept)
isa_concepts.add(concept.id)
concepts_in_group = self.sheerka.get_set_elements(ssc, concept)
# concepts_in_group comes from a set, so the order of its elements is not guaranteed
# to avoid random failure (ie random CHICKEN_AND_EGG), we need to rearrange
# We also remove the root concept (the one from get_parsing_expression())
root_concept_as_set = set(context.search(
predicate=lambda ec: ec.action == BuiltinConcepts.INIT_BNF,
get_obj=lambda ec: ec.obj,
stop=lambda ec: ec.action != BuiltinConcepts.INIT_BNF)) # there only one item in the set
root_concept = list(root_concept_as_set)[0]
reordered = []
valid_concepts = []
for c in concepts_in_group:
if c.id == root_concept.id:
if c.id == context.root_concept.id:
continue
# I do not guaranty the same order every time, but I minimize the ChickenAndEgg random issue
if c.metadata.definition_type == DEFINITION_TYPE_BNF or sheerka.isaset(ssc, c):
reordered.append(c)
else:
reordered.insert(0, c)
c_pe = self.resolve_concept_parsing_expression(context, c, grammar, to_update, to_keep)
if self.check_for_infinite_recursion(c_pe, {concept.id}, True):
continue
nodes = [ConceptExpression(c, rule_name=c.name) for c in reordered]
valid_concepts.append(c)
nodes = [ConceptExpression(c, rule_name=c.name) for c in valid_concepts]
resolved = self.resolve_parsing_expression(ssc,
OrderedChoice(*nodes),
LongestChoice(*nodes),
grammar,
to_resolve,
isa_concepts)
to_update,
to_keep)
ssc.add_values(concepts_in_group=concepts_in_group)
ssc.add_values(return_values=resolved)
else:
desc = f"Concept is a simple concept."
with sub_context.push(BuiltinConcepts.INIT_BNF, concept, who=self.name, obj=concept, desc=desc) as ssc:
tokens = Tokenizer(concept.name, yield_eof=False)
nodes = [StrMatch(token.strip_quote) for token in tokens]
expression = nodes[0] if len(nodes) == 1 else Sequence(nodes)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_resolve, isa_concepts)
to_keep.add(concept.id)
expression = self.get_expression_from_concept_name(concept.name)
resolved = self.resolve_parsing_expression(ssc, expression, grammar, to_update, to_keep)
grammar[concept.id] = resolved
@@ -1035,7 +1154,7 @@ class BnfNodeParser(BaseNodeParser):
sub_context.add_values(return_values=resolved)
return resolved
def resolve_parsing_expression(self, context, expression, grammar, to_resolve, isa_concepts):
def resolve_parsing_expression(self, context, expression, grammar, to_update, to_keep):
if isinstance(expression, str):
ret = StrMatch(expression, ignore_case=self.ignore_case)
@@ -1051,7 +1170,7 @@ class BnfNodeParser(BaseNodeParser):
unknown_concept = self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=concept)
return self.add_error(unknown_concept)
pe = self.resolve_concept_parsing_expression(context, concept, grammar, to_resolve, isa_concepts)
pe = self.resolve_concept_parsing_expression(context, concept, grammar, to_update, to_keep)
if not isinstance(pe, (ParsingExpression, UnderConstruction)):
return pe # an error is detected, escalate it
@@ -1060,7 +1179,7 @@ class BnfNodeParser(BaseNodeParser):
# return pe # we are looking for ourself, just return it
if isinstance(pe, UnderConstruction):
to_resolve[id(expression)] = expression
to_update.add(ToUpdate(context.obj.id, expression))
expression.nodes = [pe]
expression.rule_name = expression.rule_name or concept.name
@@ -1073,17 +1192,18 @@ class BnfNodeParser(BaseNodeParser):
elif isinstance(expression, Sequence) or \
isinstance(expression, OrderedChoice) or \
isinstance(expression, LongestChoice) or \
isinstance(expression, ZeroOrMore) or \
isinstance(expression, OneOrMore) or \
isinstance(expression, Optional):
ret = expression
ret.nodes = []
for e in ret.elements:
pe = self.resolve_parsing_expression(context, e, grammar, to_resolve, isa_concepts)
pe = self.resolve_parsing_expression(context, e, grammar, to_update, to_keep)
if not isinstance(pe, (ParsingExpression, UnderConstruction)):
return pe # an error is detected, escalate it
if isinstance(pe, UnderConstruction):
to_resolve[id(ret)] = ret # remember that there is an unresolved parsing expression
to_update.add(ToUpdate(context.obj.id, ret))
ret.nodes.append(pe)
else:
@@ -1094,8 +1214,8 @@ class BnfNodeParser(BaseNodeParser):
expression.sep = self.resolve_parsing_expression(context,
expression.sep,
grammar,
to_resolve,
isa_concepts)
to_update,
to_keep)
return ret