Enhanced handling of complex concepts

This commit is contained in:
2020-01-11 08:03:35 +01:00
parent a62c1f0f13
commit 40416ac337
24 changed files with 1647 additions and 961 deletions
+102 -14
View File
@@ -34,10 +34,10 @@ def flatten(iterable):
@dataclass()
class LexerNode(Node):
start: int
end: int
tokens: list = None
source: str = None
start: int # starting index in the tokens list
end: int # ending index in the tokens list
tokens: list = None # tokens
source: str = None # string representation of what was parsed
def __post_init__(self):
if self.source is None:
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
def fix_source(self):
    """Recompute this node's textual source from its token list."""
    text = BaseParser.get_text_from_tokens(self.tokens)
    self.source = text
def not_whitespace(self):
    """Return True unless the node holds exactly one whitespace/newline token."""
    if len(self.tokens) != 1:
        return True
    return self.tokens[0].type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) != 3:
return False
return self.start == other[0] and self.end == other[1] and self.source == other[2]
if not isinstance(other, UnrecognizedTokensNode):
return False
@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) == 2:
return self.concept == other[0] and self.source == other[1]
return self.concept.key == other[0] and self.source == other[1]
else:
return self.concept == other[0] and \
return self.concept.key == other[0] and \
self.start == other[1] and \
self.end == other[2] and \
self.source == other[3]
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):
self.token = None
self.pos = -1
self.next_token()
self.next_token(False)
return True
def get_token(self) -> Token:
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
if node is not None and node.end != -1:
updated_concept = self.finalize_concept(context.sheerka, concept, node)
concept_node = ConceptNode(
concept,
updated_concept,
node.start,
node.end,
self.tokens[node.start: node.end + 1],
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
if not self.next_token(False):
break
else: # some concepts are recognized
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
unrecognized_tokens = None
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# loop
self.seek(res[0].end)
if not self.next_token():
if not self.next_token(False):
break
# Fix the source for unrecognized tokens
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
# else
# returns as many ReturnValue than choices found
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
self.log_multiple_results(context, text, ret)
return ret
def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
    """
    Updates the properties of the concept.
    Goes in recursion if the property is a concept.

    Instantiates a new concept from *template* via ``sheerka.new(key)`` and
    fills its properties from the parse tree *underlying*: every node whose
    parsing expression carries a ``rule_name`` becomes a property, and a node
    that wraps a ConceptMatch is itself finalized recursively into a concept.

    :param sheerka: object providing ``new(key)`` to create concepts —
        presumably the concept store/registry; verify against caller
    :param template: concept template; ``template.key`` (and optional
        ``template.id``) identify what to instantiate
    :param underlying: parse node (TerminalNode or NonTerminalNode)
    :param init_empty_body: when True and the fresh concept has no body,
        the parsed value is stored as its metadata body
    :return: the newly created, populated concept
    """
    # this cache is to make sure that we return the same concept for the same ConceptMatch
    _underlying_value_cache = {}

    def _add_prop(_concept, prop_name, value):
        """
        Adds a new entry,
        makes a list if the property already exists
        """
        if prop_name not in _concept.props or _concept.props[prop_name].value is None:
            # new entry
            _concept.set_prop(prop_name, value)
        else:
            # make a list if there was a value
            previous_value = _concept.props[prop_name].value
            if isinstance(previous_value, list):
                previous_value.append(value)
            else:
                new_value = [previous_value, value]
                _concept.set_prop(prop_name, new_value)

    def _look_for_concept_match(_underlying):
        # Walk down single-child chains until a node whose parsing_expression
        # is a ConceptMatch is found; any branch point or terminal ends the
        # search with None.
        if isinstance(_underlying.parsing_expression, ConceptMatch):
            return _underlying
        if not isinstance(_underlying, NonTerminalNode):
            return None
        if len(_underlying.children) != 1:
            return None
        return _look_for_concept_match(_underlying.children[0])

    def _get_underlying_value(_underlying):
        # A ConceptMatch node yields a recursively finalized concept
        # (memoized by node identity); any other node yields its raw source text.
        concept_match_node = _look_for_concept_match(_underlying)
        if concept_match_node:
            if id(concept_match_node) in _underlying_value_cache:
                result = _underlying_value_cache[id(concept_match_node)]
            else:
                ref_tpl = concept_match_node.parsing_expression.concept
                result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
                _underlying_value_cache[id(concept_match_node)] = result
        else:
            result = _underlying.source
        return result

    def _process_rule_name(_concept, _underlying):
        # Named rules become concept properties; always recurse into children
        # so nested named rules are collected too.
        if _underlying.parsing_expression.rule_name:
            value = _get_underlying_value(_underlying)
            _add_prop(_concept, _underlying.parsing_expression.rule_name, value)
        if isinstance(_underlying, NonTerminalNode):
            for child in _underlying.children:
                _process_rule_name(_concept, child)

    # An id-qualified template instantiates under a (key, id) pair.
    key = (template.key, template.id) if template.id else template.key
    concept = sheerka.new(key)
    if init_empty_body and concept.body is None:
        value = _get_underlying_value(underlying)
        concept.metadata.body = value
        concept.metadata.is_evaluated = True
        # The root node's own rule_name (if any) is recorded here, reusing the
        # value already computed for the body.
        if underlying.parsing_expression.rule_name:
            _add_prop(concept, underlying.parsing_expression.rule_name, value)
    # NOTE(review): indentation reconstructed from a diff rendering — the block
    # below is assumed to run regardless of init_empty_body; confirm nesting.
    if isinstance(underlying, NonTerminalNode):
        for node in underlying.children:
            _process_rule_name(concept, node)
    return concept
@staticmethod
def get_bests(results):
"""