Enhanced complex concepts handling
This commit is contained in:
+102
-14
@@ -34,10 +34,10 @@ def flatten(iterable):
|
||||
|
||||
@dataclass()
|
||||
class LexerNode(Node):
|
||||
start: int
|
||||
end: int
|
||||
tokens: list = None
|
||||
source: str = None
|
||||
start: int # starting index in the tokens list
|
||||
end: int # ending index in the tokens list
|
||||
tokens: list = None # tokens
|
||||
source: str = None # string representation of what was parsed
|
||||
|
||||
def __post_init__(self):
|
||||
if self.source is None:
|
||||
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
def fix_source(self):
    """Rebuild ``self.source`` from this node's token list.

    Called once a run of unrecognized tokens is finalized, so that
    ``source`` reflects the full span that was accumulated.
    """
    # NOTE(review): presumably get_text_from_tokens concatenates the tokens'
    # raw text — confirm against BaseParser.
    self.source = BaseParser.get_text_from_tokens(self.tokens)
|
||||
|
||||
def not_whitespace(self):
    """Return True unless this node holds exactly one whitespace/newline token.

    Used to decide whether an unrecognized-token run is worth keeping:
    a lone WHITESPACE or NEWLINE token is considered ignorable.
    """
    # More than one token (or none) can never be a single whitespace token.
    if len(self.tokens) != 1:
        return True
    return self.tokens[0].type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) != 3:
|
||||
return False
|
||||
return self.start == other[0] and self.end == other[1] and self.source == other[2]
|
||||
|
||||
if not isinstance(other, UnrecognizedTokensNode):
|
||||
return False
|
||||
|
||||
@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) == 2:
|
||||
return self.concept == other[0] and self.source == other[1]
|
||||
return self.concept.key == other[0] and self.source == other[1]
|
||||
else:
|
||||
return self.concept == other[0] and \
|
||||
return self.concept.key == other[0] and \
|
||||
self.start == other[1] and \
|
||||
self.end == other[2] and \
|
||||
self.source == other[3]
|
||||
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):
|
||||
|
||||
self.token = None
|
||||
self.pos = -1
|
||||
self.next_token()
|
||||
self.next_token(False)
|
||||
return True
|
||||
|
||||
def get_token(self) -> Token:
|
||||
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
|
||||
self.seek(init_pos)
|
||||
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
|
||||
if node is not None and node.end != -1:
|
||||
updated_concept = self.finalize_concept(context.sheerka, concept, node)
|
||||
concept_node = ConceptNode(
|
||||
concept,
|
||||
updated_concept,
|
||||
node.start,
|
||||
node.end,
|
||||
self.tokens[node.start: node.end + 1],
|
||||
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
|
||||
unrecognized_tokens.add_token(self.get_token(), init_pos)
|
||||
else:
|
||||
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
|
||||
if not self.next_token(False):
|
||||
break
|
||||
|
||||
else: # some concepts are recognized
|
||||
if unrecognized_tokens:
|
||||
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
|
||||
unrecognized_tokens.fix_source()
|
||||
unrecognized_tokens = None
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
unrecognized_tokens = None
|
||||
|
||||
res = self.get_bests(res) # only keep the concepts that eat the more tokens
|
||||
concepts_found = core.utils.product(concepts_found, res)
|
||||
|
||||
# loop
|
||||
self.seek(res[0].end)
|
||||
if not self.next_token():
|
||||
if not self.next_token(False):
|
||||
break
|
||||
|
||||
# Fix the source for unrecognized tokens
|
||||
if unrecognized_tokens:
|
||||
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
|
||||
unrecognized_tokens.fix_source()
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
|
||||
# else
|
||||
# returns as many ReturnValue than choices found
|
||||
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
|
||||
self.log_multiple_results(context, text, ret)
|
||||
return ret
|
||||
|
||||
def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
    """
    Updates the properties of the concept
    Goes in recursion if the property is a concept

    :param sheerka: factory/registry used to create the concept
        (``sheerka.new(key)``) — NOTE(review): confirm exact role.
    :param template: concept template; its ``key`` / ``id`` form the
        lookup key for the new concept instance.
    :param underlying: parse-tree node (TerminalNode or NonTerminalNode)
        produced by the grammar for this concept.
    :param init_empty_body: when True and the new concept has no body yet,
        the body is filled from the parsed text and marked as evaluated.
    :return: the newly created and populated concept.
    """

    # this cache is to make sure that we return the same concept for the same ConceptMatch
    # (keyed by id() so distinct but equal nodes are NOT merged)
    _underlying_value_cache = {}

    def _add_prop(_concept, prop_name, value):
        """
        Adds a new entry,
        makes a list if the property already exists
        """
        if prop_name not in _concept.props or _concept.props[prop_name].value is None:
            # new entry
            _concept.set_prop(prop_name, value)
        else:
            # make a list if there was a value
            previous_value = _concept.props[prop_name].value
            if isinstance(previous_value, list):
                previous_value.append(value)
            else:
                new_value = [previous_value, value]
                _concept.set_prop(prop_name, new_value)

    def _look_for_concept_match(_underlying):
        # Walk down single-child chains looking for a node whose parsing
        # expression is a ConceptMatch; stop at the first branch point.
        if isinstance(_underlying.parsing_expression, ConceptMatch):
            return _underlying

        if not isinstance(_underlying, NonTerminalNode):
            return None

        if len(_underlying.children) != 1:
            return None

        return _look_for_concept_match(_underlying.children[0])

    def _get_underlying_value(_underlying):
        # A node that wraps a ConceptMatch resolves to a (recursively
        # finalized) concept; anything else resolves to its source text.
        concept_match_node = _look_for_concept_match(_underlying)
        if concept_match_node:
            if id(concept_match_node) in _underlying_value_cache:
                result = _underlying_value_cache[id(concept_match_node)]
            else:
                ref_tpl = concept_match_node.parsing_expression.concept
                # recursion: the matched sub-concept is finalized the same way
                result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
                _underlying_value_cache[id(concept_match_node)] = result
        else:
            result = _underlying.source

        return result

    def _process_rule_name(_concept, _underlying):
        # Any node carrying a rule_name contributes a property entry;
        # then descend into all children (depth-first).
        if _underlying.parsing_expression.rule_name:
            value = _get_underlying_value(_underlying)
            _add_prop(_concept, _underlying.parsing_expression.rule_name, value)

        if isinstance(_underlying, NonTerminalNode):
            for child in _underlying.children:
                _process_rule_name(_concept, child)

    # template.id (when present) disambiguates concepts sharing a key
    key = (template.key, template.id) if template.id else template.key
    concept = sheerka.new(key)
    if init_empty_body and concept.body is None:
        # NOTE(review): reads concept.body but writes concept.metadata.body —
        # presumably `body` is a property backed by metadata; confirm.
        value = _get_underlying_value(underlying)
        concept.metadata.body = value
        concept.metadata.is_evaluated = True
        if underlying.parsing_expression.rule_name:
            # root node itself may carry a rule name; record it as a prop too
            _add_prop(concept, underlying.parsing_expression.rule_name, value)

    if isinstance(underlying, NonTerminalNode):
        for node in underlying.children:
            _process_rule_name(concept, node)

    return concept
|
||||
|
||||
@staticmethod
|
||||
def get_bests(results):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user