Enhanced complex concepts handling
This commit is contained in:
+102
-14
@@ -34,10 +34,10 @@ def flatten(iterable):
|
||||
|
||||
@dataclass()
|
||||
class LexerNode(Node):
|
||||
start: int
|
||||
end: int
|
||||
tokens: list = None
|
||||
source: str = None
|
||||
start: int # starting index in the tokens list
|
||||
end: int # ending index in the tokens list
|
||||
tokens: list = None # tokens
|
||||
source: str = None # string representation of what was parsed
|
||||
|
||||
def __post_init__(self):
|
||||
if self.source is None:
|
||||
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
|
||||
def fix_source(self):
    """Rebuild ``source`` from the tokens currently held by this node.

    The text is produced by ``BaseParser.get_text_from_tokens``.
    """
    rebuilt_text = BaseParser.get_text_from_tokens(self.tokens)
    self.source = rebuilt_text
|
||||
|
||||
def not_whitespace(self):
    """Tell whether this node is anything other than one single blank token.

    :return: False only when the node holds exactly one token and that token
        is of kind WHITESPACE or NEWLINE; True in every other case
    """
    # zero or several tokens: never considered "just whitespace"
    if len(self.tokens) != 1:
        return True

    return self.tokens[0].type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) != 3:
|
||||
return False
|
||||
return self.start == other[0] and self.end == other[1] and self.source == other[2]
|
||||
|
||||
if not isinstance(other, UnrecognizedTokensNode):
|
||||
return False
|
||||
|
||||
@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, tuple):
|
||||
if len(other) == 2:
|
||||
return self.concept == other[0] and self.source == other[1]
|
||||
return self.concept.key == other[0] and self.source == other[1]
|
||||
else:
|
||||
return self.concept == other[0] and \
|
||||
return self.concept.key == other[0] and \
|
||||
self.start == other[1] and \
|
||||
self.end == other[2] and \
|
||||
self.source == other[3]
|
||||
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):
|
||||
|
||||
self.token = None
|
||||
self.pos = -1
|
||||
self.next_token()
|
||||
self.next_token(False)
|
||||
return True
|
||||
|
||||
def get_token(self) -> Token:
|
||||
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
|
||||
self.seek(init_pos)
|
||||
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
|
||||
if node is not None and node.end != -1:
|
||||
updated_concept = self.finalize_concept(context.sheerka, concept, node)
|
||||
concept_node = ConceptNode(
|
||||
concept,
|
||||
updated_concept,
|
||||
node.start,
|
||||
node.end,
|
||||
self.tokens[node.start: node.end + 1],
|
||||
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
|
||||
unrecognized_tokens.add_token(self.get_token(), init_pos)
|
||||
else:
|
||||
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
|
||||
if not self.next_token(False):
|
||||
break
|
||||
|
||||
else: # some concepts are recognized
|
||||
if unrecognized_tokens:
|
||||
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
|
||||
unrecognized_tokens.fix_source()
|
||||
unrecognized_tokens = None
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
unrecognized_tokens = None
|
||||
|
||||
res = self.get_bests(res) # only keep the concepts that eat the more tokens
|
||||
concepts_found = core.utils.product(concepts_found, res)
|
||||
|
||||
# loop
|
||||
self.seek(res[0].end)
|
||||
if not self.next_token():
|
||||
if not self.next_token(False):
|
||||
break
|
||||
|
||||
# Fix the source for unrecognized tokens
|
||||
if unrecognized_tokens:
|
||||
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
|
||||
unrecognized_tokens.fix_source()
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
|
||||
# else
|
||||
# returns as many ReturnValues as there are choices found
|
||||
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
|
||||
self.log_multiple_results(context, text, ret)
|
||||
return ret
|
||||
|
||||
def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
    """
    Create a new concept from ``template`` and fill it from the parsed node ``underlying``.

    Every node of the parse tree whose parsing expression has a rule name adds a
    property of that name to the concept. When a node leads to a ConceptMatch,
    the property value is itself a finalized concept (recursive call); otherwise
    it is the source text of the node.

    :param sheerka: object used to instantiate the concept (``sheerka.new``)
    :param template: concept whose key (and id, when set) selects what to instantiate
    :param underlying: parsed node matched for the concept
    :param init_empty_body: when True and the new concept has no body, the body is
        set from the value of ``underlying`` and the concept is flagged as evaluated
    :return: the newly created concept
    """

    # one resolved value per ConceptMatch node (keyed by id of the node), so that
    # the same node always yields the same concept within this call
    resolved_by_node_id = {}

    def _add_prop(_concept, prop_name, value):
        """Store ``value`` under ``prop_name``; a repeated name accumulates into a list."""
        already_set = prop_name in _concept.props and _concept.props[prop_name].value is not None
        if not already_set:
            _concept.set_prop(prop_name, value)
            return

        current = _concept.props[prop_name].value
        if isinstance(current, list):
            current.append(value)
        else:
            _concept.set_prop(prop_name, [current, value])

    def _look_for_concept_match(_underlying):
        """Follow single-child chains downwards; return the first ConceptMatch node or None."""
        current = _underlying
        while True:
            if isinstance(current.parsing_expression, ConceptMatch):
                return current
            if not isinstance(current, NonTerminalNode):
                return None
            if len(current.children) != 1:
                return None
            current = current.children[0]

    def _get_underlying_value(_underlying):
        """Return the finalized concept behind the node, or its source text when there is none."""
        match_node = _look_for_concept_match(_underlying)
        if not match_node:
            return _underlying.source

        cache_key = id(match_node)
        if cache_key not in resolved_by_node_id:
            referenced_template = match_node.parsing_expression.concept
            resolved_by_node_id[cache_key] = self.finalize_concept(
                sheerka, referenced_template, match_node.children[0], init_empty_body)
        return resolved_by_node_id[cache_key]

    def _process_rule_name(_concept, _underlying):
        """Add the property named by the node's rule (if any), then visit its children."""
        rule_name = _underlying.parsing_expression.rule_name
        if rule_name:
            _add_prop(_concept, rule_name, _get_underlying_value(_underlying))

        if isinstance(_underlying, NonTerminalNode):
            for child in _underlying.children:
                _process_rule_name(_concept, child)

    concept = sheerka.new((template.key, template.id) if template.id else template.key)

    if init_empty_body and concept.body is None:
        body_value = _get_underlying_value(underlying)
        concept.metadata.body = body_value
        concept.metadata.is_evaluated = True
        # NOTE(review): the top-level rule-name property is only added when the body
        # is initialised here (this is the only place its value is computed) — confirm.
        top_rule_name = underlying.parsing_expression.rule_name
        if top_rule_name:
            _add_prop(concept, top_rule_name, body_value)

    if isinstance(underlying, NonTerminalNode):
        for child in underlying.children:
            _process_rule_name(concept, child)

    return concept
|
||||
|
||||
@staticmethod
|
||||
def get_bests(results):
|
||||
"""
|
||||
|
||||
@@ -92,7 +92,8 @@ class DefConceptNode(DefaultParserNode):
|
||||
if isinstance(prop_value, ReturnValueConcept) and isinstance(prop_value.body,
|
||||
ParserResultConcept) and hasattr(
|
||||
prop_value.body.body, "ast_"):
|
||||
asts[part_key] = prop_value.body.body.ast_
|
||||
asts[part_key] = prop_value
|
||||
#asts[part_key] = prop_value.body.body.ast_
|
||||
return asts
|
||||
|
||||
|
||||
|
||||
@@ -46,7 +46,8 @@ class ExactConceptParser(BaseParser):
|
||||
if sheerka.isinstance(result, BuiltinConcepts.UNKNOWN_CONCEPT):
|
||||
continue
|
||||
|
||||
concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
|
||||
# concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
|
||||
concepts = result if isinstance(result, list) else [result]
|
||||
|
||||
for concept in concepts:
|
||||
context.log(self.verbose_log, f"Recognized concept {concept}.", self.name)
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import TokenKind
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
|
||||
import core.utils
|
||||
|
||||
# Module-level ConceptLexerParser instance. MultipleConceptsParser.parse compares
# the ``parser`` of the incoming result against it to decide whether to handle it.
concept_lexer_parser = ConceptLexerParser()
|
||||
|
||||
|
||||
class MultipleConceptsParser(BaseParser):
|
||||
"""
|
||||
Parser that will take the result of ConceptLexerParser and
|
||||
try to resolve the unrecognized tokens token by token
|
||||
|
||||
It is a success when it returns a list made exclusively of ConceptNode
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
    """Initialise the base parser with the name "MultipleConcepts" and the value 45.

    Keyword arguments are accepted but not used.
    """
    super().__init__("MultipleConcepts", 45)
|
||||
|
||||
def parse(self, context, text):
    """
    Re-examine, token by token, the unrecognized tokens of a ConceptLexerParser result.

    Each IDENTIFIER token found inside an UnrecognizedTokensNode is looked up with
    ``context.new_concept``. When it resolves to one or several concepts it is
    replaced by the corresponding ConceptNode(s). The other tokens are regrouped
    into new UnrecognizedTokensNode instances; a group made of one single
    whitespace/newline token is left out of the result but still contributes
    to ``source``.

    :param context: execution context; provides ``sheerka`` and ``new_concept``
    :param text: expected to be a PARSER_RESULT whose parser is the ConceptLexerParser
    :return: None when ``text`` is not such a result; otherwise one return value,
        or a list of them when several combinations of concepts are possible.
        The status is True only when every non-blank token was resolved to a concept.
    """
    sheerka = context.sheerka
    if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
        return None

    if not text.parser == concept_lexer_parser:
        return None

    blank_kinds = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
    nodes_found = [[]]  # every possible combination of nodes found so far
    source = ""
    concepts_only = True

    def _flush(pending, combinations, text_so_far):
        """Close the pending unrecognized node (if any); return the updated pair."""
        if not pending:
            return combinations, text_so_far
        pending.fix_source()
        text_so_far += pending.source
        if pending.not_whitespace():
            combinations = core.utils.product(combinations, [pending])
        return combinations, text_so_far

    for node in text.value:
        if not isinstance(node, UnrecognizedTokensNode):
            # already recognized by the lexer parser: keep as is
            nodes_found = core.utils.product(nodes_found, [node])
            source += node.source
            continue

        unrecognized_tokens = None
        for i, token in enumerate(node.tokens):
            index = node.start + i

            if token.type == TokenKind.IDENTIFIER:
                # it may be a concept
                concept = context.new_concept(token.value)
                if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
                    # finish processing the tokens accumulated before this concept
                    nodes_found, source = _flush(unrecognized_tokens, nodes_found, source)
                    unrecognized_tokens = None

                    source += token.value
                    concepts = concept if hasattr(concept, "__iter__") else [concept]
                    concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
                    nodes_found = core.utils.product(nodes_found, concepts_nodes)
                    continue

            # The token was not resolved to a concept. Anything but a blank token
            # (this includes an identifier that is not a known concept) means the
            # result is no longer made of concepts only.
            if token.type not in blank_kinds:
                concepts_only = False

            if unrecognized_tokens:
                unrecognized_tokens.add_token(token, index)
            else:
                unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])

        nodes_found, source = _flush(unrecognized_tokens, nodes_found, source)

    ret = [
        sheerka.ret(
            self.name,
            concepts_only,
            sheerka.new(
                BuiltinConcepts.PARSER_RESULT,
                parser=self,
                source=source,
                body=choice,
                try_parsed=None))
        for choice in nodes_found
    ]

    if len(ret) == 1:
        self.log_result(context, source, ret[0])
        return ret[0]

    self.log_multiple_results(context, source, ret)
    return ret
|
||||
@@ -1,7 +1,7 @@
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.tokenizer import Tokenizer, LexerError, TokenKind
|
||||
from parsers.BaseParser import BaseParser, Node, ErrorNode
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
import ast
|
||||
import logging
|
||||
|
||||
@@ -17,10 +17,12 @@ class PythonErrorNode(ErrorNode):
|
||||
# self.log.debug("-> PythonErrorNode: " + str(self.exception))
|
||||
|
||||
|
||||
@dataclass()
|
||||
class PythonNode(Node):
|
||||
source: str
|
||||
ast_: ast.AST
|
||||
|
||||
def __init__(self, source, ast_, concepts=None):
|
||||
self.source = source
|
||||
self.ast_ = ast_
|
||||
self.concepts = concepts or {}
|
||||
|
||||
# def __repr__(self):
|
||||
# return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
|
||||
@@ -67,7 +69,7 @@ class PythonParser(BaseParser):
|
||||
tree = None
|
||||
|
||||
python_switcher = {
|
||||
TokenKind.CONCEPT: lambda t: f"__C__{t.value}__C__"
|
||||
TokenKind.CONCEPT: lambda t: f"__C__USE_CONCEPT__{t.value}__C__"
|
||||
}
|
||||
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from parsers.BaseParser import BaseParser
|
||||
from parsers.ConceptLexerParser import UnrecognizedTokensNode, ConceptNode
|
||||
from parsers.PythonParser import PythonParser
|
||||
|
||||
|
||||
class PythonWithConceptsParser(BaseParser):
|
||||
def __init__(self, **kwargs):
    """Initialise the base parser with the name "PythonWithConcepts" and the value 20.

    Keyword arguments are accepted but not used. Both identifier attributes
    start as None.
    """
    super().__init__("PythonWithConcepts", 20)
    self.identifiers = self.identifiers_key = None
|
||||
|
||||
@staticmethod
def sanitize(identifier):
    """
    Make ``identifier`` safe to embed inside a generated Python name.

    Every character that is not alphanumeric is replaced by "0" (this includes
    the underscore). Characters that ``str.isalnum`` accepts but that cannot
    appear in a Python identifier (e.g. "²" or "½") are replaced as well, so
    the generated name always stays parsable.

    :param identifier: text to sanitize
    :return: a string of the same length containing only identifier-safe characters
    """
    return "".join(
        # "_" + c is an identifier only when c is legal after the first position
        c if c.isalnum() and ("_" + c).isidentifier() else "0"
        for c in identifier
    )
|
||||
|
||||
def parse(self, context, text):
    """
    Try to parse as Python a list of lexer nodes in which concepts were recognized.

    Each ConceptNode is replaced by a generated identifier of the form
    ``__C__<sanitized key or name>[__<id>][_<n>]__C__``; the other nodes
    contribute their source text unchanged. The resulting text is handed to a
    PythonParser. On success, the node it produced gets the original source
    and the mapping from generated identifier to concept.

    :param context: execution context; provides ``sheerka`` and ``push``
    :param text: expected to be a PARSER_RESULT whose body is a non-empty list
        starting with a ConceptNode or an UnrecognizedTokensNode
    :return: None when ``text`` does not have that shape; otherwise a return
        value whose status is the status of the Python parsing
    """
    sheerka = context.sheerka
    if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
        return None

    nodes = text.body
    if not isinstance(nodes, list) or len(nodes) == 0:
        return None

    # only the first element is inspected to decide whether the list is ours
    if not isinstance(nodes[0], (ConceptNode, UnrecognizedTokensNode)):
        return None

    # Kept as locals (not instance attributes) so the parser stays stateless.
    name_by_concept_id = {}   # id(concept) -> generated identifier
    reuse_count = {}          # identifier without suffix -> number of extra uses
    python_ids_mappings = {}  # generated identifier -> concept

    def _get_identifier(c):
        """Return the identifier generated for this very concept object, creating it if needed."""
        known = name_by_concept_id.get(id(c))
        if known is not None:
            return known

        stem = "__C__" + self.sanitize(c.key or c.name)
        if c.id:
            stem += "__" + c.id

        if stem in reuse_count:
            # same stem used by another concept object: disambiguate with a counter
            reuse_count[stem] += 1
            name = f"{stem}_{reuse_count[stem]}__C__"
        else:
            reuse_count[stem] = 0
            name = stem + "__C__"

        name_by_concept_id[id(c)] = name
        return name

    source = ""
    to_parse = ""
    for node in nodes:
        source += node.source
        if not isinstance(node, ConceptNode):
            to_parse += node.source
            continue

        concept = node.concept
        python_id = _get_identifier(concept)
        # separate the identifier from whatever text precedes it
        to_parse = f"{to_parse} {python_id}" if to_parse else python_id
        python_ids_mappings[python_id] = concept

    with context.push(self, f"Trying Python for '{to_parse}'") as sub_context:
        result = PythonParser().parse(sub_context, to_parse)

    if not result.status:
        return sheerka.ret(self.name, False, result.body)

    python_node = result.body.body
    python_node.source = source
    python_node.concepts = python_ids_mappings

    parser_result = sheerka.new(
        BuiltinConcepts.PARSER_RESULT,
        parser=self,
        source=source,
        body=result.body.body,
        try_parsed=None)
    return sheerka.ret(self.name, True, parser_result)
|
||||
|
||||
def concept_identifier(self, concept):
    """
    Return the Python-side identifier standing for ``concept``.

    An identifier already present in ``self.identifiers`` for this very object
    (keyed by ``id(concept)``) is returned as is; otherwise the identifier is
    built as ``__C__<key or name>[__<id>]__C__``.

    :param concept: object exposing ``key``, ``name`` and ``id``
    :return: the identifier, as a string
    """
    # self.identifiers is None until something populates it (see __init__);
    # a membership test on None would raise TypeError.
    known = self.identifiers or {}
    if id(concept) in known:
        return known[id(concept)]

    identifier = "__C__" + (concept.key or concept.name)
    if concept.id:
        # formatted rather than concatenated so a non-string id does not raise
        identifier += f"__{concept.id}"

    return identifier + "__C__"
|
||||
Reference in New Issue
Block a user