Enhanced complex concepts handling

This commit is contained in:
2020-01-11 08:03:35 +01:00
parent a62c1f0f13
commit 40416ac337
24 changed files with 1647 additions and 961 deletions
+102 -14
View File
@@ -34,10 +34,10 @@ def flatten(iterable):
@dataclass()
class LexerNode(Node):
start: int
end: int
tokens: list = None
source: str = None
start: int # starting index in the tokens list
end: int # ending index in the tokens list
tokens: list = None # tokens
source: str = None # string representation of what was parsed
def __post_init__(self):
if self.source is None:
@@ -64,7 +64,15 @@ class UnrecognizedTokensNode(LexerNode):
def fix_source(self):
    # Populate self.source lazily: rebuild the human-readable text of this
    # node from its accumulated tokens once the run is complete.
    self.source = BaseParser.get_text_from_tokens(self.tokens)
def not_whitespace(self):
    """Return True unless this node holds exactly one whitespace/newline token."""
    if len(self.tokens) != 1:
        # More than one token (or none): treat as meaningful content.
        return True
    return self.tokens[0].type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE)
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) != 3:
return False
return self.start == other[0] and self.end == other[1] and self.source == other[2]
if not isinstance(other, UnrecognizedTokensNode):
return False
@@ -93,9 +101,9 @@ class ConceptNode(LexerNode):
def __eq__(self, other):
if isinstance(other, tuple):
if len(other) == 2:
return self.concept == other[0] and self.source == other[1]
return self.concept.key == other[0] and self.source == other[1]
else:
return self.concept == other[0] and \
return self.concept.key == other[0] and \
self.start == other[1] and \
self.end == other[2] and \
self.source == other[3]
@@ -567,7 +575,7 @@ class ConceptLexerParser(BaseParser):
self.token = None
self.pos = -1
self.next_token()
self.next_token(False)
return True
def get_token(self) -> Token:
@@ -762,8 +770,9 @@ class ConceptLexerParser(BaseParser):
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
if node is not None and node.end != -1:
updated_concept = self.finalize_concept(context.sheerka, concept, node)
concept_node = ConceptNode(
concept,
updated_concept,
node.start,
node.end,
self.tokens[node.start: node.end + 1],
@@ -777,27 +786,30 @@ class ConceptLexerParser(BaseParser):
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
if not self.next_token(False):
break
else: # some concepts are recognized
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
unrecognized_tokens = None
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# loop
self.seek(res[0].end)
if not self.next_token():
if not self.next_token(False):
break
# Fix the source for unrecognized tokens
if unrecognized_tokens:
if unrecognized_tokens and unrecognized_tokens.not_whitespace():
unrecognized_tokens.fix_source()
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
# else
# returns as many ReturnValue than choices found
@@ -821,6 +833,82 @@ class ConceptLexerParser(BaseParser):
self.log_multiple_results(context, text, ret)
return ret
def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
    """
    Updates the properties of the concept.
    Goes in recursion if the property is a concept.

    :param sheerka: concept store/factory; must provide new(key)
    :param template: concept template whose key (and optional id) names the new concept
    :param underlying: parse-tree node (TerminalNode or NonTerminalNode) the concept
        was matched from
    :param init_empty_body: when True and the freshly created concept has no body,
        fill concept.metadata.body from the underlying node's value
    :return: the newly created, populated concept
    """
    # this cache is to make sure that we return the same concept for the same ConceptMatch
    # (keyed by object identity of the matched node)
    _underlying_value_cache = {}

    def _add_prop(_concept, prop_name, value):
        """
        Adds a new entry,
        makes a list if the property already exists
        """
        if prop_name not in _concept.props or _concept.props[prop_name].value is None:
            # new entry
            _concept.set_prop(prop_name, value)
        else:
            # make a list if there was a value
            previous_value = _concept.props[prop_name].value
            if isinstance(previous_value, list):
                previous_value.append(value)
            else:
                new_value = [previous_value, value]
                _concept.set_prop(prop_name, new_value)

    def _look_for_concept_match(_underlying):
        # Walk down single-child chains looking for a node whose parsing
        # expression is a ConceptMatch; stop (None) at any branching point.
        if isinstance(_underlying.parsing_expression, ConceptMatch):
            return _underlying
        if not isinstance(_underlying, NonTerminalNode):
            return None
        if len(_underlying.children) != 1:
            return None
        return _look_for_concept_match(_underlying.children[0])

    def _get_underlying_value(_underlying):
        # A matched sub-concept is finalized recursively (and cached);
        # otherwise the node's raw source text is the value.
        concept_match_node = _look_for_concept_match(_underlying)
        if concept_match_node:
            if id(concept_match_node) in _underlying_value_cache:
                result = _underlying_value_cache[id(concept_match_node)]
            else:
                ref_tpl = concept_match_node.parsing_expression.concept
                result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
                _underlying_value_cache[id(concept_match_node)] = result
        else:
            result = _underlying.source
        return result

    def _process_rule_name(_concept, _underlying):
        # Depth-first: any node tagged with a rule_name contributes a property.
        if _underlying.parsing_expression.rule_name:
            value = _get_underlying_value(_underlying)
            _add_prop(_concept, _underlying.parsing_expression.rule_name, value)
        if isinstance(_underlying, NonTerminalNode):
            for child in _underlying.children:
                _process_rule_name(_concept, child)

    # Templates with an id are keyed by (key, id); otherwise by key alone.
    key = (template.key, template.id) if template.id else template.key
    concept = sheerka.new(key)
    if init_empty_body and concept.body is None:
        value = _get_underlying_value(underlying)
        concept.metadata.body = value
        concept.metadata.is_evaluated = True
        # NOTE(review): this rule_name handling is assumed to be inside the
        # empty-body branch, since 'value' is only bound here — confirm.
        if underlying.parsing_expression.rule_name:
            _add_prop(concept, underlying.parsing_expression.rule_name, value)
    if isinstance(underlying, NonTerminalNode):
        for node in underlying.children:
            _process_rule_name(concept, node)
    return concept
@staticmethod
def get_bests(results):
"""
+2 -1
View File
@@ -92,7 +92,8 @@ class DefConceptNode(DefaultParserNode):
if isinstance(prop_value, ReturnValueConcept) and isinstance(prop_value.body,
ParserResultConcept) and hasattr(
prop_value.body.body, "ast_"):
asts[part_key] = prop_value.body.body.ast_
asts[part_key] = prop_value
#asts[part_key] = prop_value.body.body.ast_
return asts
+2 -1
View File
@@ -46,7 +46,8 @@ class ExactConceptParser(BaseParser):
if sheerka.isinstance(result, BuiltinConcepts.UNKNOWN_CONCEPT):
continue
concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
# concepts = result.body if sheerka.isinstance(result, BuiltinConcepts.ENUMERATION) else [result]
concepts = result if isinstance(result, list) else [result]
for concept in concepts:
context.log(self.verbose_log, f"Recognized concept {concept}.", self.name)
+96
View File
@@ -0,0 +1,96 @@
from core.builtin_concepts import BuiltinConcepts
from core.tokenizer import TokenKind
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import ConceptLexerParser, UnrecognizedTokensNode, ConceptNode
import core.utils
concept_lexer_parser = ConceptLexerParser()
class MultipleConceptsParser(BaseParser):
    """
    Parser that will take the result of ConceptLexerParser and
    try to resolve the unrecognized tokens token by token.
    It is a success when it returns a list of ConceptNode objects exclusively.
    """

    def __init__(self, **kwargs):
        # Fixed name/priority; **kwargs accepted for interface parity but unused.
        BaseParser.__init__(self, "MultipleConcepts", 45)

    def parse(self, context, text):
        """
        Re-scan the UnrecognizedTokensNode runs of a ConceptLexerParser result,
        promoting every identifier token that resolves to a known concept into
        its own ConceptNode.

        :param context: parsing context; provides sheerka and new_concept()
        :param text: a PARSER_RESULT concept; ignored (None) unless produced by
            the module-level concept_lexer_parser instance
        :return: a single ReturnValue, or a list of them when ambiguous matches
            produced several alternative node sequences, or None when text
            is not applicable
        """
        sheerka = context.sheerka
        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
            return None
        # Only handle results produced by the shared ConceptLexerParser instance.
        if not text.parser == concept_lexer_parser:
            return None
        sheerka = context.sheerka  # NOTE(review): duplicate of the assignment above
        nodes = text.value
        # nodes_found is a list of alternative node sequences; core.utils.product
        # crosses every existing alternative with the candidates for the current span.
        nodes_found = [[]]
        source = ""
        # Stays True only while every non-whitespace token resolves to a concept.
        concepts_only = True
        for node in nodes:
            if isinstance(node, UnrecognizedTokensNode):
                unrecognized_tokens = None  # current run of still-unrecognized tokens
                for i, token in enumerate(node.tokens):
                    index = node.start + i  # absolute token index in the original stream
                    if token.type == TokenKind.IDENTIFIER:
                        # it may be a concept
                        concept = context.new_concept(token.value)
                        if hasattr(concept, "__iter__") or not sheerka.is_unknown(concept):
                            # finish processing unrecognized_tokens
                            if unrecognized_tokens:
                                unrecognized_tokens.fix_source()
                                source += unrecognized_tokens.source
                                if unrecognized_tokens.not_whitespace():
                                    nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
                                unrecognized_tokens = None
                            source += token.value
                            # new_concept may return several candidates (iterable):
                            # one single-token ConceptNode per candidate.
                            concepts = concept if hasattr(concept, "__iter__") else [concept]
                            concepts_nodes = [ConceptNode(c, index, index, [token], token.value) for c in concepts]
                            nodes_found = core.utils.product(nodes_found, concepts_nodes)
                            continue
                        # NOTE(review): an IDENTIFIER that stays unknown falls
                        # through below without touching concepts_only — confirm
                        # that is intended.
                    else:
                        # it cannot be a concept
                        concepts_only &= token.type == TokenKind.WHITESPACE or token.type == TokenKind.NEWLINE
                    # Accumulate the token into the current unrecognized run.
                    if unrecognized_tokens:
                        unrecognized_tokens.add_token(token, index)
                    else:
                        unrecognized_tokens = UnrecognizedTokensNode(index, index, [token])
                # Flush a trailing unrecognized run for this node.
                if unrecognized_tokens:
                    unrecognized_tokens.fix_source()
                    source += unrecognized_tokens.source
                    if unrecognized_tokens.not_whitespace():
                        nodes_found = core.utils.product(nodes_found, [unrecognized_tokens])
            else:
                # Already-recognized node: keep it as-is in every alternative.
                nodes_found = core.utils.product(nodes_found, [node])
                source += node.source
        # Wrap each alternative sequence in its own PARSER_RESULT / ReturnValue.
        ret = []
        for choice in nodes_found:
            ret.append(
                sheerka.ret(
                    self.name,
                    concepts_only,
                    sheerka.new(
                        BuiltinConcepts.PARSER_RESULT,
                        parser=self,
                        source=source,
                        body=choice,
                        try_parsed=None))
            )
        if len(ret) == 1:
            self.log_result(context, source, ret[0])
            return ret[0]
        else:
            self.log_multiple_results(context, source, ret)
            return ret
+7 -5
View File
@@ -1,7 +1,7 @@
from core.builtin_concepts import BuiltinConcepts
from core.tokenizer import Tokenizer, LexerError, TokenKind
from parsers.BaseParser import BaseParser, Node, ErrorNode
from dataclasses import dataclass
from dataclasses import dataclass, field
import ast
import logging
@@ -17,10 +17,12 @@ class PythonErrorNode(ErrorNode):
# self.log.debug("-> PythonErrorNode: " + str(self.exception))
@dataclass()
class PythonNode(Node):
source: str
ast_: ast.AST
def __init__(self, source, ast_, concepts=None):
self.source = source
self.ast_ = ast_
self.concepts = concepts or {}
# def __repr__(self):
# return "PythonNode(source='" + self.source + "', ast=" + self.get_dump(self.ast_) + ")"
@@ -67,7 +69,7 @@ class PythonParser(BaseParser):
tree = None
python_switcher = {
TokenKind.CONCEPT: lambda t: f"__C__{t.value}__C__"
TokenKind.CONCEPT: lambda t: f"__C__USE_CONCEPT__{t.value}__C__"
}
try:
+116
View File
@@ -0,0 +1,116 @@
from core.builtin_concepts import BuiltinConcepts
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import UnrecognizedTokensNode, ConceptNode
from parsers.PythonParser import PythonParser
class PythonWithConceptsParser(BaseParser):
    """
    Parser that turns a mixed list of ConceptNode / UnrecognizedTokensNode
    into one Python parse: every recognized concept is replaced by a
    synthetic "__C__...__C__" identifier, the rewritten text is handed to
    PythonParser, and the identifier -> concept mapping is attached to the
    resulting node.
    """

    def __init__(self, **kwargs):
        super().__init__("PythonWithConcepts", 20)
        # FIX: these were initialized to None, which made concept_identifier()
        # raise TypeError on `id(concept) in self.identifiers`. Start them as
        # empty dicts so the cache lookups work.
        self.identifiers = {}
        self.identifiers_key = {}

    @staticmethod
    def sanitize(identifier):
        """Replace every non-alphanumeric character with '0' so the name can
        be embedded in a Python identifier."""
        return "".join(c if c.isalnum() else "0" for c in identifier)

    def parse(self, context, text):
        """
        :param context: parsing context; provides sheerka and push()
        :param text: a PARSER_RESULT whose body is a list starting with a
            ConceptNode or UnrecognizedTokensNode; anything else returns None
        :return: a ReturnValue wrapping a new PARSER_RESULT on success, a
            failing ReturnValue when the Python parse fails, or None when
            text is not applicable
        """
        sheerka = context.sheerka
        if not sheerka.isinstance(text, BuiltinConcepts.PARSER_RESULT):
            return None
        nodes = text.body
        if not isinstance(nodes, list):
            return None
        if len(nodes) == 0:
            return None
        if not isinstance(nodes[0], (ConceptNode, UnrecognizedTokensNode)):
            return None
        source = ""      # original text, reassembled verbatim
        to_parse = ""    # text with concepts replaced by synthetic identifiers
        identifiers = {}
        identifiers_key = {}
        python_ids_mappings = {}

        def _get_identifier(c):
            """
            Internal function because I don't want identifiers, identifiers_key
            and python_ids_mappings to be instance variables.
            I would like to keep this parser as stateless as possible.
            :param c: concept to name
            :return: a unique "__C__...__C__" identifier for this concept
            """
            if id(c) in identifiers:
                return identifiers[id(c)]
            identifier = "__C__" + self.sanitize(c.key or c.name)
            if c.id:
                identifier += "__" + c.id
            # Disambiguate distinct concepts that share the same base name.
            if identifier in identifiers_key:
                identifiers_key[identifier] += 1
                identifier += f"_{identifiers_key[identifier]}"
            else:
                identifiers_key[identifier] = 0
            identifier += "__C__"
            identifiers[id(c)] = identifier
            return identifier

        for node in nodes:
            if isinstance(node, ConceptNode):
                source += node.source
                if to_parse:
                    to_parse += " "  # keep tokens separated in the rewritten text
                concept = node.concept
                python_id = _get_identifier(concept)
                to_parse += python_id
                python_ids_mappings[python_id] = concept
            else:
                # Unrecognized text goes through to the Python parser untouched.
                source += node.source
                to_parse += node.source
        with context.push(self, "Trying Python for '" + to_parse + "'") as sub_context:
            python_parser = PythonParser()
            result = python_parser.parse(sub_context, to_parse)
            if result.status:
                # Restore the original source on the parsed node and attach
                # the identifier -> concept mapping.
                python_node = result.body.body
                python_node.source = source
                python_node.concepts = python_ids_mappings
                return sheerka.ret(
                    self.name,
                    True,
                    sheerka.new(
                        BuiltinConcepts.PARSER_RESULT,
                        parser=self,
                        source=source,
                        body=result.body.body,
                        try_parsed=None))
            else:
                return sheerka.ret(
                    self.name,
                    False,
                    result.body)

    def concept_identifier(self, concept):
        """Instance-level identifier cache (no sanitizing / per-key dedup —
        NOTE(review): inconsistent with _get_identifier in parse(); confirm
        whether this variant is still used)."""
        if id(concept) in self.identifiers:
            return self.identifiers[id(concept)]
        identifier = "__C__" + (concept.key or concept.name)
        if concept.id:
            identifier += "__" + concept.id
        identifier += "__C__"
        # FIX: cache the computed identifier so the lookup above can ever hit.
        self.identifiers[id(concept)] = identifier
        return identifier