Refactored parsers to introduce priority

This commit is contained in:
2020-01-08 19:45:54 +01:00
parent b4346b5af0
commit a62c1f0f13
13 changed files with 531 additions and 42 deletions
+4 -1
View File
@@ -227,7 +227,10 @@ class ParserResultConcept(Concept):
self.set_prop("try_parsed", try_parsed) # in case of error, what was found before the error
def __repr__(self):
return f"ParserResult({self.body})"
text = f"ParserResult(parser={self.props['parser'].value}"
source = self.props['source'].value
text += f", source='{source}')" if source else f", body='{self.body}')"
return text
def __eq__(self, other):
if not isinstance(other, ParserResultConcept):
+57 -17
View File
@@ -211,42 +211,82 @@ class Sheerka(Concept):
def _call_parsers(self, execution_context, return_values, logger=None):
result = []
# return_values must be a list
if not isinstance(return_values, list):
return_values = [return_values]
for return_value in return_values:
# make sure we only parse user input
if not return_value.status or not self.isinstance(return_value.body, BuiltinConcepts.USER_INPUT):
result.append(return_value)
continue
# first make the distinguish between what is for the parsers and what is not
result = []
to_process = []
for r in return_values:
if not r.status or not self.isinstance(r.body, BuiltinConcepts.USER_INPUT):
result.append(r)
else:
to_process.append(r)
to_parse = return_value.body.body # get the underlying text
if not to_process:
return result
if self.log.isEnabledFor(logging.DEBUG):
debug_text = "'" + to_parse + "'" if isinstance(to_parse, str) \
else "'" + BaseParser.get_text_from_tokens(to_parse) + "' as tokens"
execution_context.log(logger or self.log, f"Parsing {debug_text}")
# keep track of the originals user inputs, as they need to be removed at the end
user_inputs = to_process[:]
for parser in self.parsers.values():
p = parser(sheerka=self)
# group the parsers by priorities
instantiated_parsers = [parser(sheerka=self) for parser in self.parsers.values()]
grouped_parsers = {}
for parser in [p for p in instantiated_parsers if p.enabled]:
if logger:
p.log = logger
parser.log = logger
grouped_parsers.setdefault(parser.priority, []).append(parser)
sorted_priorities = sorted(grouped_parsers.keys(), reverse=True)
with execution_context.push(desc=f"Parsing using {p.name}") as sub_context:
res = p.parse(sub_context, to_parse)
stop_processing = False
for priority in sorted_priorities:
inputs_for_this_group = to_process[:]
for parser in grouped_parsers[priority]:
return_value_success_found = False
for return_value in inputs_for_this_group:
to_parse = return_value.body.body \
if self.isinstance(return_value.body, BuiltinConcepts.USER_INPUT) \
else return_value.body
# if self.log.isEnabledFor(logging.DEBUG):
# debug_text = "'" + to_parse + "'" if isinstance(to_parse, str) \
# else "'" + BaseParser.get_text_from_tokens(to_parse) + "' as tokens"
# execution_context.log(logger or self.log, f"Parsing {debug_text}")
with execution_context.push(desc=f"Parsing using {parser.name}") as sub_context:
res = parser.parse(sub_context, to_parse)
if res is not None:
if hasattr(res, "__iter__"):
for r in res:
if r is None:
continue
r.parents = [return_value]
result.append(r)
if self.isinstance(r.body, BuiltinConcepts.PARSER_RESULT):
to_process.append(r)
if r.status:
return_value_success_found = True
else:
res.parents = [return_value]
result.append(res)
if self.isinstance(res.body, BuiltinConcepts.PARSER_RESULT):
to_process.append(res)
if res.status:
return_value_success_found = True
sub_context.add_values(return_values=res)
if return_value_success_found:
stop_processing = True
break # Stop the other return_values (but not the other parsers with the same priority)
if stop_processing:
break # Do not try the other priorities if a match is found
result = core.utils.remove_list_from_list(result, user_inputs)
return result
def _call_evaluators(self, execution_context, return_values, process_step, evaluation_context=None, logger=None):
-1
View File
@@ -228,7 +228,6 @@ def escape_char(text, to_escape):
return res
def pp(items):
if not hasattr(items, "__iter__"):
return str(items)
+63
View File
@@ -828,3 +828,66 @@ It will means that the concept is required.
If the name is required, you can use :code:`"'name'"` or :code:`'"name"'`.
It's already working. There is nothing to do for this one.
2020-07-01
**********
How do we perform the parsing ?
"""""""""""""""""""""""""""""""
The basic flow of an execution is :
* Parse the data -> Nodes
* Evaluate the nodes -> Concepts
* Display the results
The theory says that there can be as many parsers as necessary. Each one of them will
be specialized to recognize a specific pattern. They will then send their information to
the evaluators.
As of now, I have implemented the following parsers:
* EmptyStringParser
To recognize empty strings and react accordingly
* PythonParser
To recognize Python source code
* ExactConceptParser
To recognize simple form of concepts
* DefaultParser (the name is not accurate)
To recognize builtin syntax (like 'def concept' or 'isa')
* ConceptLexerParser
To recognize concept defined with BNF language
All these parsers are executed in a row (the order is not very important).
The first observation is that there is a lot of CPU waste. Most of the time (at least as of
now), when there is a match with one parser, the others fail. So there is no need to
execute them.
The second point is that there is no way for a parser to use the result of another.
My idea is to have parsers that can be chained, each one of them will do the little thing
it is capable of before leaving the rest to some more powerful parser.
I don't want to bring out the big guns for every single user input. And I certainly
don't want a massive and overly complex parser that will be capable (in theory) of everything.
Why ?
| First of all, monolithic code is bad :-)
| Then I have to keep in mind that the process will be somehow distributed
| And last, but not least, I don't have (and I certainly will never have) complete coverage
of all possible parsing situations. So what I need is a plug-and-play system where I can add,
remove and chain parsers, depending on the input.
So,
* I'll give all parsers a priority
* The parsers with the highest priority will be executed first
* The parsers with the same priority will be executed at the same time (The order does matter)
* If, for a given priority, there is a match, the parsers with a lower priority won't be executed
* A parser has access to the output of the parsers of higher priorities (which were executed before it)
+7 -1
View File
@@ -38,12 +38,15 @@ class UnexpectedTokenErrorNode(ErrorNode):
class BaseParser:
PREFIX = "parsers."
def __init__(self, name):
def __init__(self, name, priority: int, enabled=True):
self.log = get_logger("parsers." + self.__class__.__name__)
self.init_log = get_logger("init." + self.PREFIX + self.__class__.__name__)
self.verbose_log = get_logger("verbose." + self.PREFIX + self.__class__.__name__)
self.name = self.PREFIX + name
self.priority = priority
self.enabled = enabled
self.has_error = False
self.error_sink = []
@@ -55,6 +58,9 @@ class BaseParser:
def __hash__(self):
return hash(self.name)
def __repr__(self):
return self.name
def parse(self, context, text):
pass
+6 -5
View File
@@ -13,7 +13,7 @@ class UnexpectedEndOfFileError(ErrorNode):
pass
class BnfParser:
class BnfParser(BaseParser):
"""
Parser used to transform litteral into ParsingExpression
example :
@@ -27,10 +27,11 @@ class BnfParser:
"""
def __init__(self):
self.has_error = False
self.error_sink = []
self.name = BaseParser.PREFIX + "Bnf"
def __init__(self, **kwargs):
super().__init__("Bnf", 50, False)
# self.has_error = False
# self.error_sink = []
# self.name = BaseParser.PREFIX + "Bnf"
self.lexer_iter = None
self._current = None
+1 -1
View File
@@ -524,7 +524,7 @@ class ConceptMatch(Match):
class ConceptLexerParser(BaseParser):
def __init__(self, **kwargs):
super().__init__("ConceptLexer")
super().__init__("ConceptLexer", 50)
if 'grammars' in kwargs:
self.concepts_grammars = kwargs.get("grammars")
elif 'sheerka' in kwargs:
+1 -1
View File
@@ -108,7 +108,7 @@ class DefaultParser(BaseParser):
"""
def __init__(self, **kwargs):
BaseParser.__init__(self, "Default")
BaseParser.__init__(self, "Default", 50)
self.lexer_iter = None
self._current = None
self.context: ExecutionContext = None
+1 -1
View File
@@ -8,7 +8,7 @@ class EmptyStringParser(BaseParser):
"""
def __init__(self, **kwargs):
BaseParser.__init__(self, "EmptyString")
BaseParser.__init__(self, "EmptyString", 90)
def parse(self, context, text):
sheerka = context.sheerka
+1 -1
View File
@@ -14,7 +14,7 @@ class ExactConceptParser(BaseParser):
MAX_WORDS_SIZE = 10
def __init__(self, **kwargs):
BaseParser.__init__(self, "ExactConcept")
BaseParser.__init__(self, "ExactConcept", 80)
def parse(self, context, text):
"""
+1 -1
View File
@@ -59,7 +59,7 @@ class PythonParser(BaseParser):
def __init__(self, **kwargs):
BaseParser.__init__(self, "Python")
BaseParser.__init__(self, "Python", 50)
self.source = kwargs.get("source", "<undef>")
def parse(self, context, text):
-3
View File
@@ -6,9 +6,6 @@ import shutil
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept, UserInputConcept, ConceptAlreadyInSet
from core.concept import Concept, PROPERTIES_TO_SERIALIZE, Property
from core.sheerka import Sheerka, ExecutionContext
from evaluators.MutipleSameSuccessEvaluator import MultipleSameSuccessEvaluator
from parsers.ConceptLexerParser import Sequence, ZeroOrMore, StrMatch, OrderedChoice, Optional, ConceptMatch, \
ConceptLexerParser
from sdp.sheerkaDataProvider import SheerkaDataProvider, Event
tests_root = path.abspath("../build/tests")
+380
View File
@@ -0,0 +1,380 @@
from core.builtin_concepts import ReturnValueConcept, UserInputConcept, BuiltinConcepts, ParserResultConcept
from core.sheerka import Sheerka, ExecutionContext
from parsers.BaseParser import BaseParser
from sdp.sheerkaDataProvider import Event
def get_sheerka():
    """Return a new Sheerka instance after calling ``initialize("mem://")`` on it."""
    instance = Sheerka()
    instance.initialize("mem://")
    return instance
def get_context(sheerka):
    """Return ``ExecutionContext("test", Event(), sheerka)`` built with a fresh Event."""
    event = Event()
    return ExecutionContext("test", event, sheerka)
def get_ret_val(text, who="who"):
    """Wrap *text* in a UserInputConcept carried by a ReturnValueConcept built with True."""
    user_input = UserInputConcept(text, "user_name")
    return ReturnValueConcept(who, True, user_input)
class BaseTestParser(BaseParser):
    """
    Configurable fake parser used by the tests to observe parser scheduling.

    Every call to :meth:`parse` appends a one-line description of the call
    to ``debug_out``.
    """

    # Call trace. This is a class attribute, so it is shared by all the
    # subclasses; the tests reset it by rebinding ``BaseTestParser.debug_out``.
    debug_out = []

    def __init__(self, name, priority, status=None, parser_result=None):
        """
        :param name: parser name, forwarded to BaseParser
        :param priority: parser priority, forwarded to BaseParser
        :param status: status given to the ReturnValueConcept built by parse()
        :param parser_result: when truthy, used as the body of the return value
            in place of the generated ParserResultConcept
        """
        super().__init__(name, priority)
        self.status = status
        self.parser_result = parser_result

    @staticmethod
    def _get_name(name):
        """Return *name* without its leading ``"parsers."`` (8 characters), if present."""
        if name.startswith("parsers."):
            return name[8:]
        return name

    @staticmethod
    def _get_source(text_):
        """Return *text_* itself when it is a str, otherwise its ``body``."""
        if isinstance(text_, str):
            return text_
        return text_.body

    def _out(self, name, priority, status, source):
        """Append one formatted trace line to ``debug_out``."""
        self.debug_out.append(
            f"name={name}, priority={priority}, status={status}, source={source}"
        )

    def parse(self, context, text):
        """Trace the call, then return one ReturnValueConcept carrying ``self.status``."""
        short_name = self._get_name(self.name)
        self._out(short_name, self.priority, self.status, self._get_source(text))

        source = text if isinstance(text, str) else text.body
        default_result = ParserResultConcept(parser=self, value=short_name + ":" + source)
        return ReturnValueConcept(self, self.status, self.parser_result or default_result)
class Enabled90FalseParser(BaseTestParser):
    """Test parser with priority 90 and status False."""

    def __init__(self, **kwargs):
        # kwargs are accepted so the class can be instantiated with keyword
        # arguments; they are not used
        super().__init__(name="Enabled90False", priority=90, status=False)
class Enabled80FalseParser(BaseTestParser):
    """Test parser with priority 80 and status False."""

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled80False", priority=80, status=False)
class Enabled80MultipleFalseParser(BaseTestParser):
    """Test parser with priority 80 that returns two results, both with status False."""

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled80MultipleFalse", priority=80, status=False)

    def parse(self, context, text):
        """Trace the call and return a list of two return values suffixed ``_1`` and ``_2``."""
        short_name = self._get_name(self.name)
        self._out(short_name, self.priority, self.status, self._get_source(text))

        values = [
            short_name + ":" + (text if isinstance(text, str) else text.body) + suffix
            for suffix in ("_1", "_2")
        ]
        return [
            ReturnValueConcept(self, self.status, ParserResultConcept(parser=self, value=value))
            for value in values
        ]
class Enabled80MultipleTrueParser(BaseTestParser):
    """Test parser with priority 80 that returns one True result and one False result."""

    def __init__(self, **kwargs):
        # no status is given, so self.status stays None (visible in the trace)
        super().__init__(name="Enabled80MultipleTrue", priority=80)

    def parse(self, context, text):
        """Trace the call and return [True result ``_1``, False result ``_2``]."""
        short_name = self._get_name(self.name)
        self._out(short_name, self.priority, self.status, self._get_source(text))

        values = [
            short_name + ":" + (text if isinstance(text, str) else text.body) + suffix
            for suffix in ("_1", "_2")
        ]
        return [
            ReturnValueConcept(self, status, ParserResultConcept(parser=self, value=value))
            for status, value in zip((True, False), values)
        ]
class Enabled70FalseParser(BaseTestParser):
    """Test parser with priority 70 and status False whose body is a plain string."""

    def __init__(self, **kwargs):
        # the body of the return value will be this string instead of a
        # ParserResultConcept
        super().__init__(
            name="Enabled70False",
            priority=70,
            status=False,
            parser_result="Not a ParserResult",
        )
class Enabled50TrueParser(BaseTestParser):
    """
    Test parser with priority 50 that only succeeds on one specific input.

    The status is true only when the input is a ParserResultConcept whose
    source is "Enabled80False:Enabled90False:hello world".
    """

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled50True", priority=50, status=True)

    def parse(self, context, text):
        """Trace the call with the computed status and return a single return value."""
        source = self._get_source(text)
        is_parser_result = isinstance(text, ParserResultConcept)
        status = is_parser_result and source == "Enabled80False:Enabled90False:hello world"

        short_name = self._get_name(self.name)
        self._out(short_name, self.priority, status, source)

        value = self._get_name(self.name) + ":" + (text if isinstance(text, str) else text.body)
        body = ParserResultConcept(parser=self, value=value)
        return ReturnValueConcept(self, status, body)
class Enabled50bisTrueParser(BaseTestParser):
    """Test parser with priority 50 and status True."""

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled50BisTrue", priority=50, status=True)
class Enabled50FalseParser(BaseTestParser):
    """Test parser with priority 50 and status False."""

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled50False", priority=50, status=False)
class Enabled10TrueParser(BaseTestParser):
    """Test parser with priority 10 and status True."""

    def __init__(self, **kwargs):
        # kwargs are accepted but not used
        super().__init__(name="Enabled10True", priority=10, status=True)
class DisabledParser(BaseTestParser):
    """Test parser with priority 90 and status True whose ``enabled`` flag is False."""

    def __init__(self, **kwargs):
        super().__init__(name="Disabled", priority=90, status=True)
        # switch the parser off after the base initialization
        self.enabled = False
class NoneParser(BaseTestParser):
    """Test parser with priority 90 whose parse() returns None."""

    def __init__(self, **kwargs):
        super().__init__(name="None", priority=90, status=True, parser_result=None)

    def parse(self, context, text):
        """Trace the call, then return None."""
        short_name = self._get_name(self.name)
        source = self._get_source(text)
        self._out(short_name, self.priority, self.status, source)
        return None
class ListOfNoneParser(BaseTestParser):
    """Test parser with priority 90 whose parse() returns a list of two None."""

    def __init__(self, **kwargs):
        super().__init__(name="ListOfNone", priority=90, status=True, parser_result=None)

    def parse(self, context, text):
        """Trace the call, then return ``[None, None]``."""
        short_name = self._get_name(self.name)
        source = self._get_source(text)
        self._out(short_name, self.priority, self.status, source)
        return [None, None]
def test_disabled_parsers_are_not_executed():
    """Only the enabled parser must appear in the call trace."""
    engine = get_sheerka()
    engine.parsers = {"Enabled": Enabled10TrueParser, "Disabled": DisabledParser}
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = ['name=Enabled10True, priority=10, status=True, source=hello world']
    assert BaseTestParser.debug_out == expected_trace
def test_parser_are_executed_by_priority():
    """
    Parsers are called from the highest priority to the lowest, and each
    one also receives the parser results produced before it.
    """
    engine = get_sheerka()
    engine.parsers = {
        "Enabled90False": Enabled90FalseParser,
        "Enabled80False": Enabled80FalseParser,
        "Enabled50True": Enabled50TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled90False, priority=90, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled80False:hello world',
        'name=Enabled50True, priority=50, status=True, source=Enabled80False:Enabled90False:hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_parsing_stop_at_the_first_success():
    """Once priority 50 succeeds, the priority 10 parser must not be called."""
    engine = get_sheerka()
    engine.parsers = {
        "Enabled80False": Enabled80FalseParser,
        "Enabled50bisTrue": Enabled50bisTrueParser,
        "Enabled10True": Enabled10TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled80False, priority=80, status=False, source=hello world',
        'name=Enabled50BisTrue, priority=50, status=True, source=hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_parsing_stop_at_the_first_success_2():
    """
    Same scenario as test_parsing_stop_at_the_first_success, except that
    Enabled50True only succeeds on its last input. The priority 10 parser
    must still not be called.
    """
    engine = get_sheerka()
    engine.parsers = {
        "Enabled90False": Enabled90FalseParser,
        "Enabled80False": Enabled80FalseParser,
        "Enabled50True": Enabled50TrueParser,
        "Enabled10True": Enabled10TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled90False, priority=90, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled80False:hello world',
        'name=Enabled50True, priority=50, status=True, source=Enabled80False:Enabled90False:hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_all_parsers_of_a_given_priority_are_executed():
    """
    Every parser with priority 50 is called, even after one of them has
    succeeded; the priority 10 parser is not.
    """
    engine = get_sheerka()
    engine.parsers = {
        "Enabled90False": Enabled90FalseParser,
        "Enabled80False": Enabled80FalseParser,
        "Enabled50True": Enabled50TrueParser,
        "Enabled50bisTrue": Enabled50bisTrueParser,
        "Enabled50False": Enabled50FalseParser,
        "Enabled10True": Enabled10TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled90False, priority=90, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=hello world',
        'name=Enabled80False, priority=80, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled90False:hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled80False:hello world',
        'name=Enabled50True, priority=50, status=True, source=Enabled80False:Enabled90False:hello world',
        'name=Enabled50BisTrue, priority=50, status=True, source=hello world',
        'name=Enabled50False, priority=50, status=False, source=hello world',
        'name=Enabled50False, priority=50, status=False, source=Enabled90False:hello world',
        'name=Enabled50False, priority=50, status=False, source=Enabled80False:hello world',
        'name=Enabled50False, priority=50, status=False, source=Enabled80False:Enabled90False:hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_a_parser_has_access_to_the_output_of_its_predecessors():
    """The returned values show that lower priorities parsed the earlier parser results."""
    engine = get_sheerka()
    engine.parsers = {
        "Enabled90False": Enabled90FalseParser,
        "Enabled80False": Enabled80FalseParser,
        "Enabled50True": Enabled50TrueParser,
    }
    inputs = [get_ret_val("hello world")]

    returned = engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    # (parser name without its first 8 characters, status, parsed value)
    summary = []
    for ret_val in returned:
        summary.append((str(ret_val.who)[8:], ret_val.status, ret_val.body.body))

    assert summary == [
        ('Enabled90False', False, 'Enabled90False:hello world'),
        ('Enabled80False', False, 'Enabled80False:hello world'),
        ('Enabled80False', False, 'Enabled80False:Enabled90False:hello world'),
        ('Enabled50True', False, 'Enabled50True:hello world'),
        ('Enabled50True', False, 'Enabled50True:Enabled90False:hello world'),
        ('Enabled50True', False, 'Enabled50True:Enabled80False:hello world'),
        ('Enabled50True', True, 'Enabled50True:Enabled80False:Enabled90False:hello world'),
    ]
def test_none_return_values_are_discarded():
    """Parsers returning None or a list of None are called but contribute no result."""
    engine = get_sheerka()
    engine.parsers = {"NoneParser": NoneParser, "ListOfNone": ListOfNoneParser}
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    returned = engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    assert returned == []
    expected_trace = [
        'name=None, priority=90, status=True, source=hello world',
        'name=ListOfNone, priority=90, status=True, source=hello world'
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_following_priorities_can_only_see_parser_result_return_values():
    """
    A lower priority parser receives the output of a higher priority parser
    only when that output is a ParserResultConcept. Enabled70False returns a
    plain string as body, so Enabled50True never receives it.
    """
    engine = get_sheerka()
    engine.parsers = {
        "Enabled80False": Enabled80FalseParser,
        "Enabled70False": Enabled70FalseParser,
        "Enabled50True": Enabled50TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled80False, priority=80, status=False, source=hello world',
        'name=Enabled70False, priority=70, status=False, source=hello world',
        'name=Enabled70False, priority=70, status=False, source=Enabled80False:hello world',
        'name=Enabled50True, priority=50, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled80False:hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace
def test_i_can_manage_parser_with_multiple_results():
    """Each result of a parser that returns a list is passed on to the next priority."""
    engine = get_sheerka()
    engine.parsers = {
        "Enabled80MultipleFalse": Enabled80MultipleFalseParser,
        "Enabled50True": Enabled50TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    returned = engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled80MultipleFalse, priority=80, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=hello world',
        'name=Enabled50True, priority=50, status=False, source=Enabled80MultipleFalse:hello world_1',
        'name=Enabled50True, priority=50, status=False, source=Enabled80MultipleFalse:hello world_2',
    ]
    assert BaseTestParser.debug_out == expected_trace

    # (parser name without its first 8 characters, status, parsed value)
    summary = []
    for ret_val in returned:
        summary.append((str(ret_val.who)[8:], ret_val.status, ret_val.body.body))

    assert summary == [
        ('Enabled80MultipleFalse', False, 'Enabled80MultipleFalse:hello world_1'),
        ('Enabled80MultipleFalse', False, 'Enabled80MultipleFalse:hello world_2'),
        ('Enabled50True', False, 'Enabled50True:hello world'),
        ('Enabled50True', False, 'Enabled50True:Enabled80MultipleFalse:hello world_1'),
        ('Enabled50True', False, 'Enabled50True:Enabled80MultipleFalse:hello world_2'),
    ]
def test_i_can_manage_parser_with_multiple_results_and_a_sucess():
    """One successful result among several is enough to stop the lower priorities."""
    engine = get_sheerka()
    engine.parsers = {
        "Enabled80MultipleTrue": Enabled80MultipleTrueParser,
        "Enabled50True": Enabled50TrueParser,
    }
    inputs = [get_ret_val("hello world")]
    BaseTestParser.debug_out = []  # start from an empty call trace

    returned = engine.execute(get_context(engine), inputs, [BuiltinConcepts.PARSING])

    expected_trace = [
        'name=Enabled80MultipleTrue, priority=80, status=None, source=hello world',
    ]
    assert BaseTestParser.debug_out == expected_trace

    # (parser name without its first 8 characters, status, parsed value)
    summary = []
    for ret_val in returned:
        summary.append((str(ret_val.who)[8:], ret_val.status, ret_val.body.body))

    assert summary == [
        ('Enabled80MultipleTrue', True, 'Enabled80MultipleTrue:hello world_1'),
        ('Enabled80MultipleTrue', False, 'Enabled80MultipleTrue:hello world_2'),
    ]