diff --git a/docs/blog.rst b/docs/blog.rst index 7354c2e..c9cede7 100644 --- a/docs/blog.rst +++ b/docs/blog.rst @@ -19,7 +19,7 @@ For those you don't know this old cartoon, it's the Odyssey story from Homer, ported in the 31st century. Ulysses has a spacecraft with an AI named Shyrka I was a great fan of this cartoon when I was young. I thought that the idea of -bringing the ancient story of Ulysses in the future was a bright. +bringing the ancient story of Ulysses in the future was bright. Ever since then, Sheerka was my reference for any sophisticated computer. Unfortunately for me, at that time there was no wikipedia to tell the the correct spelling. @@ -654,3 +654,99 @@ For the two questions, I will first try the simple implementations and see there * the entry in sdp will not be all_number, but all_id_of_number. I will use the concept id instead of its name +2019-12-24 +********** + +Going back on BNF implementation. As it's Christmas eve today, I won't stay very long. + +So, the implementation lies in the class ConceptLexerParser, as it's a lexer not for token, but for concept. +The purpose of this class is to recognize a sequence of Concept. + +So if we define the following concepts + +:: + + def concept foo from bnf one two three + def concept bar from bnf four five + +when you input + +:: + + one two three four five + +the list of :code:`[foo, bar]` will be returned by the parser (as return values) + +How does it work? + +As explained in the code, my implementation is highly inspired by the Arpeggio project. To define your grammar, you +use **ParsingExpressions**. There are several types + +* some are used to recognize tokens: StrMatch, ConceptMatch +* others are used to tell how to recognize: Sequence, OrderedChoice, Optional, OneOrMore, ZeroOrMore... 
+ +Some examples: + +:: + + to recognize 'foo' -> StrMatch("foo") + to recognize 'foo bar' -> Sequence(StrMatch("foo"), StrMatch("bar")) + to recognize 'foo' or 'bar' -> OrderedChoice(StrMatch("foo"), StrMatch("bar")) + + and so on... + +So when a concept is defined using its bnf definition, I use the **BnfParser** to create the grammar, and then +I use the **ConceptLexerParser** to recognize the concepts + +The current implementation to recognize a concept is not very efficient. All the definitions are in a dictionary +and I go through the whole dictionary to see if some concepts are recognized. Once a concept is found, I loop again +on the whole dictionary to find the next concept. + +| -> I need a btree to order the concept +| -> I need a predictive algorithm to guess the next concept + +But it is for later. + +So once the parsing is effective, I return a **ConceptNode** object + +.. code-block:: python + + class ConceptNode(LexerNode): + """ + Returned by the ConceptLexerParser + It represents a recognized concept + """ + + def __init__(self, concept, start, end, tokens=None, source=None, underlying=None): + super().__init__(start, end, tokens, source) + self.concept = concept + self.underlying = underlying + + if self.source is None: + self.source = BaseParser.get_text_from_tokens(self.tokens) + + +concept + | Remember that all grammars are listed in a dictionary indexed by concept. + | So when a parsing expression is verified, it's easy to link it with the concept +start + position of the first token +end + position of the last token +tokens + list of tokens that are recognized +underlying + **NonTerminalNode** or **TerminalNode** that wraps the underlying **ParsingExpression** used to recognize the concept +source + | The source is deduced from the tokens + | But in the unit tests, they are directly given for speed and simplicity + +What is the difference between the **[Non]TerminalNode** and the **ParsingExpression**? 
+ +The ParsingExpression + defines how to recognize a concept + +The [Non]TerminalNode + represents what was found. So similarly to the ConceptNode, you will find the start, end and token attributes + +That's all for today ! \ No newline at end of file diff --git a/evaluators/ConceptNodeEvaluator.py b/evaluators/ConceptNodeEvaluator.py index 8c01138..b610f77 100644 --- a/evaluators/ConceptNodeEvaluator.py +++ b/evaluators/ConceptNodeEvaluator.py @@ -1,7 +1,7 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts from evaluators.BaseEvaluator import OneReturnValueEvaluator -from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch +from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch, UnrecognizedTokensNode class ConceptNodeEvaluator(OneReturnValueEvaluator): @@ -17,15 +17,22 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator): def matches(self, context, return_value): if not return_value.status: return False + if not isinstance(return_value.value, ParserResultConcept): return False - return (isinstance(return_value.value.value, ConceptNode) or + return ( + isinstance(return_value.value.value, ConceptNode) or + isinstance(return_value.value.value, UnrecognizedTokensNode) or + ( + hasattr(return_value.value.value, "__iter__") and + len(return_value.value.value) > 0 and ( - hasattr(return_value.value.value, "__iter__") and - len(return_value.value.value) > 0 and - isinstance(return_value.value.value[0], ConceptNode) - )) + isinstance(return_value.value.value[0], ConceptNode) or + isinstance(return_value.value.value[0], UnrecognizedTokensNode) + ) + ) + ) def eval(self, context, return_value): """ @@ -38,19 +45,23 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator): nodes = [nodes] concepts = [] + error_found = False for node in nodes: - concept = sheerka.new(node.concept.key) - concept = self.update_concept(sheerka, concept, node.underlying) - concepts.append(concept) + if isinstance(node, 
ConceptNode): + concept = sheerka.new(node.concept.key) + concept = self.update_concept(sheerka, concept, node.underlying) + concepts.append(concept) + else: + error_found = True if len(concepts) == 1: return sheerka.ret( self.name, - True, + not error_found, concepts[0], parents=[return_value]) - raise NotImplementedError("Not yet") + return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.NOT_FOR_ME), parents=[return_value]) def update_concept(self, sheerka, concept, underlying, init_empty_body=True): """ diff --git a/parsers/ConceptLexerParser.py b/parsers/ConceptLexerParser.py index 6e86c03..4f9a416 100644 --- a/parsers/ConceptLexerParser.py +++ b/parsers/ConceptLexerParser.py @@ -47,7 +47,33 @@ class LexerNode(Node): if not isinstance(other, LexerNode): return False - return self.start == other.start and self.end == other.end + return self.start == other.start and \ + self.end == other.end and \ + self.source == other.source and \ + self.tokens == other.tokens + + +class UnrecognizedTokensNode(LexerNode): + def __init__(self, start, end, tokens): + super().__init__(start, end, tokens) + + def add_token(self, token, pos): + self.tokens.append(token) + self.end = pos + + def fix_source(self): + self.source = BaseParser.get_text_from_tokens(self.tokens) + + def __eq__(self, other): + if not isinstance(other, UnrecognizedTokensNode): + return False + + return self.start == other.start and \ + self.end == other.end and \ + self.source == other.source + + def __repr__(self): + return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')" class ConceptNode(LexerNode): @@ -74,13 +100,15 @@ class ConceptNode(LexerNode): self.end == other[2] and \ self.source == other[3] - if not super().__eq__(other): - return False + # if not super().__eq__(other): + # return False if not isinstance(other, ConceptNode): return False return self.concept == other.concept and \ + self.start == other.start and \ + self.end == other.end and \ 
self.source == other.source and \ self.underlying == other.underlying @@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode): return name + sub_names def __eq__(self, other): - if not super().__eq__(other): - return False + # if not super().__eq__(other): + # return False if not isinstance(other, NonTerminalNode): return False @@ -140,8 +168,8 @@ class TerminalNode(LexerNode): return name + f"'{self.value}'" def __eq__(self, other): - if not super().__eq__(other): - return False + # if not super().__eq__(other): + # return False if not isinstance(other, TerminalNode): return False @@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser): self.reset_parser(context, text) concepts_found = [[]] + unrecognized_tokens = None + has_unrecognized = False + # actually list of list # The first dimension is the number of possibilities found # The second dimension is the number of concepts found, under one possibility @@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser): while True: init_pos = self.pos res = [] + for concept, grammar in self.concepts_grammars.items(): self.seek(init_pos) node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode @@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser): if len(res) == 0: # not recognized self.seek(init_pos) - not_recognized = self.get_text_from_tokens(self.get_token()) - self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized)) - break + if unrecognized_tokens: + unrecognized_tokens.add_token(self.get_token(), init_pos) + else: + unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()]) + concepts_found = core.utils.product(concepts_found, [unrecognized_tokens]) + has_unrecognized = True - res = self.get_bests(res) # only keep the concepts that eat the more tokens - concepts_found = core.utils.product(concepts_found, res) + if not self.next_token(False): + break - # loop - self.seek(res[0].end) - if not self.next_token(): - break + else: # some concepts are 
recognized + if unrecognized_tokens: + unrecognized_tokens.fix_source() + unrecognized_tokens = None + res = self.get_bests(res) # only keep the concepts that eat the more tokens + concepts_found = core.utils.product(concepts_found, res) - # manage when nothing is recognized (or other error) - if self.has_error: - ret = self.sheerka.ret( - self.name, - False, - self.sheerka.new( - BuiltinConcepts.PARSER_RESULT, - parser=self, - source=text, - body=self.error_sink, - try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found)) - self.log_result(context, text, ret) - return ret + # loop + self.seek(res[0].end) + if not self.next_token(): + break + + # Fix the source if we were working on unrecognized tokens + if unrecognized_tokens: + unrecognized_tokens.fix_source() # else # returns as many ReturnValue than choices found @@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser): ret.append( self.sheerka.ret( self.name, - True, + not has_unrecognized, self.sheerka.new( BuiltinConcepts.PARSER_RESULT, parser=self, diff --git a/parsers/DefaultParser.py b/parsers/DefaultParser.py index 8e8e0ff..c18f73f 100644 --- a/parsers/DefaultParser.py +++ b/parsers/DefaultParser.py @@ -248,6 +248,9 @@ class DefaultParser(BaseParser): # Regroup the tokens by parts first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens) + if first_token.type == TokenKind.EOF: + return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT])) + # get the name concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts) diff --git a/tests/test_ConceptLexerParser.py b/tests/test_ConceptLexerParser.py index d938fef..4fc4f51 100644 --- a/tests/test_ConceptLexerParser.py +++ b/tests/test_ConceptLexerParser.py @@ -2,8 +2,10 @@ import pytest from core.builtin_concepts import BuiltinConcepts from core.concept import Concept from core.sheerka import Sheerka, ExecutionContext +from core.tokenizer import 
Tokenizer, TokenKind, Token from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \ - ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore + ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore, \ + UnrecognizedTokensNode from sdp.sheerkaDataProvider import Event @@ -25,6 +27,16 @@ def u(parsing_expression, start, end, children=None): return NonTerminalNode(parsing_expression, start, end, [], children) +def t(text): + if text.startswith("'") or text.startswith('"'): + return Token(TokenKind.STRING, text, 0, 0, 0) + + if text.startswith(" "): + return Token(TokenKind.WHITESPACE, text, 0, 0, 0) + + return Token(TokenKind.IDENTIFIER, text, 0, 0, 0) + + @pytest.mark.parametrize("match, text", [ ("foo", "foo"), ("'foo'", "'foo'"), @@ -70,36 +82,6 @@ def test_i_can_match_multiple_concepts_in_one_input(): ] -def test_i_cannot_match_an_unknown_input(): - context = get_context() - parser = ConceptLexerParser() # no grammar registered - - res = parser.parse(context, "foo") - - assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "foo" - - -def test_i_cannot_match_when_part_of_the_input_is_unknown(): - context = get_context() - one = Concept(name="one") - two = Concept(name="two") - concepts = {one: "one", two: "two"} - parser = ConceptLexerParser() - parser.initialize(context, concepts) - - res = parser.parse(context, "one two three") - assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [ - ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)), - ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2))] # these two were recognized - assert 
context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "three" - - def test_i_can_match_sequence(): context = get_context() foo = Concept(name="foo") @@ -118,37 +100,6 @@ def test_i_can_match_sequence(): u("three", 4, 4)]))] -def test_wrong_sequence_is_not_matched(): - context = get_context() - foo = Concept(name="foo") - concepts = {foo: Sequence("one", "two", "three")} - parser = ConceptLexerParser() - parser.initialize(context, concepts) - - res = parser.parse(context, "one two three one") - - assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [(foo, "one two three")] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "one" - - -def test_i_cannot_match_sequence_if_end_of_file(): - context = get_context() - foo = Concept(name="foo") - concepts = {foo: Sequence("one", "two", "three")} - parser = ConceptLexerParser() - parser.initialize(context, concepts) - - res = parser.parse(context, "one two") - assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "one" - - def test_i_always_choose_the_longest_match(): context = get_context() foo = Concept(name="foo") @@ -205,8 +156,10 @@ def test_i_can_match_ordered_choice(): res3 = parser.parse(context, "three") assert not res3.status - assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res3.value.body[0].body == "three" + assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT) + assert res3.value.value == [ + UnrecognizedTokensNode(0, 0, [t("three")]) + ] def test_i_cannot_match_ordered_choice_with_empty_alternative(): @@ -218,6 +171,10 @@ def 
test_i_cannot_match_ordered_choice_with_empty_alternative(): res = parser.parse(context, "ok") # because token[0] is not "one" and not "" (it is 'two') assert not res.status + assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) + assert res.value.value == [ + UnrecognizedTokensNode(0, 0, [t("ok")]) + ] def test_i_can_mix_sequences_and_ordered_choices(): @@ -248,8 +205,10 @@ def test_i_can_mix_sequences_and_ordered_choices(): res3 = parser.parse(context, "twenty one") assert not res3.status - assert res3.value.body[0].body == "twenty" - assert res3.value.try_parsed == [] + assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT) + assert res3.value.value == [ + UnrecognizedTokensNode(0, 2, [t("twenty"), t(" "), t("one")]) + ] def test_i_can_mix_ordered_choices_and_sequences(): @@ -364,9 +323,9 @@ def test_i_cannot_parse_wrong_input_with_optional(): res = parser.parse(context, "two") assert not res.status assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "two" + assert res.value.value == [ + UnrecognizedTokensNode(0, 0, [t("two")]) + ] def test_i_can_use_reference(): @@ -463,7 +422,63 @@ def test_i_can_parse_when_reference(): assert res.value.body == [(foo, 0, 0, "twenty")] -def test_i_can_detect_duplicates_when_reference(): +def test_i_can_parse_multiple_results(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")) + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two") + assert len(res) == 2 + assert res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [(bar, 0, 2, "one two")] 
+ + assert res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [(foo, 0, 2, "one two")] + + +def test_i_can_parse_multiple_results_times_two(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")) + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two one two") + assert len(res) == 4 + assert res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [(bar, "one two"), (bar, "one two")] + + assert res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [(foo, "one two"), (bar, "one two")] + + assert res[2].status + assert context.sheerka.isinstance(res[2].value, BuiltinConcepts.PARSER_RESULT) + assert res[2].value.body == [(bar, "one two"), (foo, "one two")] + + assert res[3].status + assert context.sheerka.isinstance(res[3].value, BuiltinConcepts.PARSER_RESULT) + assert res[3].value.body == [(foo, "one two"), (foo, "one two")] + + +def test_i_can_parse_multiple_results_when_reference(): context = get_context() foo = Concept(name="foo") bar = Concept(name="bar") @@ -557,17 +572,17 @@ def test_i_cannot_parse_zero_and_more_when_wrong_entry(): res = parser.parse(context, "one two") assert not res.status assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [ - ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)]))] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "two" + assert res.value.value == [ + ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)])), + 
UnrecognizedTokensNode(2, 2, [t("two")]) + ] res = parser.parse(context, "two") assert not res.status assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "two" + assert res.value.value == [ + UnrecognizedTokensNode(0, 0, [t("two")]) + ] def test_i_can_parse_zero_and_more_with_separator(): @@ -636,10 +651,9 @@ def test_i_can_parse_sequence_and_one_or_more(): res = parser.parse(context, "two") assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) - assert res.value.try_parsed == [] - assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body[0].body == "two" + assert res.value.body == [ + UnrecognizedTokensNode(0, 0, [t("two")]) + ] def test_i_can_parse_one_and_more_with_separator(): @@ -803,6 +817,201 @@ def test_i_can_initialize_rule_names(): assert return_value[bar].rule_name == "foo" +@pytest.mark.parametrize("text, end_position", [ + ("foo", 0), + ("foo bar", 2) +]) +def test_cannot_parser_unknown_concepts(text, end_position): + context = get_context() + + parser = ConceptLexerParser() + parser.initialize(context, {}) + + res = parser.parse(context, text) + tokens = list(Tokenizer(text))[:-1] + + assert not res.status + assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) + assert res.value.value == [UnrecognizedTokensNode(0, end_position, tokens)] + + +def test_i_cannot_parse_when_part_of_the_input_is_unrecognized(): + context = get_context() + one = Concept(name="one") + two = Concept(name="two") + concepts = {one: "one", two: "two"} + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two three") + assert not res.status + assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) + assert 
res.value.value == [ + ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)), + ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)), + UnrecognizedTokensNode(4, 4, [t("three")]) + ] + + +def test_i_cannot_parse_when_wrong_sequence(): + context = get_context() + foo = Concept(name="foo") + concepts = {foo: Sequence("one", "two", "three")} + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two three one") + + assert not res.status + assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) + assert res.value.value == [ + (foo, "one two three"), + UnrecognizedTokensNode(6, 6, [t("one")]) + ] + + +def test_i_cannot_parse_when_sequence_cannot_match_because_of_end_of_file(): + context = get_context() + foo = Concept(name="foo") + concepts = {foo: Sequence("one", "two", "three")} + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two") + + assert not res.status + assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT) + assert res.value.value == [ + UnrecognizedTokensNode(0, 2, [t("one"), t(" "), t("two")]) + ] + + +def test_i_cannot_parse_multiple_results_when_unknown_tokens_at_the_end(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")) + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two four five") + assert len(res) == 2 + assert not res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [ + (bar, 0, 2, "one two"), + UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")]) + ] + + assert not res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [ + (foo, 0, 2, 
"one two"), + UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")]) + ] + + +def test_i_cannot_parse_multiple_results_when_beginning_by_unknown_tokens(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")) + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "four five one two") + assert len(res) == 2 + assert not res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [ + UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]), + (bar, 4, 6, "one two"), + ] + + assert not res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [ + UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]), + (foo, 4, 6, "one two"), + ] + + +def test_i_cannot_parse_multiple_results_when_surrounded_by_unknown_tokens(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")) + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "four five one two six seven") + assert len(res) == 2 + assert not res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [ + UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]), + (bar, 4, 6, "one two"), + UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]), + ] + + assert not res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [ + UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]), + (foo, 4, 6, "one two"), + UnrecognizedTokensNode(8, 10, 
[t("six"), t(" "), t("seven")]), + ] + + +def test_i_cannot_parse_multiple_results_when_unknown_tokens_in_the_middle(): + context = get_context() + foo = Concept(name="foo") + bar = Concept(name="bar") + baz = Concept(name="baz") + + concepts = { + bar: Sequence("one", "two"), + foo: Sequence("one", OrderedChoice("two", "three")), + baz: StrMatch("six"), + } + + parser = ConceptLexerParser() + parser.initialize(context, concepts) + + res = parser.parse(context, "one two four five six") + assert len(res) == 2 + assert not res[0].status + assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT) + assert res[0].value.body == [ + (bar, 0, 2, "one two"), + UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]), + (baz, 8, 8, "six"), + ] + + assert not res[1].status + assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT) + assert res[1].value.body == [ + (foo, 0, 2, "one two"), + UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]), + (baz, 8, 8, "six"), + ] + + # # def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties(): # context = get_context() diff --git a/tests/test_ConceptNodeEvaluator.py b/tests/test_ConceptNodeEvaluator.py index 49e45ba..1c36bc8 100644 --- a/tests/test_ConceptNodeEvaluator.py +++ b/tests/test_ConceptNodeEvaluator.py @@ -5,7 +5,7 @@ from core.concept import Concept from core.sheerka import Sheerka, ExecutionContext from evaluators.ConceptNodeEvaluator import ConceptNodeEvaluator from parsers.ConceptLexerParser import ConceptNode, ConceptLexerParser, Sequence, TerminalNode, \ - StrMatch, Optional, OrderedChoice, ZeroOrMore + StrMatch, Optional, OrderedChoice, ZeroOrMore, UnrecognizedTokensNode from sdp.sheerkaDataProvider import Event @@ -37,8 +37,12 @@ def get_concept_node(context, grammar, expression): @pytest.mark.parametrize("ret_val, expected", [ (ReturnValueConcept("some_name", True, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), True), 
(ReturnValueConcept("some_name", True, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), True), + (ReturnValueConcept("some_name", True, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), True), + (ReturnValueConcept("some_name", True, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), True), (ReturnValueConcept("some_name", False, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), False), (ReturnValueConcept("some_name", False, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), False), + (ReturnValueConcept("some_name", False, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), False), + (ReturnValueConcept("some_name", False, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), False), (ReturnValueConcept("some_name", True, ParserResultConcept(value="Not a concept node")), False), (ReturnValueConcept("some_name", True, ParserResultConcept(value=["Not a concept node"])), False), (ReturnValueConcept("some_name", True, [ConceptNode(Concept(), 0, 0)]), False), diff --git a/tests/test_DefaultParser.py b/tests/test_DefaultParser.py index 9eaefb3..68b1c49 100644 --- a/tests/test_DefaultParser.py +++ b/tests/test_DefaultParser.py @@ -311,6 +311,8 @@ def test_i_can_parse_is_a(): "concept", "isa number", "name isa", + "def", + "def concept_name" ]) def test_i_cannot_parse_invalid_entries(text): parser = DefaultParser() diff --git a/tests/test_sheerka_non_reg.py b/tests/test_sheerka_non_reg.py index e0e82f7..4d39649 100644 --- a/tests/test_sheerka_non_reg.py +++ b/tests/test_sheerka_non_reg.py @@ -7,6 +7,7 @@ from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept from core.concept import Concept, PROPERTIES_TO_SERIALIZE, Property from core.sheerka import Sheerka, ExecutionContext from evaluators.MutipleSameSuccessEvaluator import MultipleSameSuccessEvaluator +from parsers.BaseParser import BaseParser from parsers.ConceptLexerParser import Sequence, ZeroOrMore, StrMatch, OrderedChoice, 
Optional, ConceptMatch, \ ConceptLexerParser from sdp.sheerkaDataProvider import SheerkaDataProvider, Event @@ -291,7 +292,7 @@ def test_i_can_manage_concepts_with_the_same_key_when_values_are_the_same(): res = sheerka.evaluate_user_input("hello 'foo'") assert len(res) == 1 assert res[0].status - assert res[0].value.body == "hello foo" # I don't know yet the one to choose + assert res[0].value.body == "hello foo" # I don't know yet the one to choose assert res[0].who == sheerka.get_evaluator_name(MultipleSameSuccessEvaluator.NAME)