import pytest

from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from core.tokenizer import Tokenizer, TokenKind
from parsers.ConceptLexerParser import (
    ConceptLexerParser,
    ConceptNode,
    Sequence,
    StrMatch,
    OrderedChoice,
    Optional,
    CrossRef,
    RegexParser,
    ZeroOrMore,
    OneOrMore,
    UnexpectedEndOfFileError,
    UnexpectedTokenErrorNode,
    ConceptMatch,
    ParsingExpressionVisitor,
)


class ConceptVisitor(ParsingExpressionVisitor):
    """Visitor that records the ``concept_name`` of ConceptMatch nodes."""

    def __init__(self):
        # Names collected by visit_ConceptMatch; a set, so duplicates collapse.
        self.concepts = set()

    def visit_ConceptMatch(self, node):
        """Remember the concept name carried by ``node``."""
        self.concepts.add(node.concept_name)


@pytest.mark.parametrize("match, text", [
    ("foo", "foo"),
    ("'foo'", "'foo'"),
    ("1", "1"),
    ("3.14", "3.14"),
    ("+", "+"),
    (StrMatch("foo"), "foo"),
    (StrMatch("'foo'"), "'foo'"),
    (StrMatch("1"), "1"),
    (StrMatch("3.14"), "3.14"),
    (StrMatch("+"), "+"),
])
def test_i_can_match_simple_tokens(match, text):
    """A single-token grammar (plain string or StrMatch) recognizes ``text``.

    ``match`` is the grammar registered for the concept and ``text`` is the
    input that is parsed against it.
    """
    context = get_context()
    foo = Concept(name="foo")
    # Register the parametrized grammar itself so that the StrMatch rows
    # exercise StrMatch rather than repeating the plain-string rows.
    concepts = {foo: match}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, text)

    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [ConceptNode(foo, 0, 0, source=text)]


def test_i_can_match_multiple_concepts_in_one_input():
    """Each recognized word of the input yields its own ConceptNode."""
    context = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    concepts = {one: "one", two: "two"}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two one")

    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [
        ConceptNode(one, 0, 0, source="one"),
        ConceptNode(two, 2, 2, source="two"),
        ConceptNode(one, 4, 4, source="one"),
    ]


def test_i_cannot_match_an_unknown_input():
    """Parsing with an uninitialized parser reports an UNKNOWN_CONCEPT."""
    context = get_context()
    parser = ConceptLexerParser()  # no grammar registered

    res = parser.parse(context, "foo")

    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert res.value.body[0].body == "foo"


def test_i_cannot_match_when_part_of_the_input_is_unknown():
    """Parsing fails on the first unknown word but keeps what was recognized."""
    context = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    concepts = {one: "one", two: "two"}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three")

    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.try_parsed == [
        ConceptNode(one, 0, 0, source="one"),
        ConceptNode(two, 2, 2, source="two"),
    ]  # these two were recognized
    assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert res.value.body[0].body == "three"


def test_i_can_match_sequence():
    """A Sequence grammar matches its elements in order as one node."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three")

    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [ConceptNode(foo, 0, 4, source="one two three")]


def test_wrong_sequence_is_not_matched():
    """Trailing input that does not complete a new sequence is unknown."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three one")

    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.try_parsed == [ConceptNode(foo, 0, 4, source="one two three")]
    assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert res.value.body[0].body == "one"


def test_i_cannot_match_sequence_if_end_of_file():
    """An input that ends before the sequence is complete is not matched."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two")

    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.try_parsed == []
    assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert res.value.body[0].body == "one"


def test_i_always_choose_the_longest_match():
    """When two sequences share a prefix, the longer match is returned."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {bar: Sequence("one", "two"), foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three")

    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [ConceptNode(foo, 0, 4, source="one two three")]


def test_i_can_match_several_sequences():
    """Consecutive sequences in one input produce one node each."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {bar: Sequence("one", "two"), foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three one two")

    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [
        ConceptNode(foo, 0, 4, source="one two three"),
        ConceptNode(bar, 6, 8, source="one two"),
    ]


def test_i_can_match_ordered_choice():
    """An OrderedChoice matches any of its alternatives and nothing else."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: OrderedChoice("one", "two")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res1 = parser.parse(context, "one")
    assert res1.status
    assert context.sheerka.isinstance(res1.value, BuiltinConcepts.PARSER_RESULT)
    assert res1.value.body == [ConceptNode(foo, 0, 0, source="one")]

    res2 = parser.parse(context, "two")
    assert res2.status
    assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
    assert res2.value.body == [ConceptNode(foo, 0, 0, source="two")]

    res3 = parser.parse(context, "three")
    assert not res3.status
    assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert res3.value.body[0].body == "three"


def test_i_cannot_match_ordered_choice_with_empty_alternative():
    """An empty-string alternative does not make the choice match."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence(OrderedChoice("one", ""), "two")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    # token[0] is "ok", which is neither "one" nor "".
    # NOTE(review): parsing "two" would exercise the empty alternative more
    # directly than "ok" does - confirm which input was intended.
    res = parser.parse(context, "ok")

    assert not res.status


def test_i_can_mix_sequences_and_ordered_choices():
    """A Sequence may start with an OrderedChoice."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence(OrderedChoice("twenty", "thirty"), "one", "ok")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res1 = parser.parse(context, "twenty one ok")
    assert res1.status
    assert context.sheerka.isinstance(res1.value, BuiltinConcepts.PARSER_RESULT)
    assert res1.value.body == [ConceptNode(foo, 0, 4, source="twenty one ok")]

    res2 = parser.parse(context, "thirty one ok")
    assert res2.status
    assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
    assert res2.value.body == [ConceptNode(foo, 0, 4, source="thirty one ok")]

    # Incomplete sequence: nothing is recognized, first word is reported.
    res3 = parser.parse(context, "twenty one")
    assert not res3.status
    assert res3.value.body[0].body == "twenty"
    assert res3.value.try_parsed == []


def test_i_can_mix_ordered_choices_and_sequences():
    """An OrderedChoice may contain a Sequence as an alternative."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: OrderedChoice(Sequence("twenty", "thirty"), "one")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "twenty thirty")
    assert res.status

    res = parser.parse(context, "one")
    assert res.status


def test_i_cannot_parse_empty_optional():
    """Empty input is rejected with IS_EMPTY even if the grammar is optional."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Optional("one")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "")

    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.IS_EMPTY)


def test_i_can_parse_optional():
    """An Optional grammar matches when its element is present."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Optional("one")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one")

    assert res.status
    assert res.value.value == [ConceptNode(foo, 0, 0, source="one")]


def test_i_can_parse_sequence_starting_with_optional():
    """A leading Optional may be present or absent."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence(Optional("twenty"), "one")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "twenty one")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 2, source="twenty one")]

    res = parser.parse(context, "one")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 0, source="one")]


def test_i_can_parse_sequence_ending_with_optional():
    """A trailing Optional may be present or absent."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", "two", Optional("three"))}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 4, source="one two three")]

    res = parser.parse(context, "one two")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 2, source="one two")]


def test_i_can_parse_sequence_with_optional_in_between():
    """An Optional in the middle of a Sequence may be present or absent."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", Optional("two"), "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two three")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 4, source="one two three")]

    res = parser.parse(context, "one three")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 2, source="one three")]


def test_i_can_use_reference():
    """A concept defined as a reference to another yields one result each."""
    # When there are multiple matches for the same input,
    # do I need to create a choice concept?
    # No: create a return value for every possible graph.
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {foo: Sequence("one", "two"), bar: foo}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two")

    assert len(res) == 2
    assert res[0].status
    assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
    assert res[0].value.body == [ConceptNode(foo, 0, 2, source="one two")]

    assert res[1].status
    assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
    assert res[1].value.body == [ConceptNode(bar, 0, 2, source="one two")]


def test_i_can_use_context_reference_with_multiple_levels():
    """Same as test_i_can_use_reference, but with a reference to a reference."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    baz = Concept(name="baz")
    concepts = {foo: Sequence("one", "two"), bar: foo, baz: bar}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two")

    assert len(res) == 3
    assert res[0].status
    assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
    assert res[0].value.body == [ConceptNode(foo, 0, 2, source="one two")]

    assert res[1].status
    assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
    assert res[1].value.body == [ConceptNode(bar, 0, 2, source="one two")]

    assert res[2].status
    assert context.sheerka.isinstance(res[2].value, BuiltinConcepts.PARSER_RESULT)
    assert res[2].value.body == [ConceptNode(baz, 0, 2, source="one two")]


def test_order_is_not_important_when_using_references():
    """A reference may be declared before the concept it refers to."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {bar: foo, foo: Sequence("one", "two")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "one two")

    # Results come back in declaration order: bar first, then foo.
    assert len(res) == 2
    assert res[0].value.body == [ConceptNode(bar, 0, 2, source="one two")]
    assert res[1].value.body == [ConceptNode(foo, 0, 2, source="one two")]


def test_i_can_parse_when_reference():
    """A concept can be used as an element inside another concept's grammar."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: Sequence(foo, OrderedChoice("one", "two")),
        foo: OrderedChoice("twenty", "thirty"),
    }
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "twenty two")
    assert res.status
    assert res.value.body == [ConceptNode(bar, 0, 2, source="twenty two")]

    res = parser.parse(context, "thirty one")
    assert res.status
    assert res.value.body == [ConceptNode(bar, 0, 2, source="thirty one")]

    res = parser.parse(context, "twenty")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 0, source="twenty")]


def test_i_can_detect_duplicates_when_reference():
    """Input matching both the referencing and referenced concept yields both."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: Sequence(foo, Optional(OrderedChoice("one", "two"))),
        foo: OrderedChoice("twenty", "thirty"),
    }
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    res = parser.parse(context, "twenty")

    assert len(res) == 2
    assert res[0].status
    assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
    assert res[0].value.body == [ConceptNode(bar, 0, 0, source="twenty")]

    assert res[1].status
    assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
    assert res[1].value.body == [ConceptNode(foo, 0, 0, source="twenty")]


def test_i_can_detect_infinite_recursion():
    """Two concepts that only reference each other are both dropped."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: foo,
        foo: bar,
    }
    parser = ConceptLexerParser()
    parser.initialize(get_context(), concepts)

    assert bar not in parser.concepts_dict
    assert foo not in parser.concepts_dict


def test_i_can_detect_indirect_infinite_recursion_with_ordered_choice():
    """A recursive first alternative is rejected; a recursive later one is not."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: foo,
        foo: OrderedChoice(bar, "foo"),
    }
    parser = ConceptLexerParser()
    parser.initialize(get_context(), concepts)

    assert foo not in parser.concepts_dict  # removed because of the infinite recursion
    assert bar not in parser.concepts_dict  # removed because of the infinite recursion

    # the other way around is possible
    context = get_context()
    concepts = {
        bar: foo,
        foo: OrderedChoice("foo", bar),
    }
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)

    assert foo in parser.concepts_dict
    assert bar in parser.concepts_dict

    res = parser.parse(context, "foo")

    assert len(res) == 2
    assert res[0].status
    assert res[0].value.body == [ConceptNode(bar, 0, 0, source="foo")]
    assert res[1].status
    assert res[1].value.body == [ConceptNode(foo, 0, 0, source="foo")]


def test_i_can_detect_indirect_infinite_recursion_with_sequence():
    """A cycle that goes through a Sequence element is detected."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: foo,
        foo: Sequence("one", bar, "two"),
    }
    parser = ConceptLexerParser()
    parser.initialize(get_context(), concepts)

    assert foo not in parser.concepts_dict  # removed because of the infinite recursion
    assert bar not in parser.concepts_dict  # removed because of the infinite recursion


def test_i_can_detect_indirect_infinite_recursion_with_sequence_or_ordered_choice():
    """A cycle through an OrderedChoice nested in a Sequence is detected."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: foo,
        foo: Sequence("one", OrderedChoice(bar, "other"), "two"),
    }
    parser = ConceptLexerParser()
    parser.initialize(get_context(), concepts)

    assert foo not in parser.concepts_dict  # removed because of the infinite recursion
    assert bar not in parser.concepts_dict  # removed because of the infinite recursion


def test_i_can_detect_indirect_infinite_recursion_with_optional():
    # TODO infinite recursion with optional
    pass


@pytest.mark.parametrize("expression, expected", [
    ("'str'", StrMatch("str")),
    ("1", StrMatch("1")),
    (" 1", StrMatch("1")),
    (",", StrMatch(",")),
    ("'foo'?", Optional(StrMatch("foo"))),
    ("'foo'*", ZeroOrMore(StrMatch("foo"))),
    ("'foo'+", OneOrMore(StrMatch("foo"))),
    ("1 | 2 | 3", OrderedChoice(StrMatch("1"), StrMatch("2"), StrMatch("3"))),
    ("1|2|3", OrderedChoice(StrMatch("1"), StrMatch("2"), StrMatch("3"))),
    ("1 2 'foo'", Sequence(StrMatch("1"), StrMatch("2"), StrMatch("foo"))),
    ("1 2 | 3 4+", OrderedChoice(
        Sequence(StrMatch("1"), StrMatch("2")),
        Sequence(StrMatch("3"), OneOrMore(StrMatch("4"))))),
    ("1 (2 | 3) 4+", Sequence(
        StrMatch("1"),
        OrderedChoice(StrMatch("2"), StrMatch("3")),
        OneOrMore(StrMatch("4")))),
    ("(1|2)+", OneOrMore(OrderedChoice(StrMatch("1"), StrMatch("2")))),
    ("(1 2)+", OneOrMore(Sequence(StrMatch("1"), StrMatch("2")))),
    ("1 *", Sequence(StrMatch("1"), StrMatch("*"))),
    ("1 ?", Sequence(StrMatch("1"), StrMatch("?"))),
    ("1 +", Sequence(StrMatch("1"), StrMatch("+"))),
    ("(1|*) +", Sequence(OrderedChoice(StrMatch("1"), StrMatch("*")), StrMatch("+"))),
    ("1, :&", Sequence(StrMatch("1"), StrMatch(","), StrMatch(":"), StrMatch("&"))),
    ("(1 )", StrMatch("1")),
])
def test_i_can_parse_regex(expression, expected):
    """RegexParser turns a textual grammar into the expected expression tree."""
    parser = RegexParser()

    res = parser.parse(get_context(), Tokenizer(expression))

    assert not parser.has_error
    assert res.status
    assert res.value.value == expected
    assert res.value.source == expression


@pytest.mark.parametrize("expression, error", [
    ("1 ", UnexpectedEndOfFileError()),
    ("1|", UnexpectedEndOfFileError()),
    ("(1|)", UnexpectedTokenErrorNode("Unexpected token 'TokenKind.EOF'", [TokenKind.RPAR])),
])
def test_i_can_detect_errors(expression, error):
    """Malformed grammars make RegexParser report the expected first error."""
    parser = RegexParser()

    res = parser.parse(get_context(), Tokenizer(expression))
    ret_value = res.value.value

    assert parser.has_error
    assert not res.status
    assert ret_value[0] == error


def test_i_can_parse_regex_with_reference():
    """A bare identifier is parsed as a ConceptMatch."""
    expression = "foo"
    parser = RegexParser()

    res = parser.parse(get_context(), Tokenizer(expression))

    assert res.status
    assert res.value.value == ConceptMatch("foo")
    assert res.value.source == expression


def test_i_can_parse_cross_ref_with_modifier():
    """A repetition modifier can be applied to a concept reference."""
    expression = "foo*"
    parser = RegexParser()

    res = parser.parse(get_context(), Tokenizer(expression))

    assert res.status
    assert res.value.value == ZeroOrMore(ConceptMatch("foo"))
    assert res.value.source == expression
def test_i_can_parse_sequence_with_cross_ref():
    """Concept references can be mixed with literals inside a sequence."""
    expression = "foo 'and' bar+"
    parser = RegexParser()

    res = parser.parse(get_context(), Tokenizer(expression))

    assert res.status
    assert res.value.value == Sequence(ConceptMatch("foo"), StrMatch("and"), OneOrMore(ConceptMatch("bar")))
    assert res.value.source == expression


def test_i_can_parse_choice_with_cross_ref():
    """Concept references can be used as alternatives of a choice."""
    foo = Concept("foo")
    bar = Concept("bar")
    context = get_context()
    context.sheerka.add_in_cache(foo)
    context.sheerka.add_in_cache(bar)

    expression = "foo | bar?"
    parser = RegexParser()

    res = parser.parse(context, Tokenizer(expression))

    assert res.status
    assert res.value.value == OrderedChoice(ConceptMatch("foo"), Optional(ConceptMatch("bar")))
    assert res.value.source == expression


def test_i_can_use_the_result_of_regex_parsing_to_parse_a_text():
    """Grammars produced by RegexParser can drive ConceptLexerParser."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    context = get_context()
    context.sheerka.add_in_cache(foo)
    context.sheerka.add_in_cache(bar)

    regex_parser = RegexParser()
    # NOTE(review): raw strings are passed here, whereas the other RegexParser
    # tests wrap the expression in Tokenizer(...) - presumably parse() accepts
    # both; confirm.
    foo_definition = regex_parser.parse(context, "'twenty' | 'thirty'").value.value
    bar_definition = regex_parser.parse(context, "foo ('one' | 'two')").value.value
    concepts = {bar: bar_definition, foo: foo_definition}

    concept_parser = ConceptLexerParser()
    concept_parser.initialize(context, concepts)

    res = concept_parser.parse(context, "twenty two")
    assert res.status
    assert res.value.body == [ConceptNode(bar, 0, 2, source="twenty two")]

    res = concept_parser.parse(context, "thirty one")
    assert res.status
    assert res.value.body == [ConceptNode(bar, 0, 2, source="thirty one")]

    res = concept_parser.parse(context, "twenty")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 0, source="twenty")]


def test_i_can_visit_parsing_expression():
    """ConceptVisitor collects the names of the concepts in an expression."""
    mult = Concept(name="mult")
    add = Concept(name="add")

    visitor = ConceptVisitor()
    visitor.visit(Sequence(mult, Optional(Sequence("+", add))))

    # sorted() accepts any iterable, so the set needs no intermediate list.
    assert sorted(visitor.concepts) == ["add", "mult"]


#
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
#     context = get_context()
#     add = Concept(name="add")
#     mult = Concept(name="mult")
#     atom = Concept(name="atom")
#
#     concepts = {
#         add: Sequence(mult, Optional(Sequence(OrderedChoice('+', '-', rule_name="sign"), add))),
#         mult: Sequence(atom, Optional(Sequence(OrderedChoice('*', '/'), mult))),
#         atom: OrderedChoice(OrderedChoice('1', '2', '3'), Sequence('(', add, ')')),
#     }
#
#     parser = ConceptLexerParser()
#     parser.register(concepts)
#
#     # res = parser.parse(context, "1")
#     # assert len(res) == 3  # add, mult, atom
#     #
#     # res = parser.parse(context, "1 * 2")
#     # assert len(res) == 2  # add and mult
#     #
#     # res = parser.parse(context, "1 + 2")
#     # assert res.status
#     # assert res.value.value == [ConceptNode(add, 0, 4, source="1 + 2")]
#
#     res = parser.parse(context, "1 * 2 + 3")
#     assert res.status
#     assert res.value.value == [ConceptNode(add, 0, 4, source="1 + 2 + 3")]


def test_i_can_register_concepts_with_the_same_name():
    # TODO: concepts are registered by name;
    # what happens when two concepts have the same name?
    pass


def test_i_can_parse_very_very_long_input():
    # TODO: in the current implementation, all the tokens are loaded in memory.
    # That is clearly not a good approach.
    pass


def get_context():
    """Create a Sheerka initialized on "mem://" wrapped in an ExecutionContext."""
    sheerka = Sheerka(skip_builtins_in_db=True)
    sheerka.initialize("mem://")

    return ExecutionContext("sheerka", "xxxx", sheerka)