# Sheerka-Old/tests/test_ConceptLexerParser.py
# 627 lines, 20 KiB, Python
# (file-listing header from the original export, commented out so the module parses)
import pytest
from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch
class ConceptVisitor(ParsingExpressionVisitor):
    """Test visitor that collects every concept referenced by ConceptMatch nodes."""

    def __init__(self):
        # Set of Concept objects seen while walking a parsing-expression tree.
        self.concepts = set()

    def visit_ConceptMatch(self, node):
        # Record the concept carried by this match node.
        self.concepts.add(node.concept)
def u(parsing_expression, start, end, children=None):
    """Build the expected *underlying* parse node for a parsing expression.

    Plain strings are first promoted to a StrMatch. StrMatch expressions
    produce a TerminalNode; anything else produces a NonTerminalNode that
    wraps *children*.
    """
    expr = StrMatch(parsing_expression) if isinstance(parsing_expression, str) else parsing_expression
    if isinstance(expr, StrMatch):
        return TerminalNode(expr, start, end, expr.to_match)
    return NonTerminalNode(expr, start, end, [], children)
@pytest.mark.parametrize("match, text", [
    ("foo", "foo"),
    ("'foo'", "'foo'"),
    ("1", "1"),
    ("3.14", "3.14"),
    ("+", "+"),
    (StrMatch("foo"), "foo"),
    (StrMatch("'foo'"), "'foo'"),
    (StrMatch("1"), "1"),
    (StrMatch("3.14"), "3.14"),
    (StrMatch("+"), "+"),
])
def test_i_can_match_simple_tokens(match, text):
    """A single-token concept matches its text whether given as a str or a StrMatch."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: text}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    res = parser.parse(context, text)
    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [ConceptNode(foo, 0, 0, source=text, underlying=u(match, 0, 0))]
def test_i_can_match_multiple_concepts_in_one_input():
    """Several registered concepts are recognized back-to-back in one input."""
    ctx = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {one: "one", two: "two"})
    result = lexer.parse(ctx, "one two one")
    assert result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    expected = [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
        ConceptNode(one, 4, 4, source="one", underlying=u("one", 4, 4)),
    ]
    assert result.value.value == expected
def test_i_cannot_match_an_unknown_input():
    """Without any registered grammar, the input token is reported as unknown."""
    ctx = get_context()
    lexer = ConceptLexerParser()  # deliberately: no grammar registered
    result = lexer.parse(ctx, "foo")
    assert not result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    unknown = result.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "foo"
def test_i_cannot_match_when_part_of_the_input_is_unknown():
    """Recognized prefix concepts are still reported when a later token is unknown."""
    ctx = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {one: "one", two: "two"})
    result = lexer.parse(ctx, "one two three")
    assert not result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    # the first two tokens were recognized before the failure
    assert result.value.try_parsed == [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2))]
    unknown = result.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "three"
def test_i_can_match_sequence():
    """A Sequence of string matches parses into one ConceptNode spanning all tokens."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence("one", "two", "three")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    res = parser.parse(context, "one two three")
    assert res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    # token positions are even (0, 2, 4) — presumably odd positions hold the
    # whitespace separators; TODO confirm against the lexer implementation
    assert res.value.value == [
        ConceptNode(foo, 0, 4, source="one two three", underlying=u(concepts[foo], 0, 4, [
            u("one", 0, 0),
            u("two", 2, 2),
            u("three", 4, 4)]))]
def test_wrong_sequence_is_not_matched():
    """A trailing token that breaks the sequence makes the whole parse fail."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence("one", "two", "three")})
    result = lexer.parse(ctx, "one two three one")
    assert not result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    # the complete sequence was recognized before the stray token
    assert result.value.try_parsed == [(foo, "one two three")]
    assert ctx.sheerka.isinstance(result.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert result.value.body[0].body == "one"
def test_i_cannot_match_sequence_if_end_of_file():
    """A sequence that runs past the end of input fails with nothing pre-parsed."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence("one", "two", "three")})
    result = lexer.parse(ctx, "one two")
    assert not result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    assert result.value.try_parsed == []
    assert ctx.sheerka.isinstance(result.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert result.value.body[0].body == "one"
def test_i_always_choose_the_longest_match():
    """When two concepts both match a prefix, the longest match wins."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {bar: Sequence("one", "two"), foo: Sequence("one", "two", "three")})
    result = lexer.parse(ctx, "one two three")
    assert result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    assert result.value.value == [(foo, "one two three")]
def test_i_can_match_several_sequences():
    """Consecutive portions of the input can be matched by different concepts."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {bar: Sequence("one", "two"), foo: Sequence("one", "two", "three")})
    result = lexer.parse(ctx, "one two three one two")
    assert result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
    assert result.value.value == [
        (foo, 0, 4, "one two three"),
        (bar, 6, 8, "one two"),
    ]
def test_i_can_match_ordered_choice():
    """An OrderedChoice matches either alternative and rejects anything else."""
    ctx = get_context()
    foo = Concept(name="foo")
    grammar = {foo: OrderedChoice("one", "two")}
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, grammar)
    for word in ("one", "two"):
        result = lexer.parse(ctx, word)
        assert result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            ConceptNode(foo, 0, 0, source=word, underlying=u(grammar[foo], 0, 0, [u(word, 0, 0)]))]
    rejected = lexer.parse(ctx, "three")
    assert not rejected.status
    assert ctx.sheerka.isinstance(rejected.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert rejected.value.body[0].body == "three"
def test_i_cannot_match_ordered_choice_with_empty_alternative():
    """An empty-string alternative does not let an unrelated token slip through."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence(OrderedChoice("one", ""), "two")})
    # "ok" matches neither "one" nor the empty alternative followed by "two"
    result = lexer.parse(ctx, "ok")
    assert not result.status
def test_i_can_mix_sequences_and_ordered_choices():
    """An OrderedChoice nested inside a Sequence matches either alternative in place."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence(OrderedChoice("twenty", "thirty"), "one", "ok")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    res1 = parser.parse(context, "twenty one ok")
    assert res1.status
    assert context.sheerka.isinstance(res1.value, BuiltinConcepts.PARSER_RESULT)
    assert res1.value.body == [ConceptNode(foo, 0, 4, source="twenty one ok",
                                           underlying=u(concepts[foo], 0, 4, [
                                               u(OrderedChoice("twenty", "thirty"), 0, 0, [u("twenty", 0, 0)]),
                                               u("one", 2, 2),
                                               u("ok", 4, 4)]))]
    res2 = parser.parse(context, "thirty one ok")
    assert res2.status
    assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
    assert res2.value.body == [ConceptNode(foo, 0, 4, source="thirty one ok",
                                           underlying=u(concepts[foo], 0, 4, [
                                               u(OrderedChoice("twenty", "thirty"), 0, 0, [u("thirty", 0, 0)]),
                                               u("one", 2, 2),
                                               u("ok", 4, 4)]))]
    # an incomplete sequence fails and nothing is reported as pre-parsed
    res3 = parser.parse(context, "twenty one")
    assert not res3.status
    assert res3.value.body[0].body == "twenty"
    assert res3.value.try_parsed == []
def test_i_can_mix_ordered_choices_and_sequences():
    """A Sequence may appear as an alternative inside an OrderedChoice."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: OrderedChoice(Sequence("twenty", "thirty"), "one")})
    assert lexer.parse(ctx, "twenty thirty").status
    assert lexer.parse(ctx, "one").status
def test_i_cannot_parse_empty_optional():
    """Parsing an empty input fails with IS_EMPTY, even under an optional grammar."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Optional("one")})
    result = lexer.parse(ctx, "")
    assert not result.status
    assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.IS_EMPTY)
def test_i_can_parse_optional():
    """An Optional expression matches when its inner token is present."""
    ctx = get_context()
    foo = Concept(name="foo")
    grammar = {foo: Optional("one")}
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, grammar)
    result = lexer.parse(ctx, "one")
    assert result.status
    assert result.value.value == [ConceptNode(foo, 0, 0, source="one",
                                              underlying=u(grammar[foo], 0, 0, [u("one", 0, 0)]))]
def test_i_can_parse_sequence_starting_with_optional():
    """A leading Optional may consume a token or be skipped entirely."""
    context = get_context()
    foo = Concept(name="foo")
    concepts = {foo: Sequence(Optional("twenty"), "one")}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    res = parser.parse(context, "twenty one")
    assert res.status
    assert res.value.body == [ConceptNode(
        foo, 0, 2,
        source="twenty one",
        underlying=u(concepts[foo], 0, 2,
                     [
                         u(Optional("twenty"), 0, 0, [u("twenty", 0, 0)]),
                         u("one", 2, 2)]
                     ))]
    # with the optional token absent, the node spans only the mandatory part
    res = parser.parse(context, "one")
    assert res.status
    assert res.value.body == [ConceptNode(foo, 0, 0, source="one",
                                          underlying=u(concepts[foo], 0, 0, [u("one", 0, 0)]))]
def test_i_can_parse_sequence_ending_with_optional():
    """A trailing Optional may or may not consume a token."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence("one", "two", Optional("three"))})
    with_tail = lexer.parse(ctx, "one two three")
    assert with_tail.status
    assert with_tail.value.body == [(foo, 0, 4, "one two three")]
    without_tail = lexer.parse(ctx, "one two")
    assert without_tail.status
    assert without_tail.value.body == [(foo, 0, 2, "one two")]
def test_i_can_parse_sequence_with_optional_in_between():
    """An Optional in the middle of a Sequence may consume a token or be skipped."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence("one", Optional("two"), "three")})
    full = lexer.parse(ctx, "one two three")
    assert full.status
    assert full.value.body == [(foo, 0, 4, "one two three")]
    short = lexer.parse(ctx, "one three")
    assert short.status
    assert short.value.body == [(foo, 0, 2, "one three")]
def test_i_can_use_reference():
    """A concept whose grammar is another concept yields one result per possible graph."""
    # when there are multiple matches for the same input
    # Do I need to create a choice concept ?
    # No, create a return value for every possible graph
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {foo: Sequence("one", "two"), bar: foo}
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    res = parser.parse(context, "one two")
    assert len(res) == 2
    assert res[0].status
    assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
    assert res[0].value.body == [ConceptNode(foo, 0, 2, source="one two",
                                             underlying=u(concepts[foo], 0, 2, [u("one", 0, 0), u("two", 2, 2)]))]
    assert res[1].status
    assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
    # the referencing concept wraps the referenced grammar in a ConceptMatch
    # carrying the referenced rule's name
    assert res[1].value.body == [ConceptNode(bar, 0, 2, source="one two",
                                             underlying=u(ConceptMatch(foo, rule_name="foo"), 0, 2,
                                                          [u(concepts[foo], 0, 2, [u("one", 0, 0), u("two", 2, 2)])]))]
def test_i_can_use_context_reference_with_multiple_levels():
    """Same as the previous reference test, but with a reference of a reference."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    baz = Concept(name="baz")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {foo: Sequence("one", "two"), bar: foo, baz: bar})
    results = lexer.parse(ctx, "one two")
    assert len(results) == 3
    # one result per level of the reference chain, in registration order
    for result, concept in zip(results, (foo, bar, baz)):
        assert result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(concept, 0, 2, "one two")]
def test_order_is_not_important_when_using_references():
    """A reference may be registered before the concept it points to."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {bar: foo, foo: Sequence("one", "two")})
    results = lexer.parse(ctx, "one two")
    assert len(results) == 2
    assert results[0].value.body == [(bar, 0, 2, "one two")]
    assert results[1].value.body == [(foo, 0, 2, "one two")]
def test_i_can_parse_when_reference():
    """A concept referenced inside a Sequence is resolved through its own grammar."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {bar: Sequence(foo, OrderedChoice("one", "two")),
                           foo: OrderedChoice("twenty", "thirty")})
    for text, concept, end in (("twenty two", bar, 2), ("thirty one", bar, 2), ("twenty", foo, 0)):
        result = lexer.parse(ctx, text)
        assert result.status
        assert result.value.body == [(concept, 0, end, text)]
def test_i_can_detect_duplicates_when_reference():
    """When an input satisfies both a concept and a reference to it, both results come back."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(ctx, {
        bar: Sequence(foo, Optional(OrderedChoice("one", "two"))),
        foo: OrderedChoice("twenty", "thirty"),
    })
    results = lexer.parse(ctx, "twenty")
    assert len(results) == 2
    for result, concept in zip(results, (bar, foo)):
        assert result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(concept, 0, 0, "twenty")]
def test_i_can_detect_infinite_recursion():
    """Mutually-referencing concepts are dropped from the registered grammars."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(get_context(), {bar: foo, foo: bar})
    assert bar not in lexer.concepts_grammars
    assert foo not in lexer.concepts_grammars
def test_i_can_detect_indirect_infinite_recursion_with_ordered_choice():
    """A cycle through the FIRST alternative of an OrderedChoice is fatal;
    through a later alternative the grammar stays usable."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    concepts = {
        bar: foo,
        foo: OrderedChoice(bar, "foo")
    }
    parser = ConceptLexerParser()
    parser.initialize(get_context(), concepts)
    assert foo not in parser.concepts_grammars  # removed because of the infinite recursion
    assert bar not in parser.concepts_grammars  # removed because of the infinite recursion
    # the other way around is possible
    context = get_context()
    concepts = {
        bar: foo,
        foo: OrderedChoice("foo", bar)
    }
    parser = ConceptLexerParser()
    parser.initialize(context, concepts)
    assert foo in parser.concepts_grammars
    assert bar in parser.concepts_grammars
    res = parser.parse(context, "foo")
    assert len(res) == 2
    assert res[0].status
    assert res[0].value.body == [(bar, 0, 0, "foo")]
    assert res[1].status
    assert res[1].value.body == [(foo, 0, 0, "foo")]
def test_i_can_detect_indirect_infinite_recursion_with_sequence():
    """A cycle routed through a Sequence operand is detected and both concepts dropped."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(get_context(), {bar: foo, foo: Sequence("one", bar, "two")})
    # both removed because of the infinite recursion
    assert foo not in lexer.concepts_grammars
    assert bar not in lexer.concepts_grammars
def test_i_can_detect_indirect_infinite_recursion_with_sequence_or_ordered_choice():
    """A cycle through an OrderedChoice nested inside a Sequence is still detected."""
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    lexer.initialize(get_context(), {bar: foo, foo: Sequence("one", OrderedChoice(bar, "other"), "two")})
    # both removed because of the infinite recursion
    assert foo not in lexer.concepts_grammars
    assert bar not in lexer.concepts_grammars
def test_i_can_detect_indirect_infinite_recursion_with_optional():
    """TODO: cover infinite recursion routed through an Optional operand."""
def test_i_can_visit_parsing_expression():
    """A ParsingExpressionVisitor collects every concept referenced in an expression tree."""
    mult = Concept(name="mult")
    add = Concept(name="add")
    visitor = ConceptVisitor()
    visitor.visit(Sequence(mult, Optional(Sequence("+", add))))
    # sorted() accepts any iterable, so the former list(...) wrapper was redundant.
    # NOTE(review): this assert relies on Concept instances comparing/ordering
    # against their name strings — confirm Concept defines __eq__/__lt__ that way.
    assert sorted(visitor.concepts) == ["add", "mult"]
def test_i_can_initialize_rule_names():
    """initialize() returns grammars whose rule_name reflects the referenced concept."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    lexer = ConceptLexerParser()
    grammars = lexer.initialize(ctx, {foo: Sequence("one", "two"), bar: foo}).body
    # a directly-defined concept has no rule name; a reference carries its target's name
    assert grammars[foo].rule_name == ""
    assert grammars[bar].rule_name == "foo"
#
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
# context = get_context()
# add = Concept(name="add")
# mult = Concept(name="mult")
# atom = Concept(name="atom")
#
# concepts = {
# add: Sequence(mult, Optional(Sequence(OrderedChoice('+', '-', rule_name="sign"), add))),
# mult: Sequence(atom, Optional(Sequence(OrderedChoice('*', '/'), mult))),
# atom: OrderedChoice(OrderedChoice('1', '2', '3'), Sequence('(', add, ')')),
# }
#
# parser = ConceptLexerParser()
# parser.register(concepts)
#
# # res = parser.parse(context, "1")
# # assert len(res) == 3 # add, mult, atom
# #
# # res = parser.parse(context, "1 * 2")
# # assert len(res) == 2 # add and mult
# #
# # res = parser.parse(context, "1 + 2")
# # assert res.status
# # assert res.value.value == [ConceptNode(add, 0, 4, source="1 + 2")]
#
# res = parser.parse(context, "1 * 2 + 3")
# assert res.status
# assert res.value.value == [ConceptNode(add, 0, 4, source="1 * 2 + 3")]
def test_i_can_register_concepts_with_the_same_name():
    """TODO: concepts are registered by name — decide what happens when two share one."""
def test_i_can_parse_very_very_long_input():
    """TODO: the current implementation loads every token in memory — stream instead."""
def get_context():
    """Build a fresh in-memory Sheerka execution context for the tests."""
    engine = Sheerka(skip_builtins_in_db=True)
    engine.initialize("mem://")
    return ExecutionContext("sheerka", "xxxx", engine)