54e5681c5a
Fixed #110 : SheerkaDebugManager: add list_debug_settings Fixed #111 : SheerkaDebugManager: Implement ListDebugLogger Fixed #112 : SyaNodeParser: rewrite this parser Fixed #113 : Sheerka: Add enable_parser_caching to disable parser caching Fixed #114 : SyaNodeParser : Implement fast cache to resolve unrecognized tokens requests Fixed #115 : BnfNodeParser : Implement fast cache to resolve unrecognized tokens requests Fixed #116 : SequenceNodeParser : Implement fast cache to resolve unrecognized tokens requests Fixed #117 : ResolveMultiplePluralAmbiguityEvaluator: Resolve multiple plural ambiguity
250 lines
10 KiB
Python
250 lines
10 KiB
Python
import pytest
|
|
|
|
from core.sheerka.services.SheerkaExecute import ParserInput
|
|
from core.tokenizer import Keywords, Tokenizer, TokenKind
|
|
from parsers.BaseCustomGrammarParser import BaseCustomGrammarParser, SyntaxErrorNode, KeywordNotFound
|
|
from parsers.BaseParser import UnexpectedEofParsingError, UnexpectedTokenParsingError
|
|
from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
|
|
|
|
|
|
def get_tokens(items):
|
|
return [list(Tokenizer(item, yield_eof=False))[0] for item in items]
|
|
|
|
|
|
class TestBaseCustomGrammarParser(TestUsingMemoryBasedSheerka):
|
|
|
|
@staticmethod
|
|
def compare_results(actual, expected, compare_str=False):
|
|
resolved_expected = {}
|
|
for k, v in expected.items():
|
|
if isinstance(v, str):
|
|
# case like {Keywords.DEF_VAR: "def_var var1 def_var var2"}
|
|
tokens = list(Tokenizer(v, yield_eof=False))
|
|
resolved_expected[k] = [tokens[0]] + tokens[2:]
|
|
else:
|
|
# case like {Keywords.DEF_VAR: get_tokens(["def_var", "var1", "var2"])}
|
|
resolved_expected[k] = v
|
|
|
|
def get_better_representation(value):
|
|
better_repr = {}
|
|
for k, tokens in value.items():
|
|
value = "".join([t.str_value if compare_str else t.repr_value for t in tokens[1:]])
|
|
better_repr[k] = [tokens[0].repr_value, value]
|
|
return better_repr
|
|
|
|
# it's easier to compare two list of string
|
|
actual_to_compare = get_better_representation(actual)
|
|
expected_to_compare = get_better_representation(resolved_expected)
|
|
|
|
assert actual_to_compare == expected_to_compare
|
|
|
|
def init_parser(self, text):
|
|
sheerka, context = self.init_concepts()
|
|
|
|
parser = BaseCustomGrammarParser("TestBaseCustomLanguageParser", 0)
|
|
|
|
parser.reset_parser(context, ParserInput(text))
|
|
parser.parser_input.next_token(False) # do not skip starting whitespaces
|
|
|
|
return sheerka, context, parser
|
|
|
|
@pytest.mark.parametrize("text, strip_tokens, expected", [
|
|
("when xxx yyy", False, {Keywords.WHEN: "when xxx yyy"}),
|
|
("when uuu vvv print xxx yyy", False, {Keywords.WHEN: "when uuu vvv ", Keywords.PRINT: "print xxx yyy"}),
|
|
("print xxx yyy when uuu vvv", False, {Keywords.WHEN: "when uuu vvv", Keywords.PRINT: "print xxx yyy "}),
|
|
(" when xxx", False, {Keywords.WHEN: "when xxx"}),
|
|
|
|
("when xxx yyy", True, {Keywords.WHEN: "when xxx yyy"}),
|
|
("when uuu vvv print xxx yyy", True, {Keywords.WHEN: "when uuu vvv", Keywords.PRINT: "print xxx yyy"}),
|
|
("print xxx yyy when uuu vvv", True, {Keywords.WHEN: "when uuu vvv", Keywords.PRINT: "print xxx yyy"}),
|
|
(" when xxx", True, {Keywords.WHEN: "when xxx"}),
|
|
])
|
|
def test_i_can_get_parts(self, text, strip_tokens, expected):
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
res = parser.get_parts(["when", "print"], strip_tokens=strip_tokens)
|
|
self.compare_results(res, expected)
|
|
|
|
def test_i_can_get_parts_when_multilines(self):
|
|
text = """when
|
|
def func(x):
|
|
\treturn x+1
|
|
func(a)
|
|
"""
|
|
expected = {Keywords.WHEN: "when def func(x):\n\treturn x+1\nfunc(a)\n"}
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
res = parser.get_parts(["when"])
|
|
self.compare_results(res, expected)
|
|
|
|
@pytest.mark.parametrize("text, allow_multiple, expected", [
|
|
("def_var var1 def_var var2", {}, {Keywords.DEF_VAR: "def_var var1 def_var var2"}),
|
|
("def_var var1 def_var var2", {"def_var"}, {Keywords.DEF_VAR: get_tokens(["def_var", "var1", "var2"])}),
|
|
("def_var x y z def_var var2", {"def_var"}, {Keywords.DEF_VAR: get_tokens(["def_var", "'x y z'", "var2"])}),
|
|
("def_var 'x y z' def_var var2", {"def_var"}, {Keywords.DEF_VAR: get_tokens(["def_var", "'x y z'", "var2"])}),
|
|
("def_var var1 def_var x y z def_var var2", {"def_var"},
|
|
{Keywords.DEF_VAR: get_tokens(["def_var", "var1", "'x y z'", "var2"])}),
|
|
])
|
|
def test_i_can_get_parts_when_allow_multiple_is_set(self, text, allow_multiple, expected):
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
res = parser.get_parts(["def_var"], allow_multiple=allow_multiple)
|
|
self.compare_results(res, expected)
|
|
|
|
@pytest.mark.parametrize("text", [
|
|
"",
|
|
"no keyword",
|
|
"anything before when xxx print yyy",
|
|
])
|
|
def test_i_cannot_get_parts_when_no_keyword_found(self, text):
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
assert parser.get_parts(["when", "print"]) is None
|
|
assert len(parser.error_sink) == 1
|
|
assert isinstance(parser.error_sink[0], KeywordNotFound)
|
|
assert parser.error_sink[0].keywords == ['when', 'print']
|
|
|
|
def test_i_cannot_get_part_when_the_first_expected_token_is_incorrect(self):
|
|
sheerka, context, parser = self.init_parser("when xxx print yyy")
|
|
|
|
assert parser.get_parts(["when", "print"], Keywords.PRINT) is None
|
|
assert parser.error_sink == [UnexpectedTokenParsingError(f"'print' keyword not found.",
|
|
"when",
|
|
[Keywords.PRINT])]
|
|
|
|
@pytest.mark.parametrize("text", [
|
|
"print",
|
|
"print ",
|
|
"when xxx print",
|
|
"when xxx print ",
|
|
])
|
|
def test_i_can_detect_incorrect_end_of_file_after_keyword(self, text):
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
assert parser.get_parts(["print", "when"]) is not None
|
|
assert len(parser.error_sink) == 1
|
|
assert isinstance(parser.error_sink[0], UnexpectedEofParsingError)
|
|
assert parser.error_sink[0].message == "while parsing keyword 'print'"
|
|
|
|
def test_i_can_double_quoted_strings_are_expanded(self):
|
|
"""
|
|
When inside a double quote, the double quote is removed and its content it used as is.
|
|
It allows usage of keywords withing parts
|
|
:return:
|
|
"""
|
|
sheerka, context, parser = self.init_parser('print "when can be used" when True')
|
|
expected = {Keywords.PRINT: "print when can be used", Keywords.WHEN: "when True"}
|
|
|
|
res = parser.get_parts(["print", "when"])
|
|
self.compare_results(res, expected)
|
|
|
|
def test_single_quoted_strings_are_not_expanded(self):
|
|
sheerka, context, parser = self.init_parser("print 'when can be used' when True")
|
|
expected = {Keywords.PRINT: "print 'when can be used' ", Keywords.WHEN: "when True"}
|
|
|
|
res = parser.get_parts(["print", "when"])
|
|
self.compare_results(res, expected)
|
|
|
|
def test_i_can_manage_colon(self):
|
|
text = """when:
|
|
xxx
|
|
when
|
|
print
|
|
print:
|
|
xxx:
|
|
when
|
|
print
|
|
yyy
|
|
"""
|
|
sheerka, context, parser = self.init_parser(text)
|
|
expected = {Keywords.PRINT: "print xxx:\n when\n print\nyyy", Keywords.WHEN: "when xxx\nwhen\nprint"}
|
|
|
|
res = parser.get_parts(["print", "when"])
|
|
self.compare_results(res, expected, compare_str=True)
|
|
|
|
def test_indentation_is_normalized_when_using_colon(self):
|
|
text = """print:
|
|
xxx:
|
|
when
|
|
print
|
|
yyy
|
|
"""
|
|
sheerka, context, parser = self.init_parser(text)
|
|
expected = {Keywords.PRINT: "print xxx:\n when\n print\nyyy"}
|
|
|
|
res = parser.get_parts(["print", "when"])
|
|
self.compare_results(res, expected, compare_str=True)
|
|
|
|
def test_i_can_mix_parts_with_colon_and_parts_without_colon(self):
|
|
text = """when:
|
|
xxx
|
|
when
|
|
print
|
|
print xxx"""
|
|
sheerka, context, parser = self.init_parser(text)
|
|
expected = {Keywords.PRINT: "print xxx", Keywords.WHEN: "when xxx\nwhen\nprint"}
|
|
|
|
res = parser.get_parts(["print", "when"])
|
|
self.compare_results(res, expected, compare_str=True)
|
|
|
|
@pytest.mark.parametrize("text", [
|
|
"when:\nx x",
|
|
"when: \nx x",
|
|
])
|
|
def test_i_cannot_manage_colon_when_tab_is_missing(self, text):
|
|
sheerka, context, parser = self.init_parser(text)
|
|
|
|
assert parser.get_parts(["when"])
|
|
assert parser.error_sink == [UnexpectedTokenParsingError("Indentation not found.", "x", [TokenKind.WHITESPACE])]
|
|
|
|
@pytest.mark.parametrize("text", [
|
|
"",
|
|
"\n",
|
|
" \n",
|
|
"x", # less than two characters
|
|
"\n\t"
|
|
])
|
|
def test_i_cannot_get_body_when_body_is_too_short(self, text):
|
|
sheerka, context, parser = self.init_parser("")
|
|
|
|
assert parser.get_body(list(Tokenizer(text, yield_eof=False))) is None
|
|
assert parser.error_sink == [SyntaxErrorNode(None, "Body is empty or too short.")]
|
|
|
|
def test_a_new_line_is_expected_when_get_body(self):
|
|
sheerka, context, parser = self.init_parser("")
|
|
|
|
assert parser.get_body(list(Tokenizer("not a newline", yield_eof=False))) is None
|
|
assert parser.error_sink == [UnexpectedTokenParsingError("New line not found.", "not", [TokenKind.NEWLINE])]
|
|
|
|
@pytest.mark.parametrize("text", [
|
|
"\nx x",
|
|
" \nx x",
|
|
])
|
|
def test_tab_is_mandatory_after_new_line_when_get_body(self, text):
|
|
sheerka, context, parser = self.init_parser("")
|
|
|
|
assert parser.get_body(list(Tokenizer(text, yield_eof=False))) is None
|
|
assert parser.error_sink == [UnexpectedTokenParsingError("Indentation not found.", "x", [TokenKind.WHITESPACE])]
|
|
|
|
def test_i_can_detect_missing_tab_when_get_body(self):
|
|
text = "\n\txxx\n\tyyy\nzzz"
|
|
|
|
sheerka, context, parser = self.init_parser("")
|
|
assert parser.get_body(list(Tokenizer(text, yield_eof=False))) is None
|
|
assert parser.error_sink == [
|
|
UnexpectedTokenParsingError("Indentation not found.", "zzz", [TokenKind.WHITESPACE])]
|
|
|
|
def test_i_can_detect_invalid_indentation_when_get_body(self):
|
|
sheerka, context, parser = self.init_parser("")
|
|
assert parser.get_body(list(Tokenizer("\n\t\txxx\n\tyyy", yield_eof=False))) is None
|
|
assert parser.error_sink == [SyntaxErrorNode(None, "Invalid indentation.")]
|
|
|
|
def test_i_can_get_body(self):
|
|
sheerka, context, parser = self.init_parser("")
|
|
res = parser.get_body(list(Tokenizer("\n\txxx\n\tyyyy", yield_eof=False)))
|
|
expected = list(Tokenizer("xxx\n yyyy", yield_eof=False))
|
|
expected[2].value = ""
|
|
|
|
assert [t.repr_value for t in res] == [t.repr_value for t in expected]
|
|
assert parser.error_sink == []
|