Hardened DefaultParser

2020-09-22 17:39:42 +02:00
parent 310c9ae839
commit 9b965105e9
5 changed files with 220 additions and 40 deletions
@@ -88,6 +88,19 @@ class ParserInput:

        return self.pos < self.end

+    def the_token_after(self, skip_whitespace=True):
+        """
+        Peek at the token that follows the current one, without changing self.pos
+
+        :param skip_whitespace: when True, WHITESPACE and NEWLINE tokens are stepped over
+        :return: the token found, or a synthetic EOF token when self.end is reached first
+        """
+        ignored = (TokenKind.WHITESPACE, TokenKind.NEWLINE) if skip_whitespace else ()
+        lookahead = self.pos + 1
+        while lookahead < self.end:
+            candidate = self.tokens[lookahead]
+            if candidate.type not in ignored:
+                return candidate
+            lookahead += 1
+
+        # nothing relevant is left before self.end
+        return Token(TokenKind.EOF, "", -1, -1, -1)
+
    def seek(self, pos):
        """
        Move the token offset to position pos
@@ -68,9 +68,9 @@ class Token:
        if self.type == TokenKind.IDENTIFIER:
            value = str(self.value)
        elif self.type == TokenKind.WHITESPACE:
-            value = "<ws>"
+            value = "<tab>" if self.value[0] == "\t" else "<ws>"
        elif self.type == TokenKind.NEWLINE:
-            value = r"\n"
+            value = "<nl>"
        elif self.type == TokenKind.EOF:
            value = "<EOF>"
        else:
@@ -10,6 +10,11 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
 from parsers.BnfParser import BnfParser


+class ParsingException(Exception):
+    """
+    Exception that carries a parsing error object in its `error` attribute
+    """
+
+    def __init__(self, error):
+        super().__init__(error)
+        self.error = error
+
+
@dataclass()
 class DefaultParserNode(Node):
    """
@@ -125,24 +130,35 @@ class DefaultParser(BaseParser):
        :param tokens:
        :return:
        """
+        if len(tokens) == 0:
+            return tokens
+
        tokens = tokens.copy()  # do not modify ParserInput.tokens
+
        if tokens[0].type != TokenKind.COLON:
            return tokens

        if len(tokens) < 3:
-            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
+            raise ParsingException(UnexpectedTokenErrorNode(tokens[0:2],
+                                                            "Unexpected end of file",
+                                                            [TokenKind.NEWLINE]))
+        pos = DefaultParser.eat_white_space(tokens, 1)
+        if tokens[pos].type != TokenKind.NEWLINE:
+            raise ParsingException(UnexpectedTokenErrorNode([tokens[pos]],
+                                                            "Unexpected token after colon",
+                                                            [TokenKind.NEWLINE]))
+        pos += 1

-        if tokens[1].type != TokenKind.NEWLINE:
-            return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
-
-        if tokens[2].type != TokenKind.WHITESPACE:
-            return SyntaxErrorNode([tokens[2]], "Indentation not found.")
-        indent_size = len(tokens[2].value)
+        if tokens[pos].type != TokenKind.WHITESPACE:
+            raise ParsingException(SyntaxErrorNode([tokens[pos]],
+                                                   "Indentation not found."))
+        indent_size = len(tokens[pos].value)
+        pos += 1

        # now fix the other indentations
        # KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
        #   as I now have an editor in interactive mode
-        i = 3
+        i = pos
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
@@ -155,7 +171,17 @@ class DefaultParser(BaseParser):
                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
            i += 1

-        return tokens[3:]
+        return tokens[pos:]
+
+    @staticmethod
+    def eat_white_space(tokens, index):
+        """
+        Skip the consecutive WHITESPACE tokens found from position index
+
+        :param tokens: list of tokens
+        :param index: position to start from
+        :return: position of the first token that is not a WHITESPACE,
+            len(tokens) when only WHITESPACE tokens remain,
+            or index unchanged when it is already past the end of the list
+        """
+        size = len(tokens)
+        for cursor in range(index, size):
+            if tokens[cursor].type != TokenKind.WHITESPACE:
+                return cursor
+
+        return max(index, size)

    def reset_parser(self, context, parser_input):
        self.context = context
@@ -252,6 +278,22 @@ class DefaultParser(BaseParser):

    def regroup_tokens_by_parts(self, keywords_tokens):

+        def new_part(t, cma, p):
+            """
+            Tell whether the token t starts a new part of the concept definition
+
+            :param t: token
+            :param cma: colon_mode_activated
+            :param p: previous token
+            :return: True when t opens a new part, False otherwise
+            """
+            if t.value not in def_concept_parts:
+                return False
+
+            # outside colon mode (or when there is no previous token), every keyword opens a new part
+            if not cma or not p:
+                return True
+
+            # in colon mode, the keyword only counts when it is not on the same line as the previous token
+            return p.line != t.line
+
        def_concept_parts = [Keywords.CONCEPT.value,
                             Keywords.FROM.value,
                             Keywords.AS.value,
@@ -273,10 +315,34 @@ class DefaultParser(BaseParser):
        current_part = Keywords.CONCEPT
        token = self.parser_input.token
        first_token = token
+        colon_mode_activated = False  # if activate, use keyword + colon to start a new keyword definition
+        previous_token = None
+
+        # More explanation on colon_mode_activated.
+        # You can use the pattern
+        # def concept <name> as:
+        # <tab> xxx
+        # <tab> yyy
+        # ...
+        #
+        # It improves readability and allows other keywords to be used inside the block.
+        # Example
+        # def concept give me the date as:
+        #   from datetime import date
+        #   return date.today()
+        #
+        # 'from datetime' will not be considered as a keyword because it is led by a tab
+        # whereas in
+        # def concept in x days as:
+        #   from datetime import date
+        #   return date.today() - x
+        # where x > 0
+        #
+        # 'where' will be recognized as the keyword because it is the first word of the line

        # loop thru the tokens, and put them in the correct tokens_found_by_parts entry
        while token.type != TokenKind.EOF:
-            if token.value in def_concept_parts:
+            if new_part(token, colon_mode_activated, previous_token):
                keywords_tokens.append(token)  # keep track of the keywords
                keyword = Keywords(token.value)
                if tokens_found_by_parts[keyword]:
@@ -286,11 +352,14 @@ class DefaultParser(BaseParser):
                else:
                    tokens_found_by_parts[keyword] = [token]
                current_part = keyword
+                colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
+
                self.parser_input.next_token()
            else:
                tokens_found_by_parts[current_part].append(token)
                self.parser_input.next_token(False)

+            previous_token = token
            token = self.parser_input.token

        return first_token, tokens_found_by_parts
@@ -335,7 +404,12 @@ class DefaultParser(BaseParser):
        return self.get_concept_simple_definition(definition_tokens)

    def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
-        tokens = core.utils.strip_tokens(definition_tokens[2:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
            return None, NotInitializedNode()
@@ -358,7 +432,12 @@ class DefaultParser(BaseParser):

    def get_concept_simple_definition(self, definition_tokens):
        start = 2 if definition_tokens[1].value == Keywords.DEF.value else 1
-        tokens = core.utils.strip_tokens(definition_tokens[start:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[start:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([definition_tokens[start]], "Empty declaration"), False)
            return None, NotInitializedNode()
@@ -386,9 +465,10 @@ class DefaultParser(BaseParser):
                self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
                continue

+            try:
                tokens = self.fix_indentation(tokens[1:])  # manage multi-lines declarations
-            if isinstance(tokens, ErrorNode):
-                self.add_error(tokens)
+            except ParsingException as ex:
+                self.add_error(ex.error)
                continue

            # ask the other parsers if they recognize the tokens
@@ -77,3 +77,19 @@ def test_i_can_parse_twice():
    while p2.next_token():
        p1.next_token()
        assert p1.token == p2.token
+
+
+@pytest.mark.parametrize("text, skip_whitespace, expected", [
+    ("first second", True, "second"),
+    ("first second", False, "<ws>"),
+    ("first", True, "<EOF>"),
+    ("first", False, "<EOF>"),
+    ("first ", True, "<EOF>"),
+    ("first ", False, "<ws>"),
+    ("first:", True, ":"),
+    ("first:", False, ":"),
+])
+def test_i_can_get_the_token_after(text, skip_whitespace, expected):
+    """
+    After one call to next_token(), the_token_after() must report the expected repr_value
+    """
+    parser_input = ParserInput(text).reset()
+    parser_input.next_token()
+
+    upcoming = parser_input.the_token_after(skip_whitespace)
+    assert upcoming.repr_value == expected
@@ -6,8 +6,8 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
 from core.concept import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV
 from core.sheerka.services.SheerkaExecute import ParserInput
 from core.tokenizer import Keywords, Tokenizer, LexerError
-from parsers.BaseNodeParser import SCN, SCWC
-from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch
+from parsers.BaseNodeParser import SCWC
+from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence
 from parsers.BnfParser import BnfParser
 from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode
 from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
@@ -15,7 +15,7 @@ from parsers.FunctionParser import FunctionParser
 from parsers.PythonParser import PythonParser, PythonNode

 from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
-from tests.parsers.parsers_utils import get_node, compute_expected_array
+from tests.parsers.parsers_utils import compute_expected_array


 def get_def_concept(name, where=None, pre=None, post=None, body=None, definition=None, bnf_def=None, ret=None):
@@ -164,7 +164,7 @@ ret a if isinstance(a, Concept) else self
        assert isinstance(return_value, ParserResultConcept)
        assert return_value.value == expected_concept

-    def test_i_can_have_mutilines_declarations(self):
+    def test_i_can_parse_mutilines_declarations(self):
        text = """
 def concept add one to a as
 def func(x):
@@ -207,14 +207,16 @@ def concept add one to a as:
        assert isinstance(return_value, ParserResultConcept)
        assert return_value.value == expected_concept

-    def test_indentation_is_mandatory_after_a_colon(self):
-        text = """
-def concept add one to a as:
-def func(x):
-    return x+1
-func(a)
-    """
-
+    @pytest.mark.parametrize("text", [
+        "def concept foo as:\npass",
+        "def concept foo where:\npass",
+        "def concept foo pre:\npass",
+        "def concept foo post:\npass",
+        "def concept foo from:\nanother definition",
+        "def concept foo from def:\nanother definition",
+        "def concept foo from bnf:\n'another' 'definition'",
+    ])
+    def test_indentation_is_mandatory_after_a_colon(self, text):
        sheerka, context, parser = self.init_parser()
        res = parser.parse(context, ParserInput(text))
        return_value = res.value
@@ -224,19 +226,76 @@ func(a)
        assert isinstance(return_value.body[0], SyntaxErrorNode)
        assert return_value.body[0].message == "Indentation not found."

-    def test_indentation_is_not_allowed_if_the_colon_is_missing(self):
-        text = """
-def concept add one to a as
-    def func(x):
-        return x+1
-    func(a)
-        """
+    @pytest.mark.parametrize("text", [
+        "def concept plus from:\n\ta plus b",
+        "def concept plus from def:\n\ta plus b",
+
+        # space before the colon
+        "def concept plus from :\n\ta plus b",
+        "def concept plus from def :\n\ta plus b",
+
+        # space after the colon
+        "def concept plus from: \n\ta plus b",
+        "def concept plus from def: \n\ta plus b",
+    ])
+    def test_i_can_use_colon_and_definition_together(self, text):
        sheerka, context, parser = self.init_parser()
        res = parser.parse(context, ParserInput(text))
-        return_value = res.value
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]

-        assert not res.status
-        assert context.sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("a plus b", yield_eof=False)]
+
+    @pytest.mark.parametrize("text", [
+        "def concept plus from bnf:\n\t'a' 'plus' 'b'",
+        "def concept plus from bnf :\n\t'a' 'plus' 'b'",  # space before the colon
+        "def concept plus from bnf: \n\t'a' 'plus' 'b'",  # space after the colon
+    ])
+    def test_i_can_use_colon_and_bnf_definition_together(self, text):
+        """
+        A bnf definition written on an indented line after 'from bnf:' must be parsed
+        as the sequence 'a' 'plus' 'b'
+        """
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+
+        assert res.status
+        assert defined_concept.definition.status
+        assert defined_concept.definition.body.body == Sequence(StrMatch("a"), StrMatch("plus"), StrMatch("b"))
+
+    def test_i_can_use_colon_to_protect_keyword(self):
+        """
+        Inside the indented 'as:' block, 'from datetime ...' must stay part of the body.
+        The 'from:' found at the start of a line must provide the definition.
+        """
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from:
+    give me the date !
+"""
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
+
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        # the definition must only contain the tokens that follow 'from:'
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
+        assert defined_concept.body.status
+
+    def test_i_can_use_colon_to_protect_keyword_2(self):
+        """
+        Same scenario as the 'from:' variant, but the definition is given on a single
+        line ('from give me the date !') without a colon.
+        """
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from give me the date !
+"""
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
+
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        # the definition must only contain the tokens that follow the 'from' keyword
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
+        assert defined_concept.body.status

    def test_name_is_mandatory(self):
        text = "def concept as 'hello'"
@@ -277,7 +336,19 @@ def concept add one to a as
        assert not res.status
        assert sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)

-    def test_new_line_is_not_allowed_in_the_name(self):
+    @pytest.mark.parametrize("text", [
+        "def concept hello\nmy friend",
+        "def concept hello \nmy friend",
+        "def concept hello\n my friend",
+        "def concept hello \n my friend",
+        "def concept hello from hello\nmy friend",
+        "def concept hello from def hello\nmy friend",
+        "def concept hello from bnf hello\nmy friend",
+        "def concept hello from:\n\thello\nmy friend",
+        "def concept hello from def:\n\thello\nmy friend",
+        "def concept hello from bnf:\n\thello\nmy friend",
+    ])
+    def test_new_line_is_not_allowed_in_the_name(self, text):
        text = "def concept hello \n my friend as 'hello'"

        sheerka, context, parser = self.init_parser()