From 9b965105e9233268846644edbe6e74bf9a50d822 Mon Sep 17 00:00:00 2001
From: Kodjo Sossouvi
Date: Tue, 22 Sep 2020 17:39:42 +0200
Subject: [PATCH] Hardened DefaultParser

---
 src/core/sheerka/services/SheerkaExecute.py |  13 +++
 src/core/tokenizer.py                       |   4 +-
 src/parsers/DefaultParser.py                | 110 +++++++++++++++---
 tests/core/test_ParserInput.py              |  16 +++
 tests/parsers/test_DefaultParser.py         | 117 ++++++++++++++++----
 5 files changed, 220 insertions(+), 40 deletions(-)

diff --git a/src/core/sheerka/services/SheerkaExecute.py b/src/core/sheerka/services/SheerkaExecute.py
index 245d834..f1b28db 100644
--- a/src/core/sheerka/services/SheerkaExecute.py
+++ b/src/core/sheerka/services/SheerkaExecute.py
@@ -88,6 +88,20 @@ class ParserInput:
 
         return self.pos < self.end
 
+    def the_token_after(self, skip_whitespace=True):
+        """ Return the token just after the current one, optionally skipping whitespace and newlines """
+        my_pos = self.pos + 1
+        if my_pos >= self.end:
+            return Token(TokenKind.EOF, "", -1, -1, -1)
+
+        if skip_whitespace:
+            while self.tokens[my_pos].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
+                my_pos += 1
+                if my_pos == self.end:
+                    return Token(TokenKind.EOF, "", -1, -1, -1)
+
+        return self.tokens[my_pos]
+
     def seek(self, pos):
         """
         Move the token offset to position pos
diff --git a/src/core/tokenizer.py b/src/core/tokenizer.py
index bb467e4..e8d7b36 100644
--- a/src/core/tokenizer.py
+++ b/src/core/tokenizer.py
@@ -68,9 +68,9 @@ class Token:
         if self.type == TokenKind.IDENTIFIER:
             value = str(self.value)
         elif self.type == TokenKind.WHITESPACE:
-            value = ""
+            value = r"\t" if self.value[0] == "\t" else ""
         elif self.type == TokenKind.NEWLINE:
-            value = r"\n"
+            value = ""
         elif self.type == TokenKind.EOF:
             value = ""
         else:
diff --git a/src/parsers/DefaultParser.py b/src/parsers/DefaultParser.py
index 8feb83f..798c70b 100644
--- a/src/parsers/DefaultParser.py
+++ b/src/parsers/DefaultParser.py
@@ -10,6 +10,11 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
 from parsers.BnfParser import BnfParser
 
 
+class ParsingException(Exception):
+    def __init__(self, error):
+        self.error = error
+
+
 @dataclass()
 class DefaultParserNode(Node):
     """
@@ -125,24 +130,35 @@ class DefaultParser(BaseParser):
 
         :param tokens:
         :return:
         """
+        if len(tokens) == 0:
+            return tokens
+
+        tokens = tokens.copy()  # do not modify ParserInput.tokens
+
         if tokens[0].type != TokenKind.COLON:
             return tokens
 
         if len(tokens) < 3:
-            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
+            raise ParsingException(UnexpectedTokenErrorNode(
+                tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE]))
+
+        pos = DefaultParser.eat_white_space(tokens, 1)
+        if tokens[pos].type != TokenKind.NEWLINE:
+            raise ParsingException(UnexpectedTokenErrorNode(
+                [tokens[pos]], "Unexpected token after colon", [TokenKind.NEWLINE]))
+        pos += 1
 
-        if tokens[1].type != TokenKind.NEWLINE:
-            return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
-
-        if tokens[2].type != TokenKind.WHITESPACE:
-            return SyntaxErrorNode([tokens[2]], "Indentation not found.")
-        indent_size = len(tokens[2].value)
+        if tokens[pos].type != TokenKind.WHITESPACE:
+            raise ParsingException(SyntaxErrorNode(
+                [tokens[pos]], "Indentation not found."))
+        indent_size = len(tokens[pos].value)
+        pos += 1
 
         # now fix the other indentations
         # KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
         # as I now have an editor in interactive mode
-        i = 3
+        i = pos
         while i < len(tokens) - 1:
             if tokens[i].type == TokenKind.NEWLINE:
                 if tokens[i + 1].type != TokenKind.WHITESPACE:
@@ -155,7 +171,17 @@ class DefaultParser(BaseParser):
                     tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
             i += 1
 
-        return tokens[3:]
+        return tokens[pos:]
+
+    @staticmethod
+    def eat_white_space(tokens, index):
+        if index >= len(tokens):
+            return index
+
+        while index < len(tokens) and tokens[index].type == TokenKind.WHITESPACE:
+            index += 1
+
+        return index
 
     def reset_parser(self, context, parser_input):
         self.context = context
@@ -252,6 +278,22 @@ class DefaultParser(BaseParser):
 
     def regroup_tokens_by_parts(self, keywords_tokens):
 
+        def new_part(t, cma, p):
+            """
+            Return True when token t starts a new keyword part.
+            :param t: token
+            :param cma: colon_mode_activated
+            :param p: previous token
+            :return:
+            """
+            if t.value not in def_concept_parts:
+                return False
+
+            if not cma or not p:
+                return True
+
+            return p.line != t.line
+
         def_concept_parts = [Keywords.CONCEPT.value,
                              Keywords.FROM.value,
                              Keywords.AS.value,
@@ -273,10 +315,34 @@ class DefaultParser(BaseParser):
         current_part = Keywords.CONCEPT
         token = self.parser_input.token
         first_token = token
+        colon_mode_activated = False  # when activated, a keyword followed by a colon starts a new keyword definition
+        previous_token = None
+
+        # more explanation on colon_mode_activated
+        # You can use the pattern
+        #   def concept as:
+        #       xxx
+        #       yyy
+        #       ...
+        #
+        # It improves readability and allows other keywords to be used inside the block.
+        # Example:
+        #   def concept give me the date as:
+        #       from datetime import date
+        #       return date.today()
+        #
+        # 'from datetime' will not be considered a keyword because it is preceded by a tab,
+        # whereas in
+        #   def concept in x days as:
+        #       from datetime import date
+        #       return date.today() - x
+        #       where x > 0
+        #
+        # 'where' will be recognized as a keyword because it is the first word of the line.
 
         # loop thru the tokens, and put them in the correct tokens_found_by_parts entry
         while token.type != TokenKind.EOF:
-            if token.value in def_concept_parts:
+            if new_part(token, colon_mode_activated, previous_token):
                 keywords_tokens.append(token)  # keep track of the keywords
                 keyword = Keywords(token.value)
                 if tokens_found_by_parts[keyword]:
@@ -286,11 +352,14 @@ class DefaultParser(BaseParser):
                 else:
                     tokens_found_by_parts[keyword] = [token]
                 current_part = keyword
+                colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
+
                 self.parser_input.next_token()
             else:
                 tokens_found_by_parts[current_part].append(token)
                 self.parser_input.next_token(False)
 
+            previous_token = token
             token = self.parser_input.token
 
         return first_token, tokens_found_by_parts
@@ -335,7 +404,12 @@ class DefaultParser(BaseParser):
         return self.get_concept_simple_definition(definition_tokens)
 
     def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
-        tokens = core.utils.strip_tokens(definition_tokens[2:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
         if len(tokens) == 0:
             self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
             return None, NotInitializedNode()
@@ -358,7 +432,12 @@ class DefaultParser(BaseParser):
 
     def get_concept_simple_definition(self, definition_tokens):
         start = 2 if definition_tokens[1].value == Keywords.DEF.value else 1
-        tokens = core.utils.strip_tokens(definition_tokens[start:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[start:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
         if len(tokens) == 0:
             self.add_error(SyntaxErrorNode([definition_tokens[start]], "Empty declaration"), False)
             return None, NotInitializedNode()
 
@@ -386,9 +465,10 @@ class DefaultParser(BaseParser):
                 self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
                 continue
 
-            tokens = self.fix_indentation(tokens[1:])  # manage multi-lines declarations
-            if isinstance(tokens, ErrorNode):
-                self.add_error(tokens)
+            try:
+                tokens = self.fix_indentation(tokens[1:])  # manage multi-line declarations
+            except ParsingException as ex:
+                self.add_error(ex.error)
                 continue
 
             # ask the other parsers if they recognize the tokens
diff --git a/tests/core/test_ParserInput.py b/tests/core/test_ParserInput.py
index 81cdd5d..a33b6da 100644
--- a/tests/core/test_ParserInput.py
+++ b/tests/core/test_ParserInput.py
@@ -77,3 +77,19 @@ def test_i_can_parse_twice():
     while p2.next_token():
         p1.next_token()
     assert p1.token == p2.token
+
+
+@pytest.mark.parametrize("text, skip_whitespace, expected", [
+    ("first second", True, "second"),
+    ("first second", False, ""),
+    ("first", True, ""),
+    ("first", False, ""),
+    ("first ", True, ""),
+    ("first ", False, ""),
+    ("first:", True, ":"),
+    ("first:", False, ":"),
+])
+def test_i_can_get_the_token_after(text, skip_whitespace, expected):
+    parser_input = ParserInput(text).reset()
+    parser_input.next_token()
+    assert parser_input.the_token_after(skip_whitespace).repr_value == expected
diff --git a/tests/parsers/test_DefaultParser.py b/tests/parsers/test_DefaultParser.py
index 0ac7163..4b0730f 100644
--- a/tests/parsers/test_DefaultParser.py
+++ b/tests/parsers/test_DefaultParser.py
@@ -6,8 +6,8 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
 from core.concept import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV
 from core.sheerka.services.SheerkaExecute import ParserInput
 from core.tokenizer import Keywords, Tokenizer, LexerError
-from parsers.BaseNodeParser import SCN, SCWC
-from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch
+from parsers.BaseNodeParser import SCWC
+from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence
 from parsers.BnfParser import BnfParser
 from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode
 from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
@@ -15,7 +15,7 @@ from parsers.FunctionParser import FunctionParser
 from parsers.PythonParser import PythonParser, PythonNode
 
 from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
-from tests.parsers.parsers_utils import get_node, compute_expected_array
+from tests.parsers.parsers_utils import compute_expected_array
 
 
 def get_def_concept(name, where=None, pre=None, post=None, body=None, definition=None, bnf_def=None, ret=None):
@@ -164,7 +164,7 @@ ret a if isinstance(a, Concept) else self
         assert isinstance(return_value, ParserResultConcept)
         assert return_value.value == expected_concept
 
-    def test_i_can_have_mutilines_declarations(self):
+    def test_i_can_parse_multiline_declarations(self):
         text = """
 def concept add one to a as
     def func(x):
@@ -207,14 +207,16 @@ def concept add one to a as:
         assert isinstance(return_value, ParserResultConcept)
         assert return_value.value == expected_concept
 
-    def test_indentation_is_mandatory_after_a_colon(self):
-        text = """
-def concept add one to a as:
-def func(x):
-    return x+1
-func(a)
-        """
-
+    @pytest.mark.parametrize("text", [
+        "def concept foo as:\npass",
+        "def concept foo where:\npass",
+        "def concept foo pre:\npass",
+        "def concept foo post:\npass",
+        "def concept foo from:\nanother definition",
+        "def concept foo from def:\nanother definition",
+        "def concept foo from bnf:\n'another' 'definition'",
+    ])
+    def test_indentation_is_mandatory_after_a_colon(self, text):
         sheerka, context, parser = self.init_parser()
         res = parser.parse(context, ParserInput(text))
         return_value = res.value
@@ -224,19 +226,76 @@ func(a)
         assert isinstance(return_value.body[0], SyntaxErrorNode)
         assert return_value.body[0].message == "Indentation not found."
 
-    def test_indentation_is_not_allowed_if_the_colon_is_missing(self):
-        text = """
-def concept add one to a as
-    def func(x):
-        return x+1
-    func(a)
-        """
+    @pytest.mark.parametrize("text", [
+        "def concept plus from:\n\ta plus b",
+        "def concept plus from def:\n\ta plus b",
+
+        # space before the colon
+        "def concept plus from :\n\ta plus b",
+        "def concept plus from def :\n\ta plus b",
+
+        # space after the colon
+        "def concept plus from: \n\ta plus b",
+        "def concept plus from def: \n\ta plus b",
+    ])
+    def test_i_can_use_colon_and_definition_together(self, text):
         sheerka, context, parser = self.init_parser()
         res = parser.parse(context, ParserInput(text))
-        return_value = res.value
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
 
-        assert not res.status
-        assert context.sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("a plus b", yield_eof=False)]
+
+    @pytest.mark.parametrize("text", [
+        "def concept plus from bnf:\n\t'a' 'plus' 'b'",
+        "def concept plus from bnf :\n\t'a' 'plus' 'b'",
+        "def concept plus from bnf: \n\t'a' 'plus' 'b'",
+    ])
+    def test_i_can_use_colon_and_bnf_definition_together(self, text):
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+
+        assert res.status
+        assert defined_concept.definition.status
+        assert defined_concept.definition.body.body == Sequence(StrMatch("a"), StrMatch("plus"), StrMatch("b"))
+
+    def test_i_can_use_colon_to_protect_keyword(self):
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from:
+    give me the date !
+"""
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
+
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
+        assert defined_concept.body.status
+
+    def test_i_can_use_colon_to_protect_keyword_2(self):
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from give me the date !
+""" + sheerka, context, parser = self.init_parser() + res = parser.parse(context, ParserInput(text)) + defined_concept = res.body.body + defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens] + + assert res.status + assert defined_concept.definition_type == DEFINITION_TYPE_DEF + assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)] + assert defined_concept.body.status def test_name_is_mandatory(self): text = "def concept as 'hello'" @@ -277,7 +336,19 @@ def concept add one to a as assert not res.status assert sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS) - def test_new_line_is_not_allowed_in_the_name(self): + @pytest.mark.parametrize("text", [ + "def concept hello\nmy friend", + "def concept hello \nmy friend", + "def concept hello\n my friend", + "def concept hello \n my friend", + "def concept hello from hello\nmy friend", + "def concept hello from def hello\nmy friend", + "def concept hello from bnf hello\nmy friend", + "def concept hello from:\n\thello\nmy friend", + "def concept hello from def:\n\thello\nmy friend", + "def concept hello from bnf:\n\thello\nmy friend", + ]) + def test_new_line_is_not_allowed_in_the_name(self, text): text = "def concept hello \n my friend as 'hello'" sheerka, context, parser = self.init_parser()