From 9b965105e9233268846644edbe6e74bf9a50d822 Mon Sep 17 00:00:00 2001
From: Kodjo Sossouvi
Date: Tue, 22 Sep 2020 17:39:42 +0200
Subject: [PATCH] Hardened DefaultParser

---
 src/core/sheerka/services/SheerkaExecute.py |  13 +++
 src/core/tokenizer.py                       |   4 +-
 src/parsers/DefaultParser.py                | 110 +++++++++++++++---
 tests/core/test_ParserInput.py              |  16 +++
 tests/parsers/test_DefaultParser.py         | 117 ++++++++++++++++----
 5 files changed, 220 insertions(+), 40 deletions(-)

diff --git a/src/core/sheerka/services/SheerkaExecute.py b/src/core/sheerka/services/SheerkaExecute.py
index 245d834..f1b28db 100644
--- a/src/core/sheerka/services/SheerkaExecute.py
+++ b/src/core/sheerka/services/SheerkaExecute.py
@@ -88,6 +88,20 @@ class ParserInput:
 
         return self.pos < self.end
 
+    def the_token_after(self, skip_whitespace=True):
+        """ Return the token just after the current one, optionally skipping whitespace and newlines """
+        my_pos = self.pos + 1
+        if my_pos >= self.end:
+            return Token(TokenKind.EOF, "", -1, -1, -1)
+
+        if skip_whitespace:
+            while self.tokens[my_pos].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
+                my_pos += 1
+                if my_pos == self.end:
+                    return Token(TokenKind.EOF, "", -1, -1, -1)
+
+        return self.tokens[my_pos]
+
     def seek(self, pos):
         """
         Move the token offset to position pos
diff --git a/src/core/tokenizer.py b/src/core/tokenizer.py
index bb467e4..e8d7b36 100644
--- a/src/core/tokenizer.py
+++ b/src/core/tokenizer.py
@@ -68,9 +68,9 @@ class Token:
         if self.type == TokenKind.IDENTIFIER:
             value = str(self.value)
         elif self.type == TokenKind.WHITESPACE:
-            value = ""
+            value = r"\t" if self.value[0] == "\t" else ""
         elif self.type == TokenKind.NEWLINE:
-            value = r"\n"
+            value = ""
         elif self.type == TokenKind.EOF:
             value = ""
         else:
diff --git a/src/parsers/DefaultParser.py b/src/parsers/DefaultParser.py
index 8feb83f..798c70b 100644
--- a/src/parsers/DefaultParser.py
+++ b/src/parsers/DefaultParser.py
@@ -10,6 +10,11 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
 from parsers.BnfParser import BnfParser
 
 
+class ParsingException(Exception):
+    def __init__(self, error):
+        self.error = error
+
+
 @dataclass()
 class DefaultParserNode(Node):
     """
@@ -125,24 +130,35 @@ class DefaultParser(BaseParser):
 
         :param tokens:
         :return:
         """
+        if len(tokens) == 0:
+            return tokens
+
+        tokens = tokens.copy()  # do not modify ParserInput.tokens
+
         if tokens[0].type != TokenKind.COLON:
             return tokens
 
         if len(tokens) < 3:
-            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
+            raise ParsingException(UnexpectedTokenErrorNode(
+                tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE]))
+
+        pos = DefaultParser.eat_white_space(tokens, 1)
+        if tokens[pos].type != TokenKind.NEWLINE:
+            raise ParsingException(UnexpectedTokenErrorNode(
+                [tokens[pos]], "Unexpected token after colon", [TokenKind.NEWLINE]))
+        pos += 1
 
-        if tokens[1].type != TokenKind.NEWLINE:
-            return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
-
-        if tokens[2].type != TokenKind.WHITESPACE:
-            return SyntaxErrorNode([tokens[2]], "Indentation not found.")
-        indent_size = len(tokens[2].value)
+        if tokens[pos].type != TokenKind.WHITESPACE:
+            raise ParsingException(SyntaxErrorNode(
+                [tokens[pos]], "Indentation not found."))
+        indent_size = len(tokens[pos].value)
+        pos += 1
 
         # now fix the other indentations
         # KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
         # as I now have an editor in interactive mode
-        i = 3
+        i = pos
         while i < len(tokens) - 1:
             if tokens[i].type == TokenKind.NEWLINE:
                 if tokens[i + 1].type != TokenKind.WHITESPACE:
@@ -155,7 +171,17 @@ class DefaultParser(BaseParser):
                     tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
             i += 1
 
-        return tokens[3:]
+        return tokens[pos:]
+
+    @staticmethod
+    def eat_white_space(tokens, index):
+        if index >= len(tokens):
+            return index
+
+        while index < len(tokens) and tokens[index].type == TokenKind.WHITESPACE:
+            index += 1
+
+        return index
 
     def reset_parser(self, context, parser_input):
         self.context = context
@@ -252,6 +278,22 @@ class DefaultParser(BaseParser):
 
     def regroup_tokens_by_parts(self, keywords_tokens):
 
+        def new_part(t, cma, p):
+            """
+            Return True when token t starts a new keyword part.
+            :param t: token
+            :param cma: colon_mode_activated
+            :param p: previous token
+            :return:
+            """
+            if t.value not in def_concept_parts:
+                return False
+
+            if not cma or not p:
+                return True
+
+            return p.line != t.line
+
         def_concept_parts = [Keywords.CONCEPT.value,
                              Keywords.FROM.value,
                              Keywords.AS.value,
@@ -273,10 +315,34 @@ class DefaultParser(BaseParser):
         current_part = Keywords.CONCEPT
         token = self.parser_input.token
         first_token = token
+        colon_mode_activated = False  # when activated, a keyword followed by a colon starts a new keyword definition
+        previous_token = None
+
+        # more explanation on colon_mode_activated
+        # You can use the pattern
+        #   def concept as:
+        #       xxx
+        #       yyy
+        #       ...
+        #
+        # It improves readability and allows other keywords to be used inside the block.
+        # Example:
+        #   def concept give me the date as:
+        #       from datetime import date
+        #       return date.today()
+        #
+        # 'from datetime' will not be considered a keyword because it is preceded by a tab,
+        # whereas in
+        #   def concept in x days as:
+        #       from datetime import date
+        #       return date.today() - x
+        #       where x > 0
+        #
+        # 'where' will be recognized as a keyword because it is the first word of the line.
 
         # loop thru the tokens, and put them in the correct tokens_found_by_parts entry
         while token.type != TokenKind.EOF:
-            if token.value in def_concept_parts:
+            if new_part(token, colon_mode_activated, previous_token):
                 keywords_tokens.append(token)  # keep track of the keywords
                 keyword = Keywords(token.value)
                 if tokens_found_by_parts[keyword]:
@@ -286,11 +352,14 @@ class DefaultParser(BaseParser):
                 else:
                     tokens_found_by_parts[keyword] = [token]
                 current_part = keyword
+                colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
+
                 self.parser_input.next_token()
             else:
                 tokens_found_by_parts[current_part].append(token)
                 self.parser_input.next_token(False)
 
+            previous_token = token
             token = self.parser_input.token
 
         return first_token, tokens_found_by_parts
@@ -335,7 +404,12 @@ class DefaultParser(BaseParser):
         return self.get_concept_simple_definition(definition_tokens)
 
     def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
-        tokens = core.utils.strip_tokens(definition_tokens[2:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
         if len(tokens) == 0:
             self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
             return None, NotInitializedNode()
@@ -358,7 +432,12 @@ class DefaultParser(BaseParser):
 
     def get_concept_simple_definition(self, definition_tokens):
         start = 2 if definition_tokens[1].value == Keywords.DEF.value else 1
-        tokens = core.utils.strip_tokens(definition_tokens[start:])
+        try:
+            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[start:]))
+        except ParsingException as ex:
+            self.add_error(ex.error)
+            return None, NotInitializedNode()
+
         if len(tokens) == 0:
             self.add_error(SyntaxErrorNode([definition_tokens[start]], "Empty declaration"), False)
             return None, NotInitializedNode()
 
@@ -386,9 +465,10 @@ class DefaultParser(BaseParser):
                 self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
                 continue
 
-            tokens = self.fix_indentation(tokens[1:])  # manage multi-lines declarations
-            if isinstance(tokens, ErrorNode):
-                self.add_error(tokens)
+            try:
+                tokens = self.fix_indentation(tokens[1:])  # manage multi-line declarations
+            except ParsingException as ex:
+                self.add_error(ex.error)
                 continue
 
             # ask the other parsers if they recognize the tokens
diff --git a/tests/core/test_ParserInput.py b/tests/core/test_ParserInput.py
index 81cdd5d..a33b6da 100644
--- a/tests/core/test_ParserInput.py
+++ b/tests/core/test_ParserInput.py
@@ -77,3 +77,19 @@ def test_i_can_parse_twice():
     while p2.next_token():
         p1.next_token()
     assert p1.token == p2.token
+
+
+@pytest.mark.parametrize("text, skip_whitespace, expected", [
+    ("first second", True, "second"),
+    ("first second", False, ""),
+    ("first", True, ""),
+    ("first", False, ""),
+    ("first ", True, ""),
+    ("first ", False, ""),
+    ("first:", True, ":"),
+    ("first:", False, ":"),
+])
+def test_i_can_get_the_token_after(text, skip_whitespace, expected):
+    parser_input = ParserInput(text).reset()
+    parser_input.next_token()
+    assert parser_input.the_token_after(skip_whitespace).repr_value == expected
diff --git a/tests/parsers/test_DefaultParser.py b/tests/parsers/test_DefaultParser.py
index 0ac7163..4b0730f 100644
--- a/tests/parsers/test_DefaultParser.py
+++ b/tests/parsers/test_DefaultParser.py
@@ -6,8 +6,8 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
 from core.concept import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV
 from core.sheerka.services.SheerkaExecute import ParserInput
 from core.tokenizer import Keywords, Tokenizer, LexerError
-from parsers.BaseNodeParser import SCN, SCWC
-from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch
+from parsers.BaseNodeParser import SCWC
+from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence
 from parsers.BnfParser import BnfParser
 from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode
 from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
@@ -15,7 +15,7 @@ from parsers.FunctionParser import FunctionParser
 from parsers.PythonParser import PythonParser, PythonNode
 
 from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
-from tests.parsers.parsers_utils import get_node, compute_expected_array
+from tests.parsers.parsers_utils import compute_expected_array
 
 
 def get_def_concept(name, where=None, pre=None, post=None, body=None, definition=None, bnf_def=None, ret=None):
@@ -164,7 +164,7 @@ ret a if isinstance(a, Concept) else self
         assert isinstance(return_value, ParserResultConcept)
         assert return_value.value == expected_concept
 
-    def test_i_can_have_mutilines_declarations(self):
+    def test_i_can_parse_multiline_declarations(self):
         text = """
 def concept add one to a as
     def func(x):
@@ -207,14 +207,16 @@ def concept add one to a as:
         assert isinstance(return_value, ParserResultConcept)
         assert return_value.value == expected_concept
 
-    def test_indentation_is_mandatory_after_a_colon(self):
-        text = """
-def concept add one to a as:
-def func(x):
-    return x+1
-func(a)
-        """
-
+    @pytest.mark.parametrize("text", [
+        "def concept foo as:\npass",
+        "def concept foo where:\npass",
+        "def concept foo pre:\npass",
+        "def concept foo post:\npass",
+        "def concept foo from:\nanother definition",
+        "def concept foo from def:\nanother definition",
+        "def concept foo from bnf:\n'another' 'definition'",
+    ])
+    def test_indentation_is_mandatory_after_a_colon(self, text):
         sheerka, context, parser = self.init_parser()
         res = parser.parse(context, ParserInput(text))
         return_value = res.value
@@ -224,19 +226,76 @@ func(a)
         assert isinstance(return_value.body[0], SyntaxErrorNode)
         assert return_value.body[0].message == "Indentation not found."
 
-    def test_indentation_is_not_allowed_if_the_colon_is_missing(self):
-        text = """
-def concept add one to a as
-    def func(x):
-        return x+1
-    func(a)
-        """
+    @pytest.mark.parametrize("text", [
+        "def concept plus from:\n\ta plus b",
+        "def concept plus from def:\n\ta plus b",
+
+        # space before the colon
+        "def concept plus from :\n\ta plus b",
+        "def concept plus from def :\n\ta plus b",
+
+        # space after the colon
+        "def concept plus from: \n\ta plus b",
+        "def concept plus from def: \n\ta plus b",
+    ])
+    def test_i_can_use_colon_and_definition_together(self, text):
         sheerka, context, parser = self.init_parser()
         res = parser.parse(context, ParserInput(text))
-        return_value = res.value
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
 
-        assert not res.status
-        assert context.sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("a plus b", yield_eof=False)]
+
+    @pytest.mark.parametrize("text", [
+        "def concept plus from bnf:\n\t'a' 'plus' 'b'",
+        "def concept plus from bnf :\n\t'a' 'plus' 'b'",
+        "def concept plus from bnf: \n\t'a' 'plus' 'b'",
+    ])
+    def test_i_can_use_colon_and_bnf_definition_together(self, text):
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+
+        assert res.status
+        assert defined_concept.definition.status
+        assert defined_concept.definition.body.body == Sequence(StrMatch("a"), StrMatch("plus"), StrMatch("b"))
+
+    def test_i_can_use_colon_to_protect_keyword(self):
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from:
+    give me the date !
+"""
+        sheerka, context, parser = self.init_parser()
+        res = parser.parse(context, ParserInput(text))
+        defined_concept = res.body.body
+        defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
+
+        assert res.status
+        assert defined_concept.definition_type == DEFINITION_TYPE_DEF
+        assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
+        assert defined_concept.body.status
+
+    def test_i_can_use_colon_to_protect_keyword_2(self):
+        text = """
+def concept today as:
+    from datetime import date
+    today = date.today()
+from give me the date !
+""" + sheerka, context, parser = self.init_parser() + res = parser.parse(context, ParserInput(text)) + defined_concept = res.body.body + defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens] + + assert res.status + assert defined_concept.definition_type == DEFINITION_TYPE_DEF + assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)] + assert defined_concept.body.status def test_name_is_mandatory(self): text = "def concept as 'hello'" @@ -277,7 +336,19 @@ def concept add one to a as assert not res.status assert sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS) - def test_new_line_is_not_allowed_in_the_name(self): + @pytest.mark.parametrize("text", [ + "def concept hello\nmy friend", + "def concept hello \nmy friend", + "def concept hello\n my friend", + "def concept hello \n my friend", + "def concept hello from hello\nmy friend", + "def concept hello from def hello\nmy friend", + "def concept hello from bnf hello\nmy friend", + "def concept hello from:\n\thello\nmy friend", + "def concept hello from def:\n\thello\nmy friend", + "def concept hello from bnf:\n\thello\nmy friend", + ]) + def test_new_line_is_not_allowed_in_the_name(self, text): text = "def concept hello \n my friend as 'hello'" sheerka, context, parser = self.init_parser()