Hardened DefaultParser

This commit is contained in:
2020-09-22 17:39:42 +02:00
parent 310c9ae839
commit 9b965105e9
5 changed files with 220 additions and 40 deletions
@@ -88,6 +88,19 @@ class ParserInput:
return self.pos < self.end
def the_token_after(self, skip_whitespace=True):
    """Peek at the token following the current one without advancing.

    :param skip_whitespace: when True, WHITESPACE and NEWLINE tokens
        after the current position are skipped over
    :return: the next relevant Token, or an EOF token when the end of
        the input is reached
    """
    lookahead = self.pos + 1
    if lookahead >= self.end:
        return Token(TokenKind.EOF, "", -1, -1, -1)
    if not skip_whitespace:
        return self.tokens[lookahead]
    skippable = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
    while self.tokens[lookahead].type in skippable:
        lookahead += 1
        # The end check happens right after the increment, so the loop
        # never indexes past self.end.
        if lookahead == self.end:
            return Token(TokenKind.EOF, "", -1, -1, -1)
    return self.tokens[lookahead]
def seek(self, pos):
"""
Move the token offset to position pos
+2 -2
View File
@@ -68,9 +68,9 @@ class Token:
if self.type == TokenKind.IDENTIFIER:
value = str(self.value)
elif self.type == TokenKind.WHITESPACE:
value = "<ws>"
value = "<tab>" if self.value[0] == "\t" else "<ws>"
elif self.type == TokenKind.NEWLINE:
value = r"\n"
value = "<nl>"
elif self.type == TokenKind.EOF:
value = "<EOF>"
else:
+95 -15
View File
@@ -10,6 +10,11 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
from parsers.BnfParser import BnfParser
class ParsingException(Exception):
    """Raised by parsing helpers to abort with a structured error node.

    The original error node (e.g. UnexpectedTokenErrorNode or
    SyntaxErrorNode) is carried in ``error`` so catchers can forward it
    to ``add_error``.
    """

    def __init__(self, error):
        # Forward to Exception so str()/repr() and .args are populated
        # instead of being empty.
        super().__init__(error)
        self.error = error
@dataclass()
class DefaultParserNode(Node):
"""
@@ -125,24 +130,35 @@ class DefaultParser(BaseParser):
:param tokens:
:return:
"""
if len(tokens) == 0:
return tokens
tokens = tokens.copy() # do not modify ParserInput.tokens
if tokens[0].type != TokenKind.COLON:
return tokens
if len(tokens) < 3:
return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
raise ParsingException(UnexpectedTokenErrorNode(tokens[0:2],
"Unexpected end of file",
[TokenKind.NEWLINE]))
pos = DefaultParser.eat_white_space(tokens, 1)
if tokens[pos].type != TokenKind.NEWLINE:
raise ParsingException(UnexpectedTokenErrorNode([tokens[pos]],
"Unexpected token after colon",
[TokenKind.NEWLINE]))
pos += 1
if tokens[1].type != TokenKind.NEWLINE:
return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
if tokens[2].type != TokenKind.WHITESPACE:
return SyntaxErrorNode([tokens[2]], "Indentation not found.")
indent_size = len(tokens[2].value)
if tokens[pos].type != TokenKind.WHITESPACE:
raise ParsingException(SyntaxErrorNode([tokens[pos]],
"Indentation not found."))
indent_size = len(tokens[pos].value)
pos += 1
# now fix the other indentations
# KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
# as I now have an editor in interactive mode
i = 3
i = pos
while i < len(tokens) - 1:
if tokens[i].type == TokenKind.NEWLINE:
if tokens[i + 1].type != TokenKind.WHITESPACE:
@@ -155,7 +171,17 @@ class DefaultParser(BaseParser):
tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
i += 1
return tokens[3:]
return tokens[pos:]
@staticmethod
def eat_white_space(tokens, index):
    """Return the index of the first non-whitespace token at or after *index*.

    :param tokens: list of Token objects
    :param index: position to start scanning from
    :return: the first position >= *index* whose token is not WHITESPACE,
             or ``len(tokens)`` (resp. *index* if already past the end)
             when only whitespace remains
    """
    # The loop condition already guards index < len(tokens), so the
    # previous separate bounds check was redundant and has been removed.
    while index < len(tokens) and tokens[index].type == TokenKind.WHITESPACE:
        index += 1
    return index
def reset_parser(self, context, parser_input):
self.context = context
@@ -252,6 +278,22 @@ class DefaultParser(BaseParser):
def regroup_tokens_by_parts(self, keywords_tokens):
def new_part(t, cma, p):
    """Tell whether token *t* opens a new concept part.

    :param t: token under inspection
    :param cma: colon-mode flag (caller passes colon_mode_activated);
        when set, a keyword only counts at the start of a line
    :param p: previous token, or None at the start of the stream
    :return: True when *t* should start a new keyword part
    """
    # Idiomatic membership test ("not in" instead of "not ... in").
    if t.value not in def_concept_parts:
        return False
    # Outside colon mode (or with no previous token) every keyword
    # occurrence starts a new part.
    if not cma or not p:
        return True
    # In colon mode the keyword must be the first word of its line.
    return p.line != t.line
def_concept_parts = [Keywords.CONCEPT.value,
Keywords.FROM.value,
Keywords.AS.value,
@@ -273,10 +315,34 @@ class DefaultParser(BaseParser):
current_part = Keywords.CONCEPT
token = self.parser_input.token
first_token = token
colon_mode_activated = False # if activate, use keyword + colon to start a new keyword definition
previous_token = None
# more explanation on colon_mode_activated
# You can use the pattern
# def concept <name> as:
# <tab> xxx
# <tab> yyy
# ...
#
# It improves readability and allows the use of other keywords inside the block.
# Example
# def concept give the the date as:
# from datetime import date
# return date.today()
#
# 'from datetime' will not be considered as a keyword because it is led by a tab
# whereas in
# def concept in x days as:
# from datetime import date
# return date.today() - x
# where x > 0
#
# 'where' will be recognized as a keyword because it is the first word of the line
# loop thru the tokens, and put them in the correct tokens_found_by_parts entry
while token.type != TokenKind.EOF:
if token.value in def_concept_parts:
if new_part(token, colon_mode_activated, previous_token):
keywords_tokens.append(token) # keep track of the keywords
keyword = Keywords(token.value)
if tokens_found_by_parts[keyword]:
@@ -286,11 +352,14 @@ class DefaultParser(BaseParser):
else:
tokens_found_by_parts[keyword] = [token]
current_part = keyword
colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
self.parser_input.next_token()
else:
tokens_found_by_parts[current_part].append(token)
self.parser_input.next_token(False)
previous_token = token
token = self.parser_input.token
return first_token, tokens_found_by_parts
@@ -335,7 +404,12 @@ class DefaultParser(BaseParser):
return self.get_concept_simple_definition(definition_tokens)
def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
tokens = core.utils.strip_tokens(definition_tokens[2:])
try:
tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
except ParsingException as ex:
self.add_error(ex.error)
return None, NotInitializedNode()
if len(tokens) == 0:
self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
return None, NotInitializedNode()
@@ -358,7 +432,12 @@ class DefaultParser(BaseParser):
def get_concept_simple_definition(self, definition_tokens):
start = 2 if definition_tokens[1].value == Keywords.DEF.value else 1
tokens = core.utils.strip_tokens(definition_tokens[start:])
try:
tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[start:]))
except ParsingException as ex:
self.add_error(ex.error)
return None, NotInitializedNode()
if len(tokens) == 0:
self.add_error(SyntaxErrorNode([definition_tokens[start]], "Empty declaration"), False)
return None, NotInitializedNode()
@@ -386,9 +465,10 @@ class DefaultParser(BaseParser):
self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
continue
tokens = self.fix_indentation(tokens[1:]) # manage multi-lines declarations
if isinstance(tokens, ErrorNode):
self.add_error(tokens)
try:
tokens = self.fix_indentation(tokens[1:]) # manage multi-lines declarations
except ParsingException as ex:
self.add_error(ex.error)
continue
# ask the other parsers if they recognize the tokens
+16
View File
@@ -77,3 +77,19 @@ def test_i_can_parse_twice():
while p2.next_token():
p1.next_token()
assert p1.token == p2.token
@pytest.mark.parametrize("text, skip_whitespace, expected", [
    ("first second", True, "second"),
    ("first second", False, "<ws>"),
    ("first", True, "<EOF>"),
    ("first", False, "<EOF>"),
    ("first ", True, "<EOF>"),
    ("first ", False, "<ws>"),
    ("first:", True, ":"),
    ("first:", False, ":"),
])
def test_i_can_get_the_token_after(text, skip_whitespace, expected):
    """the_token_after must peek at the next token without consuming it."""
    parser_input = ParserInput(text).reset()
    parser_input.next_token()
    peeked = parser_input.the_token_after(skip_whitespace)
    assert peeked.repr_value == expected
+94 -23
View File
@@ -6,8 +6,8 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
from core.concept import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import Keywords, Tokenizer, LexerError
from parsers.BaseNodeParser import SCN, SCWC
from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch
from parsers.BaseNodeParser import SCWC
from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence
from parsers.BnfParser import BnfParser
from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode
from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
@@ -15,7 +15,7 @@ from parsers.FunctionParser import FunctionParser
from parsers.PythonParser import PythonParser, PythonNode
from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
from tests.parsers.parsers_utils import get_node, compute_expected_array
from tests.parsers.parsers_utils import compute_expected_array
def get_def_concept(name, where=None, pre=None, post=None, body=None, definition=None, bnf_def=None, ret=None):
@@ -164,7 +164,7 @@ ret a if isinstance(a, Concept) else self
assert isinstance(return_value, ParserResultConcept)
assert return_value.value == expected_concept
def test_i_can_have_mutilines_declarations(self):
def test_i_can_parse_mutilines_declarations(self):
text = """
def concept add one to a as
def func(x):
@@ -207,14 +207,16 @@ def concept add one to a as:
assert isinstance(return_value, ParserResultConcept)
assert return_value.value == expected_concept
def test_indentation_is_mandatory_after_a_colon(self):
text = """
def concept add one to a as:
def func(x):
return x+1
func(a)
"""
@pytest.mark.parametrize("text", [
"def concept foo as:\npass",
"def concept foo where:\npass",
"def concept foo pre:\npass",
"def concept foo post:\npass",
"def concept foo from:\nanother definition",
"def concept foo from def:\nanother definition",
"def concept foo from bnf:\n'another' 'definition'",
])
def test_indentation_is_mandatory_after_a_colon(self, text):
sheerka, context, parser = self.init_parser()
res = parser.parse(context, ParserInput(text))
return_value = res.value
@@ -224,19 +226,76 @@ func(a)
assert isinstance(return_value.body[0], SyntaxErrorNode)
assert return_value.body[0].message == "Indentation not found."
def test_indentation_is_not_allowed_if_the_colon_is_missing(self):
text = """
def concept add one to a as
def func(x):
return x+1
func(a)
"""
@pytest.mark.parametrize("text", [
"def concept plus from:\n\ta plus b",
"def concept plus from def:\n\ta plus b",
# space before the colon
"def concept plus from :\n\ta plus b",
"def concept plus from def :\n\ta plus b",
# space after the colon
"def concept plus from: \n\ta plus b",
"def concept plus from def: \n\ta plus b",
])
def test_i_can_use_colon_and_definition_together(self, text):
sheerka, context, parser = self.init_parser()
res = parser.parse(context, ParserInput(text))
return_value = res.value
defined_concept = res.body.body
defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
assert not res.status
assert context.sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
assert res.status
assert defined_concept.definition_type == DEFINITION_TYPE_DEF
assert defined_concept_tokens == [t.repr_value for t in Tokenizer("a plus b", yield_eof=False)]
@pytest.mark.parametrize("text", [
    "def concept plus from bnf:\n\t'a' 'plus' 'b'",
    "def concept plus from bnf :\n\t'a' 'plus' 'b'",
    "def concept plus from bnf: \n\t'a' 'plus' 'b'",
])
def test_i_can_use_colon_and_bnf_definition_together(self, text):
    """A colon after 'from bnf' must still yield a parsed BNF definition."""
    sheerka, context, parser = self.init_parser()
    res = parser.parse(context, ParserInput(text))
    definition = res.body.body.definition
    assert res.status
    assert definition.status
    expected = Sequence(StrMatch("a"), StrMatch("plus"), StrMatch("b"))
    assert definition.body.body == expected
def test_i_can_use_colon_to_protect_keyword(self):
    # A trailing colon on "as:" opens an indented block, so the "from"
    # inside that block is NOT treated as a keyword; the later
    # flush-left "from:" is the real keyword introducing the
    # definition part.
    # NOTE(review): the exact leading whitespace inside the
    # triple-quoted text (tabs vs spaces) is significant to the parser
    # — confirm against the repository file before editing it.
    text = """
def concept today as:
    from datetime import date
    today = date.today()
from:
    give me the date !
"""
    sheerka, context, parser = self.init_parser()
    res = parser.parse(context, ParserInput(text))
    defined_concept = res.body.body
    defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
    assert res.status
    # The definition after "from:" is a plain (DEF) definition whose
    # tokens are exactly "give me the date !".
    assert defined_concept.definition_type == DEFINITION_TYPE_DEF
    assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
    assert defined_concept.body.status
def test_i_can_use_colon_to_protect_keyword_2(self):
    # Variant of the previous case without a colon after the final
    # "from": keyword and definition share one line
    # ("from give me the date !"), which must parse the same way.
    # NOTE(review): the leading whitespace inside the triple-quoted
    # text (tabs vs spaces) is significant to the parser — confirm
    # against the repository file before editing it.
    text = """
def concept today as:
    from datetime import date
    today = date.today()
from give me the date !
"""
    sheerka, context, parser = self.init_parser()
    res = parser.parse(context, ParserInput(text))
    defined_concept = res.body.body
    defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
    assert res.status
    # Same expected outcome as the colon-protected variant.
    assert defined_concept.definition_type == DEFINITION_TYPE_DEF
    assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
    assert defined_concept.body.status
def test_name_is_mandatory(self):
text = "def concept as 'hello'"
@@ -277,7 +336,19 @@ def concept add one to a as
assert not res.status
assert sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
def test_new_line_is_not_allowed_in_the_name(self):
@pytest.mark.parametrize("text", [
"def concept hello\nmy friend",
"def concept hello \nmy friend",
"def concept hello\n my friend",
"def concept hello \n my friend",
"def concept hello from hello\nmy friend",
"def concept hello from def hello\nmy friend",
"def concept hello from bnf hello\nmy friend",
"def concept hello from:\n\thello\nmy friend",
"def concept hello from def:\n\thello\nmy friend",
"def concept hello from bnf:\n\thello\nmy friend",
])
def test_new_line_is_not_allowed_in_the_name(self, text):
text = "def concept hello \n my friend as 'hello'"
sheerka, context, parser = self.init_parser()