Hardened DefaultParser
This commit is contained in:
@@ -88,6 +88,19 @@ class ParserInput:
|
||||
|
||||
return self.pos < self.end
|
||||
|
||||
def the_token_after(self, skip_whitespace=True):
    """
    Peek at the token that follows the current one.
    The current offset (self.pos) is left untouched.

    :param skip_whitespace: when True, WHITESPACE and NEWLINE tokens are jumped over
    :return: the token found, or an EOF token (index, line and column set to -1)
             when the end of the input is reached first
    """
    lookahead = self.pos + 1

    if skip_whitespace:
        # only evaluated when we really have to skip something
        blanks = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
        while lookahead < self.end and self.tokens[lookahead].type in blanks:
            lookahead += 1

    if lookahead >= self.end:
        return Token(TokenKind.EOF, "", -1, -1, -1)

    return self.tokens[lookahead]
|
||||
|
||||
def seek(self, pos):
|
||||
"""
|
||||
Move the token offset to position pos
|
||||
|
||||
@@ -68,9 +68,9 @@ class Token:
|
||||
if self.type == TokenKind.IDENTIFIER:
|
||||
value = str(self.value)
|
||||
elif self.type == TokenKind.WHITESPACE:
|
||||
value = "<ws>"
|
||||
value = "<tab>" if self.value[0] == "\t" else "<ws>"
|
||||
elif self.type == TokenKind.NEWLINE:
|
||||
value = r"\n"
|
||||
value = "<nl>"
|
||||
elif self.type == TokenKind.EOF:
|
||||
value = "<EOF>"
|
||||
else:
|
||||
|
||||
@@ -10,6 +10,11 @@ from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
|
||||
from parsers.BnfParser import BnfParser
|
||||
|
||||
|
||||
class ParsingException(Exception):
    """
    Exception used to abort the parsing of a part of the input.
    It carries the error node describing the failure, so the code that
    catches it can register the error (see the ``except ParsingException``
    handlers that forward ``ex.error`` to ``add_error``).

    :param error: the object describing the failure (typically an error node)
    """

    def __init__(self, error):
        # Forward the error to Exception: args and str() are then filled in
        # even when the exception is created with the keyword form (error=...)
        super().__init__(error)
        self.error = error
|
||||
|
||||
|
||||
@dataclass()
|
||||
class DefaultParserNode(Node):
|
||||
"""
|
||||
@@ -125,24 +130,35 @@ class DefaultParser(BaseParser):
|
||||
:param tokens:
|
||||
:return:
|
||||
"""
|
||||
if len(tokens) == 0:
|
||||
return tokens
|
||||
|
||||
tokens = tokens.copy() # do not modify ParserInput.tokens
|
||||
|
||||
if tokens[0].type != TokenKind.COLON:
|
||||
return tokens
|
||||
|
||||
if len(tokens) < 3:
|
||||
return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
|
||||
raise ParsingException(UnexpectedTokenErrorNode(tokens[0:2],
|
||||
"Unexpected end of file",
|
||||
[TokenKind.NEWLINE]))
|
||||
pos = DefaultParser.eat_white_space(tokens, 1)
|
||||
if tokens[pos].type != TokenKind.NEWLINE:
|
||||
raise ParsingException(UnexpectedTokenErrorNode([tokens[pos]],
|
||||
"Unexpected token after colon",
|
||||
[TokenKind.NEWLINE]))
|
||||
pos += 1
|
||||
|
||||
if tokens[1].type != TokenKind.NEWLINE:
|
||||
return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
|
||||
|
||||
if tokens[2].type != TokenKind.WHITESPACE:
|
||||
return SyntaxErrorNode([tokens[2]], "Indentation not found.")
|
||||
indent_size = len(tokens[2].value)
|
||||
if tokens[pos].type != TokenKind.WHITESPACE:
|
||||
raise ParsingException(SyntaxErrorNode([tokens[pos]],
|
||||
"Indentation not found."))
|
||||
indent_size = len(tokens[pos].value)
|
||||
pos += 1
|
||||
|
||||
# now fix the other indentations
|
||||
# KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
|
||||
# as I now have an editor in interactive mode
|
||||
i = 3
|
||||
i = pos
|
||||
while i < len(tokens) - 1:
|
||||
if tokens[i].type == TokenKind.NEWLINE:
|
||||
if tokens[i + 1].type != TokenKind.WHITESPACE:
|
||||
@@ -155,7 +171,17 @@ class DefaultParser(BaseParser):
|
||||
tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
|
||||
i += 1
|
||||
|
||||
return tokens[3:]
|
||||
return tokens[pos:]
|
||||
|
||||
@staticmethod
|
||||
def eat_white_space(tokens, index):
    """
    Skip the consecutive WHITESPACE tokens found from position index.

    :param tokens: list of tokens
    :param index: position where to start looking
    :return: index of the first token that is not a WHITESPACE
             (index itself when it is already past the end of the list)
    """
    cursor = index
    size = len(tokens)

    while cursor < size:
        if tokens[cursor].type != TokenKind.WHITESPACE:
            break
        cursor += 1

    return cursor
|
||||
|
||||
def reset_parser(self, context, parser_input):
|
||||
self.context = context
|
||||
@@ -252,6 +278,22 @@ class DefaultParser(BaseParser):
|
||||
|
||||
def regroup_tokens_by_parts(self, keywords_tokens):
|
||||
|
||||
def new_part(t, cma, p):
    """
    Tell whether the token t starts a new part of the concept definition.

    :param t: token
    :param cma: colon_mode_activated
    :param p: previous token
    :return: False when t is not one of def_concept_parts.
             True when the colon mode is off or when there is no previous token.
             Otherwise, True only if the previous token is on another line than t
    """
    if t.value not in def_concept_parts:
        return False

    if not cma or not p:
        return True

    # colon mode: the keyword is only accepted when the token before it
    # is on a different line
    return p.line != t.line
|
||||
|
||||
def_concept_parts = [Keywords.CONCEPT.value,
|
||||
Keywords.FROM.value,
|
||||
Keywords.AS.value,
|
||||
@@ -273,10 +315,34 @@ class DefaultParser(BaseParser):
|
||||
current_part = Keywords.CONCEPT
|
||||
token = self.parser_input.token
|
||||
first_token = token
|
||||
colon_mode_activated = False # if activate, use keyword + colon to start a new keyword definition
|
||||
previous_token = None
|
||||
|
||||
# more explanation on colon_mode_activated
|
||||
# You can use the pattern
|
||||
# def concept <name> as:
|
||||
# <tab> xxx
|
||||
# <tab> yyy
|
||||
# ...
|
||||
#
|
||||
# It improves readability and allows other keywords to be used inside the block
|
||||
# Example
|
||||
# def concept give the the date as:
|
||||
# from datetime import date
|
||||
# return date.today()
|
||||
#
|
||||
# 'from datetime' will not be considered as a keyword because it is preceded by a tab
|
||||
# whereas in
|
||||
# def concept in x days as:
|
||||
# from datetime import date
|
||||
# return date.today() - x
|
||||
# where x > 0
|
||||
#
|
||||
# 'where' will be recognized as a keyword because it is the first word of its line
|
||||
|
||||
# loop thru the tokens, and put them in the correct tokens_found_by_parts entry
|
||||
while token.type != TokenKind.EOF:
|
||||
if token.value in def_concept_parts:
|
||||
if new_part(token, colon_mode_activated, previous_token):
|
||||
keywords_tokens.append(token) # keep track of the keywords
|
||||
keyword = Keywords(token.value)
|
||||
if tokens_found_by_parts[keyword]:
|
||||
@@ -286,11 +352,14 @@ class DefaultParser(BaseParser):
|
||||
else:
|
||||
tokens_found_by_parts[keyword] = [token]
|
||||
current_part = keyword
|
||||
colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
|
||||
|
||||
self.parser_input.next_token()
|
||||
else:
|
||||
tokens_found_by_parts[current_part].append(token)
|
||||
self.parser_input.next_token(False)
|
||||
|
||||
previous_token = token
|
||||
token = self.parser_input.token
|
||||
|
||||
return first_token, tokens_found_by_parts
|
||||
@@ -335,7 +404,12 @@ class DefaultParser(BaseParser):
|
||||
return self.get_concept_simple_definition(definition_tokens)
|
||||
|
||||
def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
|
||||
tokens = core.utils.strip_tokens(definition_tokens[2:])
|
||||
try:
|
||||
tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
|
||||
except ParsingException as ex:
|
||||
self.add_error(ex.error)
|
||||
return None, NotInitializedNode()
|
||||
|
||||
if len(tokens) == 0:
|
||||
self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
|
||||
return None, NotInitializedNode()
|
||||
@@ -358,7 +432,12 @@ class DefaultParser(BaseParser):
|
||||
|
||||
def get_concept_simple_definition(self, definition_tokens):
|
||||
start = 2 if definition_tokens[1].value == Keywords.DEF.value else 1
|
||||
tokens = core.utils.strip_tokens(definition_tokens[start:])
|
||||
try:
|
||||
tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[start:]))
|
||||
except ParsingException as ex:
|
||||
self.add_error(ex.error)
|
||||
return None, NotInitializedNode()
|
||||
|
||||
if len(tokens) == 0:
|
||||
self.add_error(SyntaxErrorNode([definition_tokens[start]], "Empty declaration"), False)
|
||||
return None, NotInitializedNode()
|
||||
@@ -386,9 +465,10 @@ class DefaultParser(BaseParser):
|
||||
self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
|
||||
continue
|
||||
|
||||
try:
|
||||
tokens = self.fix_indentation(tokens[1:]) # manage multi-lines declarations
|
||||
if isinstance(tokens, ErrorNode):
|
||||
self.add_error(tokens)
|
||||
except ParsingException as ex:
|
||||
self.add_error(ex.error)
|
||||
continue
|
||||
|
||||
# ask the other parsers if they recognize the tokens
|
||||
|
||||
@@ -77,3 +77,19 @@ def test_i_can_parse_twice():
|
||||
while p2.next_token():
|
||||
p1.next_token()
|
||||
assert p1.token == p2.token
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text, skip_whitespace, expected", [
|
||||
("first second", True, "second"),
|
||||
("first second", False, "<ws>"),
|
||||
("first", True, "<EOF>"),
|
||||
("first", False, "<EOF>"),
|
||||
("first ", True, "<EOF>"),
|
||||
("first ", False, "<ws>"),
|
||||
("first:", True, ":"),
|
||||
("first:", False, ":"),
|
||||
])
|
||||
def test_i_can_get_the_token_after(text, skip_whitespace, expected):
    """
    the_token_after() must give the token following the current one,
    with or without skipping the whitespaces
    """
    parser_input = ParserInput(text).reset()
    parser_input.next_token()  # move onto the first token

    following = parser_input.the_token_after(skip_whitespace)

    assert following.repr_value == expected
|
||||
|
||||
@@ -6,8 +6,8 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
|
||||
from core.concept import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV
|
||||
from core.sheerka.services.SheerkaExecute import ParserInput
|
||||
from core.tokenizer import Keywords, Tokenizer, LexerError
|
||||
from parsers.BaseNodeParser import SCN, SCWC
|
||||
from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch
|
||||
from parsers.BaseNodeParser import SCWC
|
||||
from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence
|
||||
from parsers.BnfParser import BnfParser
|
||||
from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode
|
||||
from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
|
||||
@@ -15,7 +15,7 @@ from parsers.FunctionParser import FunctionParser
|
||||
from parsers.PythonParser import PythonParser, PythonNode
|
||||
|
||||
from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka
|
||||
from tests.parsers.parsers_utils import get_node, compute_expected_array
|
||||
from tests.parsers.parsers_utils import compute_expected_array
|
||||
|
||||
|
||||
def get_def_concept(name, where=None, pre=None, post=None, body=None, definition=None, bnf_def=None, ret=None):
|
||||
@@ -164,7 +164,7 @@ ret a if isinstance(a, Concept) else self
|
||||
assert isinstance(return_value, ParserResultConcept)
|
||||
assert return_value.value == expected_concept
|
||||
|
||||
def test_i_can_have_mutilines_declarations(self):
|
||||
def test_i_can_parse_mutilines_declarations(self):
|
||||
text = """
|
||||
def concept add one to a as
|
||||
def func(x):
|
||||
@@ -207,14 +207,16 @@ def concept add one to a as:
|
||||
assert isinstance(return_value, ParserResultConcept)
|
||||
assert return_value.value == expected_concept
|
||||
|
||||
def test_indentation_is_mandatory_after_a_colon(self):
|
||||
text = """
|
||||
def concept add one to a as:
|
||||
def func(x):
|
||||
return x+1
|
||||
func(a)
|
||||
"""
|
||||
|
||||
@pytest.mark.parametrize("text", [
|
||||
"def concept foo as:\npass",
|
||||
"def concept foo where:\npass",
|
||||
"def concept foo pre:\npass",
|
||||
"def concept foo post:\npass",
|
||||
"def concept foo from:\nanother definition",
|
||||
"def concept foo from def:\nanother definition",
|
||||
"def concept foo from bnf:\n'another' 'definition'",
|
||||
])
|
||||
def test_indentation_is_mandatory_after_a_colon(self, text):
|
||||
sheerka, context, parser = self.init_parser()
|
||||
res = parser.parse(context, ParserInput(text))
|
||||
return_value = res.value
|
||||
@@ -224,19 +226,76 @@ func(a)
|
||||
assert isinstance(return_value.body[0], SyntaxErrorNode)
|
||||
assert return_value.body[0].message == "Indentation not found."
|
||||
|
||||
def test_indentation_is_not_allowed_if_the_colon_is_missing(self):
|
||||
text = """
|
||||
def concept add one to a as
|
||||
def func(x):
|
||||
return x+1
|
||||
func(a)
|
||||
"""
|
||||
@pytest.mark.parametrize("text", [
|
||||
"def concept plus from:\n\ta plus b",
|
||||
"def concept plus from def:\n\ta plus b",
|
||||
|
||||
# space before the colon
|
||||
"def concept plus from :\n\ta plus b",
|
||||
"def concept plus from def :\n\ta plus b",
|
||||
|
||||
# space after the colon
|
||||
"def concept plus from: \n\ta plus b",
|
||||
"def concept plus from def: \n\ta plus b",
|
||||
])
|
||||
def test_i_can_use_colon_and_definition_together(self, text):
    """
    A 'from' / 'from def' part can be ended by a colon and continued on an indented line,
    with or without spaces around the colon
    """
    sheerka, context, parser = self.init_parser()
    res = parser.parse(context, ParserInput(text))

    # check the status first, to get a clear failure before digging into the body
    assert res.status

    defined_concept = res.body.body
    defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]

    assert defined_concept.definition_type == DEFINITION_TYPE_DEF
    assert defined_concept_tokens == [t.repr_value for t in Tokenizer("a plus b", yield_eof=False)]
|
||||
|
||||
@pytest.mark.parametrize("text", [
|
||||
"def concept plus from bnf:\n\t'a' 'plus' 'b'",
|
||||
"def concept plus from bnf :\n\t'a' 'plus' 'b'",
|
||||
"def concept plus from bnf: \n\t'a' 'plus' 'b'",
|
||||
])
|
||||
def test_i_can_use_colon_and_bnf_definition_together(self, text):
    """
    A 'from bnf' part can be ended by a colon and continued on an indented line,
    with or without spaces around the colon
    """
    _, context, parser = self.init_parser()
    result = parser.parse(context, ParserInput(text))
    concept_node = result.body.body

    assert result.status
    assert concept_node.definition.status

    expected_bnf = Sequence(StrMatch("a"), StrMatch("plus"), StrMatch("b"))
    assert concept_node.definition.body.body == expected_bnf
|
||||
|
||||
def test_i_can_use_colon_to_protect_keyword(self):
|
||||
text = """
|
||||
def concept today as:
|
||||
from datetime import date
|
||||
today = date.today()
|
||||
from:
|
||||
give me the date !
|
||||
"""
|
||||
sheerka, context, parser = self.init_parser()
|
||||
res = parser.parse(context, ParserInput(text))
|
||||
defined_concept = res.body.body
|
||||
defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
|
||||
|
||||
assert res.status
|
||||
assert defined_concept.definition_type == DEFINITION_TYPE_DEF
|
||||
assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
|
||||
assert defined_concept.body.status
|
||||
|
||||
def test_i_can_use_colon_to_protect_keyword_2(self):
|
||||
text = """
|
||||
def concept today as:
|
||||
from datetime import date
|
||||
today = date.today()
|
||||
from give me the date !
|
||||
"""
|
||||
sheerka, context, parser = self.init_parser()
|
||||
res = parser.parse(context, ParserInput(text))
|
||||
defined_concept = res.body.body
|
||||
defined_concept_tokens = [t.repr_value for t in defined_concept.definition.tokens]
|
||||
|
||||
assert res.status
|
||||
assert defined_concept.definition_type == DEFINITION_TYPE_DEF
|
||||
assert defined_concept_tokens == [t.repr_value for t in Tokenizer("give me the date !", yield_eof=False)]
|
||||
assert defined_concept.body.status
|
||||
|
||||
def test_name_is_mandatory(self):
|
||||
text = "def concept as 'hello'"
|
||||
@@ -277,7 +336,19 @@ def concept add one to a as
|
||||
assert not res.status
|
||||
assert sheerka.isinstance(return_value, BuiltinConcepts.TOO_MANY_ERRORS)
|
||||
|
||||
def test_new_line_is_not_allowed_in_the_name(self):
|
||||
@pytest.mark.parametrize("text", [
|
||||
"def concept hello\nmy friend",
|
||||
"def concept hello \nmy friend",
|
||||
"def concept hello\n my friend",
|
||||
"def concept hello \n my friend",
|
||||
"def concept hello from hello\nmy friend",
|
||||
"def concept hello from def hello\nmy friend",
|
||||
"def concept hello from bnf hello\nmy friend",
|
||||
"def concept hello from:\n\thello\nmy friend",
|
||||
"def concept hello from def:\n\thello\nmy friend",
|
||||
"def concept hello from bnf:\n\thello\nmy friend",
|
||||
])
|
||||
def test_new_line_is_not_allowed_in_the_name(self, text):
|
||||
text = "def concept hello \n my friend as 'hello'"
|
||||
|
||||
sheerka, context, parser = self.init_parser()
|
||||
|
||||
Reference in New Issue
Block a user