Tokenizer exceptions are not caught

2019-12-31 18:28:04 +01:00
parent 197b0700fa
commit adcbc6bb2e
12 changed files with 131 additions and 39 deletions
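
The fix follows one pattern in every parser: tokenization is wrapped in a try/except, the LexerError raised by the Tokenizer is routed to the parser's error sink, and the parser returns a failed result instead of letting the exception escape. Below is a minimal, self-contained sketch of that pattern; tokenize and parse here are hypothetical stand-ins, not the project's API, and only LexerError's fields (message, text, index, line, column) mirror the real code shown in the diffs.

# Minimal sketch of the recovery pattern this commit applies everywhere.
# tokenize() and parse() are hypothetical stand-ins for the real
# Tokenizer and parser classes; see the diffs below for the actual code.

class LexerError(Exception):
    def __init__(self, message, text, index, line, column):
        super().__init__(message)
        self.message = message
        self.text = text
        self.index = index
        self.line = line
        self.column = column

def tokenize(text):
    # Stand-in tokenizer: reject an unterminated single-quoted string,
    # the way the real Tokenizer raises "Missing Trailing quote".
    if text.count("'") % 2 == 1:
        start = text.rindex("'")
        raise LexerError("Missing Trailing quote", text[start:], len(text), 1, len(text) + 1)
    return text.split()

def parse(text):
    # Before this commit a LexerError escaped to the caller; now it is
    # caught and routed to an error sink, and the parser returns a
    # failed result instead of crashing.
    errors, tokens = [], []
    try:
        tokens = tokenize(text)
    except LexerError as e:
        errors.append(e)
    return {"status": not errors, "tokens": tokens, "errors": errors}

assert parse("foo isa bar")["status"]
assert not parse("foo isa 'name")["status"]
assert parse("'name")["errors"][0].text == "'name"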
+1 -1
@@ -261,7 +261,7 @@ class Tokenizer:
             raise LexerError(f"Missing ending colon", result, i, line, column + 2 + len(result))
         if result == "":
-            raise LexerError(f"Context name not found", result, start, line, column + 2 + len(result))
+            raise LexerError(f"Concept name not found", result, start, line, column + 2 + len(result))
         return result
+6 -1
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import core.utils
 from core.builtin_concepts import BuiltinConcepts
 from core.sheerka import ExecutionContext
-from core.tokenizer import Tokenizer, Token, TokenKind
+from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
 from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
 from parsers.ConceptLexerParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptMatch, StrMatch
@@ -114,12 +114,17 @@ class BnfParser:
         return token.type == second or token.type == first and self.next_after().type == second

     def parse(self, context: ExecutionContext, text):
+        tree = None
+        try:
             self.reset_parser(context, text)
             tree = self.parser_outer_rule_name()
             token = self.get_token()
             if token and token.type != TokenKind.EOF:
                 self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
+        except LexerError as e:
+            self.add_error(e, False)

         ret = self.sheerka.ret(
             self.name,
+11 -1
@@ -555,7 +555,12 @@ class ConceptLexerParser(BaseParser):
         self.text = text
         if isinstance(text, str):
+            try:
                 self.tokens = list(Tokenizer(text))
+            except core.tokenizer.LexerError as e:
+                self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
+                return False
         else:
             self.tokens = list(text)
         self.tokens.append(Token(TokenKind.EOF, "", -1, -1, -1))  # make sure to finish with end of file token
@@ -563,6 +568,7 @@ class ConceptLexerParser(BaseParser):
         self.token = None
         self.pos = -1
         self.next_token()
+        return True

     def get_token(self) -> Token:
         return self.token
@@ -724,7 +730,11 @@ class ConceptLexerParser(BaseParser):
             context.sheerka.new(BuiltinConcepts.IS_EMPTY)
         )
-        self.reset_parser(context, text)
+        if not self.reset_parser(context, text):
+            return self.sheerka.ret(
+                self.name,
+                False,
+                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))

         concepts_found = [[]]
         unrecognized_tokens = None
+5
@@ -183,6 +183,7 @@ class DefaultParser(BaseParser):
             self._current = next(self.lexer_iter)
         except StopIteration:
             self._current = None
             return
+

     def parse(self, context, text):
@@ -195,8 +196,12 @@
             self.log_result(context, text, ret)
             return ret

+        tree = None
+        try:
             self.reset_parser(context, text)
             tree = self.parse_statement()
+        except core.tokenizer.LexerError as e:
+            self.add_error(e, False)
         # If an error is found it must be sent to error_sink
         # tree must contain what was recognized
+6 -1
@@ -2,7 +2,7 @@ import logging
 from core.builtin_concepts import ReturnValueConcept, BuiltinConcepts
 from parsers.BaseParser import BaseParser
-from core.tokenizer import Tokenizer, Keywords, TokenKind
+from core.tokenizer import Tokenizer, Keywords, TokenKind, LexerError
 from core.concept import VARIABLE_PREFIX
@@ -27,7 +27,12 @@ class ExactConceptParser(BaseParser):
         context.log(self.verbose_log, f"Parsing '{text}'", self.name)
         res = []
         sheerka = context.sheerka
+        try:
             words = self.get_words(text)
+        except LexerError as e:
+            context.log(self.verbose_log, f"Error found in tokenizer {e}", self.name)
+            return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.ERROR, body=e))
         if len(words) > self.MAX_WORDS_SIZE:
             context.log(self.verbose_log, f"Max words reached. Stopping.", self.name)
             return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.CONCEPT_TOO_LONG, body=text))
+9 -3
@@ -1,5 +1,5 @@
 from core.builtin_concepts import BuiltinConcepts
-from core.tokenizer import Tokenizer
+from core.tokenizer import Tokenizer, LexerError
 from parsers.BaseParser import BaseParser, Node, ErrorNode
 from dataclasses import dataclass
 import ast
@@ -63,6 +63,10 @@ class PythonParser(BaseParser):
         self.source = kwargs.get("source", "<undef>")

     def parse(self, context, text):
+        sheerka = context.sheerka
+        tree = None
+        try:
             if isinstance(text, str) and "c:" in text:
                 source = self.get_text_from_tokens(list(Tokenizer(text)))
             elif isinstance(text, str):
@@ -73,8 +77,6 @@
             text = text if isinstance(text, str) else source
-        sheerka = context.sheerka
-
             # first, try to parse an expression
             res, tree, error = self.try_parse_expression(source)
             if not res:
@@ -85,6 +87,10 @@
                 error_node = PythonErrorNode(text, error)
                 self.error_sink.append(error_node)
+        except LexerError as e:
+            self.has_error = True
+            self.error_sink.append(e)
+
         ret = sheerka.ret(
             self.name,
             not self.has_error,
+2
@@ -180,3 +180,5 @@ def test_i_can_get_props_from_definition():
     ret_val = get_concept_definition("mult (('+'|'-') add)?", parsing_expression)
     assert AddConceptEvaluator.get_props(get_context(), ret_val, []) == ["add", "mult"]
+
+
+2 -1
@@ -2,7 +2,7 @@ import pytest
 from core.concept import Concept
 from core.sheerka import Sheerka, ExecutionContext
-from core.tokenizer import Tokenizer, TokenKind
+from core.tokenizer import Tokenizer, TokenKind, LexerError
 from parsers.BaseParser import UnexpectedTokenErrorNode
 from parsers.BnfParser import BnfParser, UnexpectedEndOfFileError
 from parsers.ConceptLexerParser import StrMatch, Optional, ZeroOrMore, OrderedChoice, Sequence, OneOrMore, \
@@ -80,6 +80,7 @@ def test_i_can_parse_regex(expression, expected):
     ("1|", UnexpectedEndOfFileError()),
     ("(1|)", UnexpectedTokenErrorNode("Unexpected token 'Token(<EOF>)'", [TokenKind.RPAR])),
     ("1=", UnexpectedTokenErrorNode("Unexpected token 'Token(<EOF>)'", [TokenKind.IDENTIFIER])),
+    ("'name", LexerError("Missing Trailing quote", "'name", 5, 1, 6))
 ])
 def test_i_can_detect_errors(expression, error):
     parser = BnfParser()
+20 -2
@@ -5,12 +5,11 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
 from core.sheerka import Sheerka, ExecutionContext
 from parsers.ConceptLexerParser import OrderedChoice, StrMatch, ConceptMatch
 from parsers.PythonParser import PythonParser, PythonNode
-from core.tokenizer import Keywords, Tokenizer
+from core.tokenizer import Keywords, Tokenizer, LexerError
 from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode, IsaConceptNode
 from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
 from parsers.BnfParser import BnfParser
 from sdp.sheerkaDataProvider import Event
@@ -321,3 +320,22 @@ def test_i_cannot_parse_invalid_entries(text):
     assert not res.status
     assert isinstance(res.body, ParserResultConcept)
     assert isinstance(res.body.body[0], UnexpectedTokenErrorNode)
+
+
+@pytest.mark.parametrize("text, error_msg, error_text", [
+    ("'name", "Missing Trailing quote", "'name"),
+    ("foo isa 'name", "Missing Trailing quote", "'name"),
+    ("def concept 'name", "Missing Trailing quote", "'name"),
+    ("def concept name as 'body", "Missing Trailing quote", "'body"),
+    ("def concept name from bnf 'expression", "Missing Trailing quote", "'expression"),
+    ("def concept c::", "Concept name not found", ""),
+])
+def test_i_cannot_parse_when_tokenizer_fails(text, error_msg, error_text):
+    parser = DefaultParser()
+    res = parser.parse(get_context(), text)
+    assert not res.status
+    assert isinstance(res.body, ParserResultConcept)
+    assert isinstance(res.body.body[0], LexerError)
+    assert res.body.body[0].message == error_msg
+    assert res.body.body[0].text == error_text
+22 -2
@@ -4,7 +4,7 @@ import pytest
 from core.builtin_concepts import ParserResultConcept
 from core.sheerka import Sheerka, ExecutionContext
-from core.tokenizer import Tokenizer
+from core.tokenizer import Tokenizer, LexerError
 from parsers.PythonParser import PythonNode, PythonParser, PythonErrorNode
 from sdp.sheerkaDataProvider import Event
@@ -44,7 +44,12 @@ def test_i_can_parse_from_tokens(text, expected):
     assert res.value.value == expected

-def test_i_can_detect_error():
+@pytest.mark.parametrize("text", [
+    "1+",
+    "'name",
+    "foo = 'name"
+])
+def test_i_can_detect_error(text):
     text = "1+"
     parser = PythonParser()
@@ -57,6 +62,21 @@ def test_i_can_detect_error():
     assert isinstance(res.value.value[0].exception, SyntaxError)

+
+@pytest.mark.parametrize("text, error_msg, error_text", [
+    ("c::", "Concept name not found", ""),
+    ("c:: + 1", "Concept name not found", ""),
+])
+def test_i_can_detect_lexer_errors(text, error_msg, error_text):
+    parser = PythonParser()
+    res = parser.parse(get_context(), text)
+    assert not res.status
+    assert isinstance(res.body, ParserResultConcept)
+    assert isinstance(res.body.body[0], LexerError)
+    assert res.body.body[0].message == error_msg
+    assert res.body.body[0].text == error_text
+

 def test_i_can_parse_a_concept():
     text = "c:concept_name: + 1"
+20
@@ -416,3 +416,23 @@ def test_eval_does_not_break_valid_result():
     assert len(res) == 1
     assert res[0].status
     assert res[0].body == 3
+
+
+@pytest.mark.parametrize("text", [
+    "'hello",
+    '"foo" + "string',
+    "c::",
+    "c:foo\nbar:",
+    "c:foo",
+    "def concept 'name",
+    "def concept name from bnf 'name"
+])
+def test_i_can_manage_tokenizer_error(text):
+    sheerka = get_sheerka()
+    sheerka.add_in_cache(Concept("foo"))
+    res = sheerka.evaluate_user_input(text)
+    assert len(res) > 1
+    for r in [r for r in res if r.who.startswith("parsers.")]:
+        assert not r.status
+2 -2
@@ -66,11 +66,11 @@ def test_i_can_tokenize_identifiers(text, expected):
     ('"string', "Missing Trailing quote", '"string', 7, 1, 8),
     ('"a" + "string', "Missing Trailing quote", '"string', 13, 1, 14),
     ('"a"\n\n"string', "Missing Trailing quote", '"string', 12, 3, 8),
-    ("c::", "Context name not found", "", 2, 1, 3),
+    ("c::", "Concept name not found", "", 2, 1, 3),
     ("c:foo\nbar:", "New line is forbidden in concept name", "foo", 5, 1, 6),
     ("c:foo", "Missing ending colon", "foo", 5, 1, 6)
 ])
-def test_i_can_detect_unfinished_strings(text, message, error_text, index, line, column):
+def test_i_can_detect_tokenizer_errors(text, message, error_text, index, line, column):
     with pytest.raises(LexerError) as e:
         list(Tokenizer(text))
     assert e.value.message == message