Tokenizer exceptions are not caught
@@ -261,7 +261,7 @@ class Tokenizer:
             raise LexerError(f"Missing ending colon", result, i, line, column + 2 + len(result))
 
         if result == "":
-            raise LexerError(f"Context name not found", result, start, line, column + 2 + len(result))
+            raise LexerError(f"Concept name not found", result, start, line, column + 2 + len(result))
 
         return result
 
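For reference, the call sites above and the new tests at the bottom of this commit pin down the shape of LexerError. A minimal sketch, assuming the real definition in core.tokenizer may carry more than this:

    # Sketch only: inferred from the LexerError(message, text, index, line, column)
    # call sites above and from tests below that read e.value.message / .text.
    class LexerError(Exception):
        def __init__(self, message, text, index, line, column):
            super().__init__(message)
            self.message = message  # human-readable description
            self.text = text        # the offending lexeme, possibly ""
            self.index = index      # absolute offset into the input
            self.line = line        # 1-based line number
            self.column = column    # 1-based column number
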
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import core.utils
 from core.builtin_concepts import BuiltinConcepts
 from core.sheerka import ExecutionContext
-from core.tokenizer import Tokenizer, Token, TokenKind
+from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
 from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
 from parsers.ConceptLexerParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptMatch, StrMatch
 
@@ -114,12 +114,17 @@ class BnfParser:
         return token.type == second or token.type == first and self.next_after().type == second
 
     def parse(self, context: ExecutionContext, text):
-        self.reset_parser(context, text)
-        tree = self.parser_outer_rule_name()
+        tree = None
+        try:
+            self.reset_parser(context, text)
+            tree = self.parser_outer_rule_name()
 
-        token = self.get_token()
-        if token and token.type != TokenKind.EOF:
-            self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
+            token = self.get_token()
+            if token and token.type != TokenKind.EOF:
+                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
+
+        except LexerError as e:
+            self.add_error(e, False)
 
         ret = self.sheerka.ret(
             self.name,
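The hunk above establishes the pattern that the remaining parser hunks repeat: tokenization can fail anywhere inside the parse walk, so the whole walk is wrapped and the LexerError is routed into the parser's error sink instead of escaping to the caller. A self-contained toy sketch of that pattern (MiniParser and tokenize are stand-ins, not code from this repository):

    class LexerError(Exception):
        pass

    def tokenize(text):
        # Toy rule standing in for Tokenizer: an unbalanced quote fails.
        if text.count('"') % 2:
            raise LexerError("Missing Trailing quote")
        return text.split()

    class MiniParser:
        def __init__(self):
            self.error_sink = []

        def parse(self, text):
            tree = None
            try:
                tree = tokenize(text)        # may raise LexerError mid-parse
            except LexerError as e:
                self.error_sink.append(e)    # report, don't propagate
            return tree

    parser = MiniParser()
    assert parser.parse('"open') is None and parser.error_sink
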
@@ -555,7 +555,12 @@ class ConceptLexerParser(BaseParser):
         self.text = text
 
         if isinstance(text, str):
-            self.tokens = list(Tokenizer(text))
+            try:
+                self.tokens = list(Tokenizer(text))
+            except core.tokenizer.LexerError as e:
+                self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
+                return False
+
         else:
             self.tokens = list(text)
         self.tokens.append(Token(TokenKind.EOF, "", -1, -1, -1))  # make sure to finish with end of file token
@@ -563,6 +568,7 @@ class ConceptLexerParser(BaseParser):
         self.token = None
         self.pos = -1
         self.next_token()
+        return True
 
     def get_token(self) -> Token:
         return self.token
@@ -724,7 +730,11 @@ class ConceptLexerParser(BaseParser):
             context.sheerka.new(BuiltinConcepts.IS_EMPTY)
         )
 
-        self.reset_parser(context, text)
+        if not self.reset_parser(context, text):
+            return self.sheerka.ret(
+                self.name,
+                False,
+                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
 
         concepts_found = [[]]
         unrecognized_tokens = None
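The three ConceptLexerParser hunks above work as a unit: reset_parser now tokenizes eagerly, converts a LexerError into an ERROR concept, and reports failure through its return value, and parse checks that flag before touching the token stream. A condensed sketch of the contract (stand-in names; the real code goes through sheerka.ret and BuiltinConcepts.ERROR):

    class SketchParser:
        def reset_parser(self, tokenizer, text):
            self.error_sink = []
            try:
                # Eager tokenization: a lexer failure happens here or not at all.
                self.tokens = list(tokenizer(text))
            except LexerError as e:          # LexerError as sketched earlier
                self.error_sink.append(e)    # real code wraps it in an ERROR concept
                return False
            self.pos = -1
            return True

        def parse(self, tokenizer, text):
            if not self.reset_parser(tokenizer, text):
                return False, self.error_sink    # stand-in for sheerka.ret(name, False, ...)
            return True, self.tokens
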
@@ -183,6 +183,7 @@ class DefaultParser(BaseParser):
             self._current = next(self.lexer_iter)
         except StopIteration:
             self._current = None
+
         return
 
     def parse(self, context, text):
@@ -195,8 +196,12 @@ class DefaultParser(BaseParser):
             self.log_result(context, text, ret)
             return ret
 
-        self.reset_parser(context, text)
-        tree = self.parse_statement()
+        tree = None
+        try:
+            self.reset_parser(context, text)
+            tree = self.parse_statement()
+        except core.tokenizer.LexerError as e:
+            self.add_error(e, False)
 
         # If a error is found it must be sent to error_sink
         # tree must contain what was recognized
@@ -2,7 +2,7 @@ import logging
 
 from core.builtin_concepts import ReturnValueConcept, BuiltinConcepts
 from parsers.BaseParser import BaseParser
-from core.tokenizer import Tokenizer, Keywords, TokenKind
+from core.tokenizer import Tokenizer, Keywords, TokenKind, LexerError
 from core.concept import VARIABLE_PREFIX
 
 
@@ -27,7 +27,12 @@ class ExactConceptParser(BaseParser):
         context.log(self.verbose_log, f"Parsing '{text}'", self.name)
         res = []
         sheerka = context.sheerka
-        words = self.get_words(text)
+        try:
+            words = self.get_words(text)
+        except LexerError as e:
+            context.log(self.verbose_log, f"Error found in tokenizer {e}", self.name)
+            return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.ERROR, body=e))
+
         if len(words) > self.MAX_WORDS_SIZE:
             context.log(self.verbose_log, f"Max words reached. Stopping.", self.name)
             return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.CONCEPT_TOO_LONG, body=text))
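Every parser in this commit surfaces the failure the same way: a result whose status is False and whose body carries the LexerError, which is exactly what the new tests further down assert. Condensed usage, lifted from the DefaultParser test below:

    # Condensed from test_i_cannot_parse_when_tokenizer_fails below.
    parser = DefaultParser()
    res = parser.parse(get_context(), "'name")       # unterminated quote

    assert not res.status                            # the parse fails cleanly...
    assert isinstance(res.body.body[0], LexerError)  # ...and carries the LexerError
    assert res.body.body[0].message == "Missing Trailing quote"
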
@@ -1,5 +1,5 @@
 from core.builtin_concepts import BuiltinConcepts
-from core.tokenizer import Tokenizer
+from core.tokenizer import Tokenizer, LexerError
 from parsers.BaseParser import BaseParser, Node, ErrorNode
 from dataclasses import dataclass
 import ast
@@ -63,27 +63,33 @@ class PythonParser(BaseParser):
         self.source = kwargs.get("source", "<undef>")
 
     def parse(self, context, text):
-        if isinstance(text, str) and "c:" in text:
-            source = self.get_text_from_tokens(list(Tokenizer(text)))
-        elif isinstance(text, str):
-            source = text
-        else:
-            source = self.get_text_from_tokens(text)
-        source = source.strip()
-
-        text = text if isinstance(text, str) else source
-
         sheerka = context.sheerka
+        tree = None
 
-        # first, try to parse an expression
-        res, tree, error = self.try_parse_expression(source)
-        if not res:
-            # then try to parse a statement
-            res, tree, error = self.try_parse_statement(source)
-        if not res:
-            self.has_error = True
-            error_node = PythonErrorNode(text, error)
-            self.error_sink.append(error_node)
+        try:
+            if isinstance(text, str) and "c:" in text:
+                source = self.get_text_from_tokens(list(Tokenizer(text)))
+            elif isinstance(text, str):
+                source = text
+            else:
+                source = self.get_text_from_tokens(text)
+            source = source.strip()
+
+            text = text if isinstance(text, str) else source
+
+            # first, try to parse an expression
+            res, tree, error = self.try_parse_expression(source)
+            if not res:
+                # then try to parse a statement
+                res, tree, error = self.try_parse_statement(source)
+            if not res:
+                self.has_error = True
+                error_node = PythonErrorNode(text, error)
+                self.error_sink.append(error_node)
+
+        except LexerError as e:
+            self.has_error = True
+            self.error_sink.append(e)
 
         ret = sheerka.ret(
             self.name,
@@ -180,3 +180,5 @@ def test_i_can_get_props_from_definition():
     ret_val = get_concept_definition("mult (('+'|'-') add)?", parsing_expression)
 
     assert AddConceptEvaluator.get_props(get_context(), ret_val, []) == ["add", "mult"]
+
+
@@ -2,7 +2,7 @@ import pytest
 
 from core.concept import Concept
 from core.sheerka import Sheerka, ExecutionContext
-from core.tokenizer import Tokenizer, TokenKind
+from core.tokenizer import Tokenizer, TokenKind, LexerError
 from parsers.BaseParser import UnexpectedTokenErrorNode
 from parsers.BnfParser import BnfParser, UnexpectedEndOfFileError
 from parsers.ConceptLexerParser import StrMatch, Optional, ZeroOrMore, OrderedChoice, Sequence, OneOrMore, \
@@ -80,6 +80,7 @@ def test_i_can_parse_regex(expression, expected):
     ("1|", UnexpectedEndOfFileError()),
     ("(1|)", UnexpectedTokenErrorNode("Unexpected token 'Token(<EOF>)'", [TokenKind.RPAR])),
     ("1=", UnexpectedTokenErrorNode("Unexpected token 'Token(<EOF>)'", [TokenKind.IDENTIFIER])),
+    ("'name", LexerError("Missing Trailing quote", "'name", 5, 1, 6))
 ])
 def test_i_can_detect_errors(expression, error):
     parser = BnfParser()
@@ -5,12 +5,11 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa
 from core.sheerka import Sheerka, ExecutionContext
 from parsers.ConceptLexerParser import OrderedChoice, StrMatch, ConceptMatch
 from parsers.PythonParser import PythonParser, PythonNode
-from core.tokenizer import Keywords, Tokenizer
+from core.tokenizer import Keywords, Tokenizer, LexerError
 from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode, IsaConceptNode
 from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode
 from parsers.BnfParser import BnfParser
-
 
 from sdp.sheerkaDataProvider import Event
 
 
@@ -321,3 +320,22 @@ def test_i_cannot_parse_invalid_entries(text):
     assert not res.status
     assert isinstance(res.body, ParserResultConcept)
     assert isinstance(res.body.body[0], UnexpectedTokenErrorNode)
+
+
+@pytest.mark.parametrize("text, error_msg, error_text", [
+    ("'name", "Missing Trailing quote", "'name"),
+    ("foo isa 'name", "Missing Trailing quote", "'name"),
+    ("def concept 'name", "Missing Trailing quote", "'name"),
+    ("def concept name as 'body", "Missing Trailing quote", "'body"),
+    ("def concept name from bnf 'expression", "Missing Trailing quote", "'expression"),
+    ("def concept c::", "Concept name not found", ""),
+])
+def test_i_cannot_parse_when_tokenizer_fails(text, error_msg, error_text):
+    parser = DefaultParser()
+    res = parser.parse(get_context(), text)
+
+    assert not res.status
+    assert isinstance(res.body, ParserResultConcept)
+    assert isinstance(res.body.body[0], LexerError)
+    assert res.body.body[0].message == error_msg
+    assert res.body.body[0].text == error_text
@@ -4,7 +4,7 @@ import pytest
 
 from core.builtin_concepts import ParserResultConcept
 from core.sheerka import Sheerka, ExecutionContext
-from core.tokenizer import Tokenizer
+from core.tokenizer import Tokenizer, LexerError
 from parsers.PythonParser import PythonNode, PythonParser, PythonErrorNode
 from sdp.sheerkaDataProvider import Event
 
@@ -44,7 +44,12 @@ def test_i_can_parse_from_tokens(text, expected):
     assert res.value.value == expected
 
 
-def test_i_can_detect_error():
+@pytest.mark.parametrize("text", [
+    "1+",
+    "'name",
+    "foo = 'name"
+])
+def test_i_can_detect_error(text):
     text = "1+"
 
     parser = PythonParser()
@@ -57,6 +62,21 @@ def test_i_can_detect_error():
     assert isinstance(res.value.value[0].exception, SyntaxError)
 
 
+@pytest.mark.parametrize("text, error_msg, error_text", [
+    ("c::", "Concept name not found", ""),
+    ("c:: + 1", "Concept name not found", ""),
+])
+def test_i_can_detect_lexer_errors(text, error_msg, error_text):
+    parser = PythonParser()
+    res = parser.parse(get_context(), text)
+
+    assert not res.status
+    assert isinstance(res.body, ParserResultConcept)
+    assert isinstance(res.body.body[0], LexerError)
+    assert res.body.body[0].message == error_msg
+    assert res.body.body[0].text == error_text
+
+
 def test_i_can_parse_a_concept():
     text = "c:concept_name: + 1"
 
@@ -416,3 +416,23 @@ def test_eval_does_not_break_valid_result():
     assert len(res) == 1
     assert res[0].status
     assert res[0].body == 3
+
+
+@pytest.mark.parametrize("text", [
+    "'hello",
+    '"foo" + "string',
+    "c::",
+    "c:foo\nbar:",
+    "c:foo",
+    "def concept 'name",
+    "def concept name from bnf 'name"
+])
+def test_i_can_manage_tokenizer_error(text):
+    sheerka = get_sheerka()
+    sheerka.add_in_cache(Concept("foo"))
+
+    res = sheerka.evaluate_user_input(text)
+
+    assert len(res) > 1
+    for r in [r for r in res if r.who.startswith("parsers.")]:
+        assert not r.status
@@ -66,11 +66,11 @@ def test_i_can_tokenize_identifiers(text, expected):
     ('"string', "Missing Trailing quote", '"string', 7, 1, 8),
     ('"a" + "string', "Missing Trailing quote", '"string', 13, 1, 14),
     ('"a"\n\n"string', "Missing Trailing quote", '"string', 12, 3, 8),
-    ("c::", "Context name not found", "", 2, 1, 3),
+    ("c::", "Concept name not found", "", 2, 1, 3),
     ("c:foo\nbar:", "New line is forbidden in concept name", "foo", 5, 1, 6),
     ("c:foo", "Missing ending colon", "foo", 5, 1, 6)
 ])
-def test_i_can_detect_unfinished_strings(text, message, error_text, index, line, column):
+def test_i_can_detect_tokenizer_errors(text, message, error_text, index, line, column):
     with pytest.raises(LexerError) as e:
         list(Tokenizer(text))
     assert e.value.message == message