From adcbc6bb2eb2cd89da1df4a9e1fe9247e058df52 Mon Sep 17 00:00:00 2001 From: Kodjo Sossouvi Date: Tue, 31 Dec 2019 18:28:04 +0100 Subject: [PATCH] Tokenizer exceptions are not caught --- core/tokenizer.py | 2 +- parsers/BnfParser.py | 17 +++++++----- parsers/ConceptLexerParser.py | 14 ++++++++-- parsers/DefaultParser.py | 9 +++++-- parsers/ExactConceptParser.py | 9 +++++-- parsers/PythonParser.py | 44 ++++++++++++++++++------------- tests/test_AddConceptEvaluator.py | 2 ++ tests/test_BnfParser.py | 3 ++- tests/test_DefaultParser.py | 22 ++++++++++++++-- tests/test_PythonParser.py | 24 +++++++++++++++-- tests/test_sheerka_non_reg.py | 20 ++++++++++++++ tests/test_tokenizer.py | 4 +-- 12 files changed, 131 insertions(+), 39 deletions(-) diff --git a/core/tokenizer.py b/core/tokenizer.py index bf06e4a..e9c9b38 100644 --- a/core/tokenizer.py +++ b/core/tokenizer.py @@ -261,7 +261,7 @@ class Tokenizer: raise LexerError(f"Missing ending colon", result, i, line, column + 2 + len(result)) if result == "": - raise LexerError(f"Context name not found", result, start, line, column + 2 + len(result)) + raise LexerError(f"Concept name not found", result, start, line, column + 2 + len(result)) return result diff --git a/parsers/BnfParser.py b/parsers/BnfParser.py index 8f2cd79..a9ad97e 100644 --- a/parsers/BnfParser.py +++ b/parsers/BnfParser.py @@ -3,7 +3,7 @@ from dataclasses import dataclass import core.utils from core.builtin_concepts import BuiltinConcepts from core.sheerka import ExecutionContext -from core.tokenizer import Tokenizer, Token, TokenKind +from core.tokenizer import Tokenizer, Token, TokenKind, LexerError from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode from parsers.ConceptLexerParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptMatch, StrMatch @@ -114,12 +114,17 @@ class BnfParser: return token.type == second or token.type == first and self.next_after().type == second def parse(self, context: 
ExecutionContext, text): - self.reset_parser(context, text) - tree = self.parser_outer_rule_name() - token = self.get_token() - if token and token.type != TokenKind.EOF: - self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [])) + tree = None + try: + self.reset_parser(context, text) + tree = self.parser_outer_rule_name() + + token = self.get_token() + if token and token.type != TokenKind.EOF: + self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [])) + except LexerError as e: + self.add_error(e, False) ret = self.sheerka.ret( self.name, diff --git a/parsers/ConceptLexerParser.py b/parsers/ConceptLexerParser.py index 4f9a416..862da6a 100644 --- a/parsers/ConceptLexerParser.py +++ b/parsers/ConceptLexerParser.py @@ -555,7 +555,12 @@ class ConceptLexerParser(BaseParser): self.text = text if isinstance(text, str): - self.tokens = list(Tokenizer(text)) + try: + self.tokens = list(Tokenizer(text)) + except core.tokenizer.LexerError as e: + self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False) + return False + else: self.tokens = list(text) self.tokens.append(Token(TokenKind.EOF, "", -1, -1, -1)) # make sure to finish with end of file token @@ -563,6 +568,7 @@ class ConceptLexerParser(BaseParser): self.token = None self.pos = -1 self.next_token() + return True def get_token(self) -> Token: return self.token @@ -724,7 +730,11 @@ class ConceptLexerParser(BaseParser): context.sheerka.new(BuiltinConcepts.IS_EMPTY) ) - self.reset_parser(context, text) + if not self.reset_parser(context, text): + return self.sheerka.ret( + self.name, + False, + context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)) concepts_found = [[]] unrecognized_tokens = None diff --git a/parsers/DefaultParser.py b/parsers/DefaultParser.py index c18f73f..8e2a2d9 100644 --- a/parsers/DefaultParser.py +++ b/parsers/DefaultParser.py @@ -183,6 +183,7 @@ class DefaultParser(BaseParser): self._current = next(self.lexer_iter) except StopIteration: 
self._current = None + return def parse(self, context, text): @@ -195,8 +196,12 @@ class DefaultParser(BaseParser): self.log_result(context, text, ret) return ret - self.reset_parser(context, text) - tree = self.parse_statement() + tree = None + try: + self.reset_parser(context, text) + tree = self.parse_statement() + except core.tokenizer.LexerError as e: + self.add_error(e, False) # If a error is found it must be sent to error_sink # tree must contain what was recognized diff --git a/parsers/ExactConceptParser.py b/parsers/ExactConceptParser.py index 427d01d..beb4c36 100644 --- a/parsers/ExactConceptParser.py +++ b/parsers/ExactConceptParser.py @@ -2,7 +2,7 @@ import logging from core.builtin_concepts import ReturnValueConcept, BuiltinConcepts from parsers.BaseParser import BaseParser -from core.tokenizer import Tokenizer, Keywords, TokenKind +from core.tokenizer import Tokenizer, Keywords, TokenKind, LexerError from core.concept import VARIABLE_PREFIX @@ -27,7 +27,12 @@ class ExactConceptParser(BaseParser): context.log(self.verbose_log, f"Parsing '{text}'", self.name) res = [] sheerka = context.sheerka - words = self.get_words(text) + try: + words = self.get_words(text) + except LexerError as e: + context.log(self.verbose_log, f"Error found in tokenizer {e}", self.name) + return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.ERROR, body=e)) + if len(words) > self.MAX_WORDS_SIZE: context.log(self.verbose_log, f"Max words reached. 
Stopping.", self.name) return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.CONCEPT_TOO_LONG, body=text)) diff --git a/parsers/PythonParser.py b/parsers/PythonParser.py index 773d2bd..a377785 100644 --- a/parsers/PythonParser.py +++ b/parsers/PythonParser.py @@ -1,5 +1,5 @@ from core.builtin_concepts import BuiltinConcepts -from core.tokenizer import Tokenizer +from core.tokenizer import Tokenizer, LexerError from parsers.BaseParser import BaseParser, Node, ErrorNode from dataclasses import dataclass import ast @@ -63,27 +63,33 @@ class PythonParser(BaseParser): self.source = kwargs.get("source", "") def parse(self, context, text): - if isinstance(text, str) and "c:" in text: - source = self.get_text_from_tokens(list(Tokenizer(text))) - elif isinstance(text, str): - source = text - else: - source = self.get_text_from_tokens(text) - source = source.strip() - - text = text if isinstance(text, str) else source - sheerka = context.sheerka + tree = None - # first, try to parse an expression - res, tree, error = self.try_parse_expression(source) - if not res: - # then try to parse a statement - res, tree, error = self.try_parse_statement(source) + try: + if isinstance(text, str) and "c:" in text: + source = self.get_text_from_tokens(list(Tokenizer(text))) + elif isinstance(text, str): + source = text + else: + source = self.get_text_from_tokens(text) + source = source.strip() + + text = text if isinstance(text, str) else source + + # first, try to parse an expression + res, tree, error = self.try_parse_expression(source) if not res: - self.has_error = True - error_node = PythonErrorNode(text, error) - self.error_sink.append(error_node) + # then try to parse a statement + res, tree, error = self.try_parse_statement(source) + if not res: + self.has_error = True + error_node = PythonErrorNode(text, error) + self.error_sink.append(error_node) + + except LexerError as e: + self.has_error = True + self.error_sink.append(e) ret = sheerka.ret( self.name, diff --git 
a/tests/test_AddConceptEvaluator.py b/tests/test_AddConceptEvaluator.py index 721b63c..cb35868 100644 --- a/tests/test_AddConceptEvaluator.py +++ b/tests/test_AddConceptEvaluator.py @@ -180,3 +180,5 @@ def test_i_can_get_props_from_definition(): ret_val = get_concept_definition("mult (('+'|'-') add)?", parsing_expression) assert AddConceptEvaluator.get_props(get_context(), ret_val, []) == ["add", "mult"] + + diff --git a/tests/test_BnfParser.py b/tests/test_BnfParser.py index ba714a9..cb0cc4f 100644 --- a/tests/test_BnfParser.py +++ b/tests/test_BnfParser.py @@ -2,7 +2,7 @@ import pytest from core.concept import Concept from core.sheerka import Sheerka, ExecutionContext -from core.tokenizer import Tokenizer, TokenKind +from core.tokenizer import Tokenizer, TokenKind, LexerError from parsers.BaseParser import UnexpectedTokenErrorNode from parsers.BnfParser import BnfParser, UnexpectedEndOfFileError from parsers.ConceptLexerParser import StrMatch, Optional, ZeroOrMore, OrderedChoice, Sequence, OneOrMore, \ @@ -80,6 +80,7 @@ def test_i_can_parse_regex(expression, expected): ("1|", UnexpectedEndOfFileError()), ("(1|)", UnexpectedTokenErrorNode("Unexpected token 'Token()'", [TokenKind.RPAR])), ("1=", UnexpectedTokenErrorNode("Unexpected token 'Token()'", [TokenKind.IDENTIFIER])), + ("'name", LexerError("Missing Trailing quote", "'name", 5, 1, 6)) ]) def test_i_can_detect_errors(expression, error): parser = BnfParser() diff --git a/tests/test_DefaultParser.py b/tests/test_DefaultParser.py index 68b1c49..1fefe92 100644 --- a/tests/test_DefaultParser.py +++ b/tests/test_DefaultParser.py @@ -5,12 +5,11 @@ from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnVa from core.sheerka import Sheerka, ExecutionContext from parsers.ConceptLexerParser import OrderedChoice, StrMatch, ConceptMatch from parsers.PythonParser import PythonParser, PythonNode -from core.tokenizer import Keywords, Tokenizer +from core.tokenizer import Keywords, Tokenizer, LexerError 
from parsers.DefaultParser import DefaultParser, NameNode, SyntaxErrorNode, CannotHandleErrorNode, IsaConceptNode from parsers.DefaultParser import UnexpectedTokenErrorNode, DefConceptNode from parsers.BnfParser import BnfParser - from sdp.sheerkaDataProvider import Event @@ -321,3 +320,22 @@ def test_i_cannot_parse_invalid_entries(text): assert not res.status assert isinstance(res.body, ParserResultConcept) assert isinstance(res.body.body[0], UnexpectedTokenErrorNode) + + +@pytest.mark.parametrize("text, error_msg, error_text", [ + ("'name", "Missing Trailing quote", "'name"), + ("foo isa 'name", "Missing Trailing quote", "'name"), + ("def concept 'name", "Missing Trailing quote", "'name"), + ("def concept name as 'body", "Missing Trailing quote", "'body"), + ("def concept name from bnf 'expression", "Missing Trailing quote", "'expression"), + ("def concept c::", "Concept name not found", ""), +]) +def test_i_cannot_parse_when_tokenizer_fails(text, error_msg, error_text): + parser = DefaultParser() + res = parser.parse(get_context(), text) + + assert not res.status + assert isinstance(res.body, ParserResultConcept) + assert isinstance(res.body.body[0], LexerError) + assert res.body.body[0].message == error_msg + assert res.body.body[0].text == error_text diff --git a/tests/test_PythonParser.py b/tests/test_PythonParser.py index 24db25b..7df3668 100644 --- a/tests/test_PythonParser.py +++ b/tests/test_PythonParser.py @@ -4,7 +4,7 @@ import pytest from core.builtin_concepts import ParserResultConcept from core.sheerka import Sheerka, ExecutionContext -from core.tokenizer import Tokenizer +from core.tokenizer import Tokenizer, LexerError from parsers.PythonParser import PythonNode, PythonParser, PythonErrorNode from sdp.sheerkaDataProvider import Event @@ -44,7 +44,12 @@ def test_i_can_parse_from_tokens(text, expected): assert res.value.value == expected -def test_i_can_detect_error(): +@pytest.mark.parametrize("text", [ + "1+", + "'name", + "foo = 'name" +]) +def 
test_i_can_detect_error(text): text = "1+" parser = PythonParser() @@ -57,6 +62,21 @@ def test_i_can_detect_error(): assert isinstance(res.value.value[0].exception, SyntaxError) +@pytest.mark.parametrize("text, error_msg, error_text", [ + ("c::", "Concept name not found", ""), + ("c:: + 1", "Concept name not found", ""), +]) +def test_i_can_detect_lexer_errors(text, error_msg, error_text): + parser = PythonParser() + res = parser.parse(get_context(), text) + + assert not res.status + assert isinstance(res.body, ParserResultConcept) + assert isinstance(res.body.body[0], LexerError) + assert res.body.body[0].message == error_msg + assert res.body.body[0].text == error_text + + def test_i_can_parse_a_concept(): text = "c:concept_name: + 1" diff --git a/tests/test_sheerka_non_reg.py b/tests/test_sheerka_non_reg.py index 87ed695..3c38843 100644 --- a/tests/test_sheerka_non_reg.py +++ b/tests/test_sheerka_non_reg.py @@ -416,3 +416,23 @@ def test_eval_does_not_break_valid_result(): assert len(res) == 1 assert res[0].status assert res[0].body == 3 + + +@pytest.mark.parametrize("text", [ + "'hello", + '"foo" + "string', + "c::", + "c:foo\nbar:", + "c:foo", + "def concept 'name", + "def concept name from bnf 'name" +]) +def test_i_can_manage_tokenizer_error(text): + sheerka = get_sheerka() + sheerka.add_in_cache(Concept("foo")) + + res = sheerka.evaluate_user_input(text) + + assert len(res) > 1 + for r in [r for r in res if r.who.startswith("parsers.")]: + assert not r.status diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 3584d2c..dd9440b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -66,11 +66,11 @@ def test_i_can_tokenize_identifiers(text, expected): ('"string', "Missing Trailing quote", '"string', 7, 1, 8), ('"a" + "string', "Missing Trailing quote", '"string', 13, 1, 14), ('"a"\n\n"string', "Missing Trailing quote", '"string', 12, 3, 8), - ("c::", "Context name not found", "", 2, 1, 3), + ("c::", "Concept name not found", "", 
2, 1, 3), ("c:foo\nbar:", "New line is forbidden in concept name", "foo", 5, 1, 6), ("c:foo", "Missing ending colon", "foo", 5, 1, 6) ]) -def test_i_can_detect_unfinished_strings(text, message, error_text, index, line, column): +def test_i_can_detect_tokenizer_errors(text, message, error_text, index, line, column): with pytest.raises(LexerError) as e: list(Tokenizer(text)) assert e.value.message == message