ConceptLexerParser can now handle UnrecognizedTokens

This commit is contained in:
2019-12-26 15:20:45 +01:00
parent bcb2308ea5
commit 26daae4acf
8 changed files with 483 additions and 125 deletions
+290 -81
View File
@@ -2,8 +2,10 @@ import pytest
from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from core.tokenizer import Tokenizer, TokenKind, Token
from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore, \
UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -25,6 +27,16 @@ def u(parsing_expression, start, end, children=None):
return NonTerminalNode(parsing_expression, start, end, [], children)
def t(text):
    """Build a Token for ``text``, choosing its kind from the first character.

    A leading single or double quote gives a STRING token, a leading space
    gives a WHITESPACE token, and anything else gives an IDENTIFIER token.
    The three trailing Token arguments are always passed as 0.
    """
    if text.startswith(("'", '"')):
        kind = TokenKind.STRING
    elif text.startswith(" "):
        kind = TokenKind.WHITESPACE
    else:
        kind = TokenKind.IDENTIFIER
    return Token(kind, text, 0, 0, 0)
@pytest.mark.parametrize("match, text", [
("foo", "foo"),
("'foo'", "'foo'"),
@@ -70,36 +82,6 @@ def test_i_can_match_multiple_concepts_in_one_input():
]
def test_i_cannot_match_an_unknown_input():
    """Parsing "foo" with an uninitialized parser fails with an UNKNOWN_CONCEPT."""
    context = get_context()
    parser = ConceptLexerParser()  # no grammar registered

    outcome = parser.parse(context, "foo")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # The first body entry must be the unknown concept carrying the raw input.
    first_entry = outcome.value.body[0]
    assert context.sheerka.isinstance(first_entry, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert outcome.value.body[0].body == "foo"
def test_i_cannot_match_when_part_of_the_input_is_unknown():
    """ "one" and "two" are recognized, but the trailing "three" makes the parse fail."""
    context = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    grammar = {one: "one", two: "two"}

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    outcome = parser.parse(context, "one two three")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # these two were recognized
    assert outcome.value.try_parsed == [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
    ]

    # ... while the last word is reported as an unknown concept.
    assert context.sheerka.isinstance(outcome.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert outcome.value.body[0].body == "three"
def test_i_can_match_sequence():
context = get_context()
foo = Concept(name="foo")
@@ -118,37 +100,6 @@ def test_i_can_match_sequence():
u("three", 4, 4)]))]
def test_wrong_sequence_is_not_matched():
    """A full "one two three" sequence followed by a stray "one" fails to parse."""
    context = get_context()
    foo = Concept(name="foo")

    parser = ConceptLexerParser()
    parser.initialize(context, {foo: Sequence("one", "two", "three")})
    outcome = parser.parse(context, "one two three one")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # The complete sequence is listed as tried; the leftover word is unknown.
    assert outcome.value.try_parsed == [(foo, "one two three")]
    assert context.sheerka.isinstance(outcome.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert outcome.value.body[0].body == "one"
def test_i_cannot_match_sequence_if_end_of_file():
    """Input ending before the three-word sequence completes is not matched."""
    context = get_context()
    foo = Concept(name="foo")

    parser = ConceptLexerParser()
    parser.initialize(context, {foo: Sequence("one", "two", "three")})
    outcome = parser.parse(context, "one two")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # Nothing was recognized, and the first word is reported as unknown.
    assert outcome.value.try_parsed == []
    assert context.sheerka.isinstance(outcome.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
    assert outcome.value.body[0].body == "one"
def test_i_always_choose_the_longest_match():
context = get_context()
foo = Concept(name="foo")
@@ -205,8 +156,10 @@ def test_i_can_match_ordered_choice():
res3 = parser.parse(context, "three")
assert not res3.status
assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res3.value.body[0].body == "three"
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 0, [t("three")])
]
def test_i_cannot_match_ordered_choice_with_empty_alternative():
@@ -218,6 +171,10 @@ def test_i_cannot_match_ordered_choice_with_empty_alternative():
res = parser.parse(context, "ok") # because token[0] is not "one" and not "" (it is 'two')
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("ok")])
]
def test_i_can_mix_sequences_and_ordered_choices():
@@ -248,8 +205,10 @@ def test_i_can_mix_sequences_and_ordered_choices():
res3 = parser.parse(context, "twenty one")
assert not res3.status
assert res3.value.body[0].body == "twenty"
assert res3.value.try_parsed == []
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 2, [t("twenty"), t(" "), t("one")])
]
def test_i_can_mix_ordered_choices_and_sequences():
@@ -364,9 +323,9 @@ def test_i_cannot_parse_wrong_input_with_optional():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_use_reference():
@@ -463,7 +422,63 @@ def test_i_can_parse_when_reference():
assert res.value.body == [(foo, 0, 0, "twenty")]
def test_i_can_detect_duplicates_when_reference():
def test_i_can_parse_multiple_results():
    """ "one two" matches both grammars, so the parser returns two successful results."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "one two")

    assert len(results) == 2

    # The test expects bar's result first, then foo's.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(concept, 0, 2, "one two")]
def test_i_can_parse_multiple_results_times_two():
    """Two ambiguous "one two" groups give four successful results (2 x 2 combinations)."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "one two one two")

    assert len(results) == 4

    # Expected (first group, second group) concept pair for each result index.
    expected_pairs = [
        (bar, bar),
        (foo, bar),
        (bar, foo),
        (foo, foo),
    ]
    for index, (first, second) in enumerate(expected_pairs):
        result = results[index]
        assert result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(first, "one two"), (second, "one two")]
def test_i_can_parse_multiple_results_when_reference():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
@@ -557,17 +572,17 @@ def test_i_cannot_parse_zero_and_more_when_wrong_entry():
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)]))]
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)])),
UnrecognizedTokensNode(2, 2, [t("two")])
]
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_zero_and_more_with_separator():
@@ -636,10 +651,9 @@ def test_i_can_parse_sequence_and_one_or_more():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.body == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_one_and_more_with_separator():
@@ -803,6 +817,201 @@ def test_i_can_initialize_rule_names():
assert return_value[bar].rule_name == "foo"
@pytest.mark.parametrize("text, end_position", [
    ("foo", 0),
    ("foo bar", 2),
])
def test_cannot_parser_unknown_concepts(text, end_position):
    """With an empty grammar, the whole input becomes one UnrecognizedTokensNode."""
    context = get_context()
    parser = ConceptLexerParser()
    parser.initialize(context, {})

    outcome = parser.parse(context, text)

    # Every token of the input except the last one produced by the Tokenizer
    # (presumably an end-of-input marker -- confirm against core.tokenizer).
    expected_tokens = list(Tokenizer(text))[:-1]

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    assert outcome.value.value == [UnrecognizedTokensNode(0, end_position, expected_tokens)]
def test_i_cannot_parse_when_part_of_the_input_is_unrecognized():
    """ "one" and "two" yield ConceptNodes; the trailing "three" yields an UnrecognizedTokensNode."""
    context = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    grammar = {one: "one", two: "two"}

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    outcome = parser.parse(context, "one two three")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    assert outcome.value.value == [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
        UnrecognizedTokensNode(4, 4, [t("three")]),
    ]
def test_i_cannot_parse_when_wrong_sequence():
    """A matched "one two three" sequence followed by a stray "one" fails to parse."""
    context = get_context()
    foo = Concept(name="foo")

    parser = ConceptLexerParser()
    parser.initialize(context, {foo: Sequence("one", "two", "three")})
    outcome = parser.parse(context, "one two three one")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # The recognized sequence comes first, then the leftover token at position 6.
    assert outcome.value.value == [
        (foo, "one two three"),
        UnrecognizedTokensNode(6, 6, [t("one")]),
    ]
def test_i_cannot_parse_when_sequence_cannot_match_because_of_end_of_file():
    """Input ending before the sequence completes is reported entirely as unrecognized."""
    context = get_context()
    foo = Concept(name="foo")

    parser = ConceptLexerParser()
    parser.initialize(context, {foo: Sequence("one", "two", "three")})
    outcome = parser.parse(context, "one two")

    assert not outcome.status
    assert context.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)

    # Both words and the whitespace between them land in a single node (0..2).
    assert outcome.value.value == [
        UnrecognizedTokensNode(0, 2, [t("one"), t(" "), t("two")]),
    ]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_at_the_end():
    """An ambiguous "one two" followed by unknown words gives two failed results."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "one two four five")

    assert len(results) == 2

    # The test expects bar's result first, then foo's; both end with the same
    # unrecognized tail.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            (concept, 0, 2, "one two"),
            UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")]),
        ]
def test_i_cannot_parse_multiple_results_when_beginning_by_unknown_tokens():
    """Unknown words before an ambiguous "one two" give two failed results."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "four five one two")

    assert len(results) == 2

    # The test expects bar's result first, then foo's. The leading unrecognized
    # node includes the whitespace that precedes the matched concept (0..3).
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
            (concept, 4, 6, "one two"),
        ]
def test_i_cannot_parse_multiple_results_when_surrounded_by_unknown_tokens():
    """Unknown words on both sides of an ambiguous "one two" give two failed results."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "four five one two six seven")

    assert len(results) == 2

    # The test expects bar's result first, then foo's; the unrecognized nodes
    # around the concept are the same in both results.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
            (concept, 4, 6, "one two"),
            UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
        ]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_in_the_middle():
    """Unknown words between an ambiguous "one two" and "six" give two failed results."""
    context = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    baz = Concept(name="baz")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
        baz: StrMatch("six"),
    }

    parser = ConceptLexerParser()
    parser.initialize(context, grammar)
    results = parser.parse(context, "one two four five six")

    assert len(results) == 2

    # The test expects bar's result first, then foo's; the unrecognized middle
    # and the trailing baz match are the same in both results.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert context.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            (concept, 0, 2, "one two"),
            UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
            (baz, 8, 8, "six"),
        ]
#
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
# context = get_context()
+5 -1
View File
@@ -5,7 +5,7 @@ from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from evaluators.ConceptNodeEvaluator import ConceptNodeEvaluator
from parsers.ConceptLexerParser import ConceptNode, ConceptLexerParser, Sequence, TerminalNode, \
StrMatch, Optional, OrderedChoice, ZeroOrMore
StrMatch, Optional, OrderedChoice, ZeroOrMore, UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -37,8 +37,12 @@ def get_concept_node(context, grammar, expression):
@pytest.mark.parametrize("ret_val, expected", [
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), True),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value="Not a concept node")), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=["Not a concept node"])), False),
(ReturnValueConcept("some_name", True, [ConceptNode(Concept(), 0, 0)]), False),
+2
View File
@@ -311,6 +311,8 @@ def test_i_can_parse_is_a():
"concept",
"isa number",
"name isa",
"def",
"def concept_name"
])
def test_i_cannot_parse_invalid_entries(text):
parser = DefaultParser()
+2 -1
View File
@@ -7,6 +7,7 @@ from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept
from core.concept import Concept, PROPERTIES_TO_SERIALIZE, Property
from core.sheerka import Sheerka, ExecutionContext
from evaluators.MutipleSameSuccessEvaluator import MultipleSameSuccessEvaluator
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import Sequence, ZeroOrMore, StrMatch, OrderedChoice, Optional, ConceptMatch, \
ConceptLexerParser
from sdp.sheerkaDataProvider import SheerkaDataProvider, Event
@@ -291,7 +292,7 @@ def test_i_can_manage_concepts_with_the_same_key_when_values_are_the_same():
res = sheerka.evaluate_user_input("hello 'foo'")
assert len(res) == 1
assert res[0].status
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].who == sheerka.get_evaluator_name(MultipleSameSuccessEvaluator.NAME)