ConceptLexerParser can now handle UnrecognizedTokens
This commit is contained in:
@@ -2,8 +2,10 @@ import pytest
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.concept import Concept
|
||||
from core.sheerka import Sheerka, ExecutionContext
|
||||
from core.tokenizer import Tokenizer, TokenKind, Token
|
||||
from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \
|
||||
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore
|
||||
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore, \
|
||||
UnrecognizedTokensNode
|
||||
from sdp.sheerkaDataProvider import Event
|
||||
|
||||
|
||||
@@ -25,6 +27,16 @@ def u(parsing_expression, start, end, children=None):
|
||||
return NonTerminalNode(parsing_expression, start, end, [], children)
|
||||
|
||||
|
||||
def t(text):
    """Build a ``Token`` for ``text``, picking its kind from the first character.

    A leading single or double quote gives ``TokenKind.STRING``, a leading
    space gives ``TokenKind.WHITESPACE``, and anything else (including the
    empty string) gives ``TokenKind.IDENTIFIER``. The three remaining
    positional arguments of ``Token`` are always passed as 0.
    """
    if text.startswith(("'", '"')):
        kind = TokenKind.STRING
    elif text.startswith(" "):
        kind = TokenKind.WHITESPACE
    else:
        kind = TokenKind.IDENTIFIER
    return Token(kind, text, 0, 0, 0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("match, text", [
|
||||
("foo", "foo"),
|
||||
("'foo'", "'foo'"),
|
||||
@@ -70,36 +82,6 @@ def test_i_can_match_multiple_concepts_in_one_input():
|
||||
]
|
||||
|
||||
|
||||
def test_i_cannot_match_an_unknown_input():
    """A parser that was never initialized fails and reports the input as an unknown concept."""
    ctx = get_context()
    # initialize() is deliberately not called: no grammar is registered.
    lexer_parser = ConceptLexerParser()

    outcome = lexer_parser.parse(ctx, "foo")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    unknown = outcome.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "foo"
|
||||
|
||||
|
||||
def test_i_cannot_match_when_part_of_the_input_is_unknown():
    """The parse fails on the trailing "three" while "one" and "two" stay in ``try_parsed``."""
    ctx = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {one: "one", two: "two"})

    outcome = lexer_parser.parse(ctx, "one two three")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    # These two concepts were recognized before the unknown word was reached.
    recognized = [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
    ]
    assert outcome.value.try_parsed == recognized
    unknown = outcome.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "three"
|
||||
|
||||
|
||||
def test_i_can_match_sequence():
|
||||
context = get_context()
|
||||
foo = Concept(name="foo")
|
||||
@@ -118,37 +100,6 @@ def test_i_can_match_sequence():
|
||||
u("three", 4, 4)]))]
|
||||
|
||||
|
||||
def test_wrong_sequence_is_not_matched():
    """A matched sequence followed by a stray "one" fails, with the stray word reported as unknown."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {foo: Sequence("one", "two", "three")})

    outcome = lexer_parser.parse(ctx, "one two three one")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    assert outcome.value.try_parsed == [(foo, "one two three")]
    unknown = outcome.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "one"
|
||||
|
||||
|
||||
def test_i_cannot_match_sequence_if_end_of_file():
    """Input that ends before the three-word sequence is complete yields no partial match."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {foo: Sequence("one", "two", "three")})

    outcome = lexer_parser.parse(ctx, "one two")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    assert outcome.value.try_parsed == []
    unknown = outcome.value.body[0]
    assert ctx.sheerka.isinstance(unknown, BuiltinConcepts.UNKNOWN_CONCEPT)
    assert unknown.body == "one"
|
||||
|
||||
|
||||
def test_i_always_choose_the_longest_match():
|
||||
context = get_context()
|
||||
foo = Concept(name="foo")
|
||||
@@ -205,8 +156,10 @@ def test_i_can_match_ordered_choice():
|
||||
|
||||
res3 = parser.parse(context, "three")
|
||||
assert not res3.status
|
||||
assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
|
||||
assert res3.value.body[0].body == "three"
|
||||
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res3.value.value == [
|
||||
UnrecognizedTokensNode(0, 0, [t("three")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_cannot_match_ordered_choice_with_empty_alternative():
|
||||
@@ -218,6 +171,10 @@ def test_i_cannot_match_ordered_choice_with_empty_alternative():
|
||||
|
||||
res = parser.parse(context, "ok") # because token[0] is not "one" and not "" (it is 'two')
|
||||
assert not res.status
|
||||
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res.value.value == [
|
||||
UnrecognizedTokensNode(0, 0, [t("ok")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_can_mix_sequences_and_ordered_choices():
|
||||
@@ -248,8 +205,10 @@ def test_i_can_mix_sequences_and_ordered_choices():
|
||||
|
||||
res3 = parser.parse(context, "twenty one")
|
||||
assert not res3.status
|
||||
assert res3.value.body[0].body == "twenty"
|
||||
assert res3.value.try_parsed == []
|
||||
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res3.value.value == [
|
||||
UnrecognizedTokensNode(0, 2, [t("twenty"), t(" "), t("one")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_can_mix_ordered_choices_and_sequences():
|
||||
@@ -364,9 +323,9 @@ def test_i_cannot_parse_wrong_input_with_optional():
|
||||
res = parser.parse(context, "two")
|
||||
assert not res.status
|
||||
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res.value.try_parsed == []
|
||||
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
|
||||
assert res.value.body[0].body == "two"
|
||||
assert res.value.value == [
|
||||
UnrecognizedTokensNode(0, 0, [t("two")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_can_use_reference():
|
||||
@@ -463,7 +422,63 @@ def test_i_can_parse_when_reference():
|
||||
assert res.value.body == [(foo, 0, 0, "twenty")]
|
||||
|
||||
|
||||
def test_i_can_detect_duplicates_when_reference():
|
||||
def test_i_can_parse_multiple_results():
    """Two grammars that both accept "one two" produce two successful results, ``bar`` first."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "one two")

    assert len(results) == 2
    # The results are expected in the order the concepts were declared in the grammar.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(concept, 0, 2, "one two")]
|
||||
|
||||
|
||||
def test_i_can_parse_multiple_results_times_two():
    """Two ambiguous "one two" groups give four successful results, one per combination."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "one two one two")

    # (first group, second group) for each result, in the order they are expected.
    expected_pairs = [(bar, bar), (foo, bar), (bar, foo), (foo, foo)]
    assert len(results) == 4
    for index, (first, second) in enumerate(expected_pairs):
        result = results[index]
        assert result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [(first, "one two"), (second, "one two")]
|
||||
|
||||
|
||||
def test_i_can_parse_multiple_results_when_reference():
|
||||
context = get_context()
|
||||
foo = Concept(name="foo")
|
||||
bar = Concept(name="bar")
|
||||
@@ -557,17 +572,17 @@ def test_i_cannot_parse_zero_and_more_when_wrong_entry():
|
||||
res = parser.parse(context, "one two")
|
||||
assert not res.status
|
||||
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res.value.try_parsed == [
|
||||
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)]))]
|
||||
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
|
||||
assert res.value.body[0].body == "two"
|
||||
assert res.value.value == [
|
||||
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)])),
|
||||
UnrecognizedTokensNode(2, 2, [t("two")])
|
||||
]
|
||||
|
||||
res = parser.parse(context, "two")
|
||||
assert not res.status
|
||||
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res.value.try_parsed == []
|
||||
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
|
||||
assert res.value.body[0].body == "two"
|
||||
assert res.value.value == [
|
||||
UnrecognizedTokensNode(0, 0, [t("two")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_can_parse_zero_and_more_with_separator():
|
||||
@@ -636,10 +651,9 @@ def test_i_can_parse_sequence_and_one_or_more():
|
||||
|
||||
res = parser.parse(context, "two")
|
||||
assert not res.status
|
||||
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
|
||||
assert res.value.try_parsed == []
|
||||
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
|
||||
assert res.value.body[0].body == "two"
|
||||
assert res.value.body == [
|
||||
UnrecognizedTokensNode(0, 0, [t("two")])
|
||||
]
|
||||
|
||||
|
||||
def test_i_can_parse_one_and_more_with_separator():
|
||||
@@ -803,6 +817,201 @@ def test_i_can_initialize_rule_names():
|
||||
assert return_value[bar].rule_name == "foo"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text, end_position", [
    ("foo", 0),
    ("foo bar", 2),
])
def test_cannot_parser_unknown_concepts(text, end_position):
    """With an empty grammar the whole input comes back as one ``UnrecognizedTokensNode``."""
    ctx = get_context()
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {})

    outcome = lexer_parser.parse(ctx, text)
    # Every token except the last one the tokenizer yields
    # (presumably an end-of-input marker -- TODO confirm).
    all_tokens = list(Tokenizer(text))
    expected_tokens = all_tokens[:-1]

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    assert outcome.value.value == [UnrecognizedTokensNode(0, end_position, expected_tokens)]
|
||||
|
||||
|
||||
def test_i_cannot_parse_when_part_of_the_input_is_unrecognized():
    """Recognized concepts are followed by an ``UnrecognizedTokensNode`` for the trailing "three"."""
    ctx = get_context()
    one = Concept(name="one")
    two = Concept(name="two")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {one: "one", two: "two"})

    outcome = lexer_parser.parse(ctx, "one two three")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    expected = [
        ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
        ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
        UnrecognizedTokensNode(4, 4, [t("three")]),
    ]
    assert outcome.value.value == expected
|
||||
|
||||
|
||||
def test_i_cannot_parse_when_wrong_sequence():
    """A full sequence match followed by a stray "one" fails, with the stray word unrecognized."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {foo: Sequence("one", "two", "three")})

    outcome = lexer_parser.parse(ctx, "one two three one")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    expected = [
        (foo, "one two three"),
        UnrecognizedTokensNode(6, 6, [t("one")]),
    ]
    assert outcome.value.value == expected
|
||||
|
||||
|
||||
def test_i_cannot_parse_when_sequence_cannot_match_because_of_end_of_file():
    """Input that ends before the sequence completes is returned entirely as unrecognized tokens."""
    ctx = get_context()
    foo = Concept(name="foo")
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, {foo: Sequence("one", "two", "three")})

    outcome = lexer_parser.parse(ctx, "one two")

    assert not outcome.status
    assert ctx.sheerka.isinstance(outcome.value, BuiltinConcepts.PARSER_RESULT)
    expected = [
        UnrecognizedTokensNode(0, 2, [t("one"), t(" "), t("two")]),
    ]
    assert outcome.value.value == expected
|
||||
|
||||
|
||||
def test_i_cannot_parse_multiple_results_when_unknown_tokens_at_the_end():
    """Both ambiguous results fail and each ends with the same trailing unrecognized tokens."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "one two four five")

    assert len(results) == 2
    # One failed result per concept matching "one two", in grammar order.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            (concept, 0, 2, "one two"),
            UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")]),
        ]
|
||||
|
||||
|
||||
def test_i_cannot_parse_multiple_results_when_beginning_by_unknown_tokens():
    """Both ambiguous results fail and each starts with the same leading unrecognized tokens."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "four five one two")

    assert len(results) == 2
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        # The unrecognized node includes the whitespace just before the match.
        assert result.value.body == [
            UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
            (concept, 4, 6, "one two"),
        ]
|
||||
|
||||
|
||||
def test_i_cannot_parse_multiple_results_when_surrounded_by_unknown_tokens():
    """Both ambiguous results fail, with unrecognized tokens on each side of the match."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "four five one two six seven")

    assert len(results) == 2
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
            (concept, 4, 6, "one two"),
            UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
        ]
|
||||
|
||||
|
||||
def test_i_cannot_parse_multiple_results_when_unknown_tokens_in_the_middle():
    """Unrecognized tokens between two matches fail both ambiguous results; ``baz`` still matches."""
    ctx = get_context()
    foo = Concept(name="foo")
    bar = Concept(name="bar")
    baz = Concept(name="baz")
    grammar = {
        bar: Sequence("one", "two"),
        foo: Sequence("one", OrderedChoice("two", "three")),
        baz: StrMatch("six"),
    }
    lexer_parser = ConceptLexerParser()
    lexer_parser.initialize(ctx, grammar)

    results = lexer_parser.parse(ctx, "one two four five six")

    assert len(results) == 2
    # Only the leading "one two" is ambiguous; the middle and tail are the same in both results.
    for index, concept in enumerate((bar, foo)):
        result = results[index]
        assert not result.status
        assert ctx.sheerka.isinstance(result.value, BuiltinConcepts.PARSER_RESULT)
        assert result.value.body == [
            (concept, 0, 2, "one two"),
            UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
            (baz, 8, 8, "six"),
        ]
|
||||
|
||||
|
||||
#
|
||||
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
|
||||
# context = get_context()
|
||||
|
||||
Reference in New Issue
Block a user