ConceptLexerParser can now handle UnrecognizedTokens

This commit is contained in:
2019-12-26 15:20:45 +01:00
parent bcb2308ea5
commit 26daae4acf
8 changed files with 483 additions and 125 deletions
+97 -1
View File
@@ -19,7 +19,7 @@ For those you don't know this old cartoon, it's the Odyssey story from Homer,
ported in the 31st century. Ulysses has a spacecraft with an AI named Shyrka
I was a great fan of this cartoon when I was young. I thought that the idea of
bringing the ancient story of Ulysses in the future was a bright.
bringing the ancient story of Ulysses in the future was bright.
Ever since then, Sheerka was my reference for any sophisticated computer. Unfortunately
for me, at that time there was no wikipedia to tell the correct spelling.
@@ -654,3 +654,99 @@ For the two questions, I will first try the simple implementations and see there
* the entry in sdp will not be all_number, but all_id_of_number. I will use the concept id instead of its name
2019-24-12
**********
Going back on BNF implementation. As it's Christmas eve today, I won't stay very long.
So, the implementation lies in the class ConceptLexerParser, as it's a lexer not for tokens, but for concepts.
The purpose of this class is to recognize a sequence of Concept.
So if we define the following concepts
::
def concept foo from bnf one two three
def concept bar from bnf four five
when you input
::
one two three four five
the list of :code:`[foo, bar]` will be returned by the parser (as return values)
How does it work?
As explained in the code, my implementation is highly inspired by Arpegio project. To define your grammar, you
use **ParsingExpressions**. There are several types
* some are used to recognize tokens: StrMatch, ConceptMatch
* others are used to tell how to recognize them: Sequence, OrderedChoice, Optional, OneOrMore, ZeroOrMore...
Some examples:
::
to recognize 'foo' -> StrMatch("foo")
to recognize 'foo bar' -> Sequence(StrMatch("foo"), StrMatch("bar"))
to recognize 'foo' or 'bar' -> OrderedChoice(StrMatch("foo"), StrMatch("bar"))
and so on...
So when a concept is defined using its bnf definition, I use the **BnfParser** to create the grammar, and then
I use the **ConceptLexerParser** to recognize the concepts
The current implementation to recognize a concept is not very efficient. All the definitions are in a dictionary
and I go through the whole dictionary to see if some concepts are recognized. Once a concept is found, I loop again
on the whole dictionary to find the next concept.
| -> I need a btree to order the concept
| -> I need a predictive algorithm to guess the next concept
But it is for later.
So once the parsing is effective, I return a **ConceptNode** object
.. code-block:: python
class ConceptNode(LexerNode):
"""
Returned by the ConceptLexerParser
It represents a recognized concept
"""
def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
super().__init__(start, end, tokens, source)
self.concept = concept
self.underlying = underlying
if self.source is None:
self.source = BaseParser.get_text_from_tokens(self.tokens)
concept
| Remember that all grammars are listed in a dictionary of <Concept, ParsingExpression>.
| So when a parsing expression is verified, it's easy to link it with the concept
start
position of the first token
end
position of the last token
tokens
list of tokens that are recognized
underlying
**NonTerminalNode** or **TerminalNode** that wraps the underlying **ParsingExpression** used to recognize the concept
source
| The source is deduced from the tokens
| But in the unit tests, they are directly given for speed up and simplicity
What is the difference between the **[Non]TerminalNode** and the **ParsingExpression** ?
The ParsingExpression
defines how to recognize a concept
The [Non]TerminalNode
represents what was found. So similarly to the ConceptNode, you will find the start, end and token attributes
That's all for today !
+22 -11
View File
@@ -1,7 +1,7 @@
from core.builtin_concepts import ParserResultConcept, BuiltinConcepts
from evaluators.BaseEvaluator import OneReturnValueEvaluator
from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch
from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch, UnrecognizedTokensNode
class ConceptNodeEvaluator(OneReturnValueEvaluator):
@@ -17,15 +17,22 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator):
def matches(self, context, return_value):
if not return_value.status:
return False
if not isinstance(return_value.value, ParserResultConcept):
return False
return (isinstance(return_value.value.value, ConceptNode) or
return (
isinstance(return_value.value.value, ConceptNode) or
isinstance(return_value.value.value, UnrecognizedTokensNode) or
(
hasattr(return_value.value.value, "__iter__") and
len(return_value.value.value) > 0 and
(
hasattr(return_value.value.value, "__iter__") and
len(return_value.value.value) > 0 and
isinstance(return_value.value.value[0], ConceptNode)
))
isinstance(return_value.value.value[0], ConceptNode) or
isinstance(return_value.value.value[0], UnrecognizedTokensNode)
)
)
)
def eval(self, context, return_value):
"""
@@ -38,19 +45,23 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator):
nodes = [nodes]
concepts = []
error_found = False
for node in nodes:
concept = sheerka.new(node.concept.key)
concept = self.update_concept(sheerka, concept, node.underlying)
concepts.append(concept)
if isinstance(node, ConceptNode):
concept = sheerka.new(node.concept.key)
concept = self.update_concept(sheerka, concept, node.underlying)
concepts.append(concept)
else:
error_found = True
if len(concepts) == 1:
return sheerka.ret(
self.name,
True,
not error_found,
concepts[0],
parents=[return_value])
raise NotImplementedError("Not yet")
return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.NOT_FOR_ME), parents=[return_value])
def update_concept(self, sheerka, concept, underlying, init_empty_body=True):
"""
+62 -30
View File
@@ -47,7 +47,33 @@ class LexerNode(Node):
if not isinstance(other, LexerNode):
return False
return self.start == other.start and self.end == other.end
return self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.tokens == other.tokens
class UnrecognizedTokensNode(LexerNode):
    """
    A LexerNode that collects the consecutive tokens the ConceptLexerParser
    could NOT match against any registered concept grammar.

    The parser grows one of these token by token (via ``add_token``) while it
    skips over unrecognized input, then calls ``fix_source`` once the run of
    unknown tokens ends.
    """

    def __init__(self, start, end, tokens):
        # start/end are token positions; tokens is the list collected so far
        super().__init__(start, end, tokens)

    def add_token(self, token, pos):
        """Append one more unrecognized token and move the end position to pos."""
        self.tokens.append(token)
        self.end = pos

    def fix_source(self):
        """Rebuild the source text from the accumulated tokens.

        Called once the unrecognized run is complete, so the text is joined
        only once instead of after every add_token.
        """
        self.source = BaseParser.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        # Equality compares positions and the reconstructed source text,
        # deliberately NOT the token objects themselves — presumably so the
        # unit tests can build expected nodes from synthetic tokens with
        # dummy positions (see the t() helper in the tests). TODO confirm.
        # NOTE(review): __eq__ is defined without __hash__, so instances
        # become unhashable-by-default only if the base class set
        # __hash__ = None — verify if these nodes are ever put in sets/dicts.
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __repr__(self):
        return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
class ConceptNode(LexerNode):
@@ -74,13 +100,15 @@ class ConceptNode(LexerNode):
self.end == other[2] and \
self.source == other[3]
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, ConceptNode):
return False
return self.concept == other.concept and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.underlying == other.underlying
@@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode):
return name + sub_names
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, NonTerminalNode):
return False
@@ -140,8 +168,8 @@ class TerminalNode(LexerNode):
return name + f"'{self.value}'"
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, TerminalNode):
return False
@@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser):
self.reset_parser(context, text)
concepts_found = [[]]
unrecognized_tokens = None
has_unrecognized = False
# actually list of list
# The first dimension is the number of possibilities found
# The second dimension is the number of concepts found, under one possibility
@@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser):
while True:
init_pos = self.pos
res = []
for concept, grammar in self.concepts_grammars.items():
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
@@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser):
if len(res) == 0: # not recognized
self.seek(init_pos)
not_recognized = self.get_text_from_tokens(self.get_token())
self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized))
break
if unrecognized_tokens:
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
if not self.next_token(False):
break
# loop
self.seek(res[0].end)
if not self.next_token():
break
else: # some concepts are recognized
if unrecognized_tokens:
unrecognized_tokens.fix_source()
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# manage when nothing is recognized (or other error)
if self.has_error:
ret = self.sheerka.ret(
self.name,
False,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=text,
body=self.error_sink,
try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found))
self.log_result(context, text, ret)
return ret
# loop
self.seek(res[0].end)
if not self.next_token():
break
# Fix the source if we were working on unrecognized tokens
if unrecognized_tokens:
unrecognized_tokens.fix_source()
# else
# returns as many ReturnValue than choices found
@@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser):
ret.append(
self.sheerka.ret(
self.name,
True,
not has_unrecognized,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
+3
View File
@@ -248,6 +248,9 @@ class DefaultParser(BaseParser):
# Regroup the tokens by parts
first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
if first_token.type == TokenKind.EOF:
return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
# get the name
concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)
+290 -81
View File
@@ -2,8 +2,10 @@ import pytest
from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from core.tokenizer import Tokenizer, TokenKind, Token
from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore, \
UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -25,6 +27,16 @@ def u(parsing_expression, start, end, children=None):
return NonTerminalNode(parsing_expression, start, end, [], children)
def t(text):
    """Test helper: build a Token whose kind is guessed from *text*.

    Quoted text ('...' or "...") becomes a STRING token, text starting with a
    space becomes a WHITESPACE token, anything else an IDENTIFIER.  Line and
    position fields are all zeroed — callers compare nodes by source text,
    not by token position.
    """
    if text.startswith("'") or text.startswith('"'):
        return Token(TokenKind.STRING, text, 0, 0, 0)
    if text.startswith(" "):
        return Token(TokenKind.WHITESPACE, text, 0, 0, 0)
    return Token(TokenKind.IDENTIFIER, text, 0, 0, 0)
@pytest.mark.parametrize("match, text", [
("foo", "foo"),
("'foo'", "'foo'"),
@@ -70,36 +82,6 @@ def test_i_can_match_multiple_concepts_in_one_input():
]
def test_i_cannot_match_an_unknown_input():
context = get_context()
parser = ConceptLexerParser() # no grammar registered
res = parser.parse(context, "foo")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "foo"
def test_i_cannot_match_when_part_of_the_input_is_unknown():
context = get_context()
one = Concept(name="one")
two = Concept(name="two")
concepts = {one: "one", two: "two"}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [
ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2))] # these two were recognized
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "three"
def test_i_can_match_sequence():
context = get_context()
foo = Concept(name="foo")
@@ -118,37 +100,6 @@ def test_i_can_match_sequence():
u("three", 4, 4)]))]
def test_wrong_sequence_is_not_matched():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three one")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [(foo, "one two three")]
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "one"
def test_i_cannot_match_sequence_if_end_of_file():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "one"
def test_i_always_choose_the_longest_match():
context = get_context()
foo = Concept(name="foo")
@@ -205,8 +156,10 @@ def test_i_can_match_ordered_choice():
res3 = parser.parse(context, "three")
assert not res3.status
assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res3.value.body[0].body == "three"
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 0, [t("three")])
]
def test_i_cannot_match_ordered_choice_with_empty_alternative():
@@ -218,6 +171,10 @@ def test_i_cannot_match_ordered_choice_with_empty_alternative():
res = parser.parse(context, "ok") # because token[0] is not "one" and not "" (it is 'two')
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("ok")])
]
def test_i_can_mix_sequences_and_ordered_choices():
@@ -248,8 +205,10 @@ def test_i_can_mix_sequences_and_ordered_choices():
res3 = parser.parse(context, "twenty one")
assert not res3.status
assert res3.value.body[0].body == "twenty"
assert res3.value.try_parsed == []
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 2, [t("twenty"), t(" "), t("one")])
]
def test_i_can_mix_ordered_choices_and_sequences():
@@ -364,9 +323,9 @@ def test_i_cannot_parse_wrong_input_with_optional():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_use_reference():
@@ -463,7 +422,63 @@ def test_i_can_parse_when_reference():
assert res.value.body == [(foo, 0, 0, "twenty")]
def test_i_can_detect_duplicates_when_reference():
def test_i_can_parse_multiple_results():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert len(res) == 2
assert res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [(bar, 0, 2, "one two")]
assert res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [(foo, 0, 2, "one two")]
def test_i_can_parse_multiple_results_times_two():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two one two")
assert len(res) == 4
assert res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [(bar, "one two"), (bar, "one two")]
assert res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [(foo, "one two"), (bar, "one two")]
assert res[2].status
assert context.sheerka.isinstance(res[2].value, BuiltinConcepts.PARSER_RESULT)
assert res[2].value.body == [(bar, "one two"), (foo, "one two")]
assert res[3].status
assert context.sheerka.isinstance(res[3].value, BuiltinConcepts.PARSER_RESULT)
assert res[3].value.body == [(foo, "one two"), (foo, "one two")]
def test_i_can_parse_multiple_results_when_reference():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
@@ -557,17 +572,17 @@ def test_i_cannot_parse_zero_and_more_when_wrong_entry():
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)]))]
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)])),
UnrecognizedTokensNode(2, 2, [t("two")])
]
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_zero_and_more_with_separator():
@@ -636,10 +651,9 @@ def test_i_can_parse_sequence_and_one_or_more():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.body == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_one_and_more_with_separator():
@@ -803,6 +817,201 @@ def test_i_can_initialize_rule_names():
assert return_value[bar].rule_name == "foo"
@pytest.mark.parametrize("text, end_position", [
    ("foo", 0),
    ("foo bar", 2)
])
def test_cannot_parser_unknown_concepts(text, end_position):
    """With no grammar registered, every token ends up in one UnrecognizedTokensNode."""
    context = get_context()
    parser = ConceptLexerParser()
    parser.initialize(context, {})  # empty grammar: nothing can be recognized
    res = parser.parse(context, text)
    # [:-1] drops the trailing EOF token emitted by the Tokenizer
    tokens = list(Tokenizer(text))[:-1]
    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [UnrecognizedTokensNode(0, end_position, tokens)]
def test_i_cannot_parse_when_part_of_the_input_is_unrecognized():
context = get_context()
one = Concept(name="one")
two = Concept(name="two")
concepts = {one: "one", two: "two"}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
UnrecognizedTokensNode(4, 4, [t("three")])
]
def test_i_cannot_parse_when_wrong_sequence():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three one")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
(foo, "one two three"),
UnrecognizedTokensNode(6, 6, [t("one")])
]
def test_i_cannot_parse_when_sequence_cannot_match_because_of_end_of_file():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
UnrecognizedTokensNode(0, 2, [t("one"), t(" "), t("two")])
]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_at_the_end():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two four five")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
(bar, 0, 2, "one two"),
UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")])
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
(foo, 0, 2, "one two"),
UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")])
]
def test_i_cannot_parse_multiple_results_when_beginning_by_unknown_tokens():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "four five one two")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(bar, 4, 6, "one two"),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(foo, 4, 6, "one two"),
]
def test_i_cannot_parse_multiple_results_when_surrounded_by_unknown_tokens():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "four five one two six seven")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(bar, 4, 6, "one two"),
UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(foo, 4, 6, "one two"),
UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_in_the_middle():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
baz = Concept(name="baz")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three")),
baz: StrMatch("six"),
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two four five six")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
(bar, 0, 2, "one two"),
UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
(baz, 8, 8, "six"),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
(foo, 0, 2, "one two"),
UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
(baz, 8, 8, "six"),
]
#
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
# context = get_context()
+5 -1
View File
@@ -5,7 +5,7 @@ from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from evaluators.ConceptNodeEvaluator import ConceptNodeEvaluator
from parsers.ConceptLexerParser import ConceptNode, ConceptLexerParser, Sequence, TerminalNode, \
StrMatch, Optional, OrderedChoice, ZeroOrMore
StrMatch, Optional, OrderedChoice, ZeroOrMore, UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -37,8 +37,12 @@ def get_concept_node(context, grammar, expression):
@pytest.mark.parametrize("ret_val, expected", [
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), True),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value="Not a concept node")), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=["Not a concept node"])), False),
(ReturnValueConcept("some_name", True, [ConceptNode(Concept(), 0, 0)]), False),
+2
View File
@@ -311,6 +311,8 @@ def test_i_can_parse_is_a():
"concept",
"isa number",
"name isa",
"def",
"def concept_name"
])
def test_i_cannot_parse_invalid_entries(text):
parser = DefaultParser()
+2 -1
View File
@@ -7,6 +7,7 @@ from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept
from core.concept import Concept, PROPERTIES_TO_SERIALIZE, Property
from core.sheerka import Sheerka, ExecutionContext
from evaluators.MutipleSameSuccessEvaluator import MultipleSameSuccessEvaluator
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import Sequence, ZeroOrMore, StrMatch, OrderedChoice, Optional, ConceptMatch, \
ConceptLexerParser
from sdp.sheerkaDataProvider import SheerkaDataProvider, Event
@@ -291,7 +292,7 @@ def test_i_can_manage_concepts_with_the_same_key_when_values_are_the_same():
res = sheerka.evaluate_user_input("hello 'foo'")
assert len(res) == 1
assert res[0].status
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].who == sheerka.get_evaluator_name(MultipleSameSuccessEvaluator.NAME)