ConceptLexerParser can now handle UnrecognizedTokens

This commit is contained in:
2019-12-26 15:20:45 +01:00
parent bcb2308ea5
commit 26daae4acf
8 changed files with 483 additions and 125 deletions
+97 -1
View File
@@ -19,7 +19,7 @@ For those you don't know this old cartoon, it's the Odyssey story from Homer,
ported in the 31st century. Ulysses has a spacecraft with an AI named Shyrka
I was a great fan of this cartoon when I was young. I thought that the idea of
bringing the ancient story of Ulysses in the future was a bright.
bringing the ancient story of Ulysses in the future was bright.
Ever since then, Sheerka was my reference for any sophisticated computer. Unfortunately
for me, at that time there was no wikipedia to tell the correct spelling.
@@ -654,3 +654,99 @@ For the two questions, I will first try the simple implementations and see there
* the entry in sdp will not be all_number, but all_id_of_number. I will use the concept id instead of its name
2019-24-12
**********
Going back on BNF implementation. As it's Christmas eve today, I won't stay very long.
So, the implementation lies in the class ConceptLexerParser, as it's a lexer not for tokens, but for concepts.
The purpose of this class is to recognize a sequence of Concept.
So if we define the following concepts
::
def concept foo from bnf one two three
def concept bar from bnf four five
when you input
::
one two three four five
the list of :code:`[foo, bar]` will be returned by the parser (as return values)
How does it work?
As explained in the code, my implementation is highly inspired by Arpegio project. To define your grammar, you
use **ParsingExpressions**. There are several types
* some are used to recognize tokens: StrMatch, ConceptMatch
* others are used to tell how to recognize them: Sequence, OrderedChoice, Optional, OneOrMore, ZeroOrMore...
Some examples:
::
to recognize 'foo' -> StrMatch("foo")
to recognize 'foo bar' -> Sequence(StrMatch("foo"), StrMatch("bar"))
to recognize 'foo' or 'bar' -> OrderedChoice(StrMatch("foo"), StrMatch("bar"))
and so on...
So when a concept is defined using its bnf definition, I use the **BnfParser** to create the grammar, and then
I use the **ConceptLexerParser** to recognize the concepts
The current implementation to recognize a concept is not very efficient. All the definitions are in a dictionary
and I go through the whole dictionary to see if some concepts are recognized. Once a concept is found, I loop again
on the whole dictionary to find the next concept.
| -> I need a btree to order the concept
| -> I need a predictive algorithm to guess the next concept
But it is for later.
So once the parsing is effective, I return a **ConceptNode** object
.. code-block:: python
class ConceptNode(LexerNode):
"""
Returned by the ConceptLexerParser
It represents a recognized concept
"""
def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
super().__init__(start, end, tokens, source)
self.concept = concept
self.underlying = underlying
if self.source is None:
self.source = BaseParser.get_text_from_tokens(self.tokens)
concept
| Remember that all grammars are listed in a dictionary of <Concept, ParsingExpression>.
| So when a parsing expression is verified, it's easy to link it with the concept
start
position of the first token
end
position of the last token
tokens
list of tokens that are recognized
underlying
**NonTerminalNode** or **TerminalNode** that wraps the underlying **ParsingExpression** used to recognize the concept
source
| The source is deduced from the tokens
| But in the unit tests, they are directly given for speed up and simplicity
What is the difference between the **[Non]TerminalNode** and the **ParsingExpression** ?
The ParsingExpression
defines how to recognize a concept
The [Non]TerminalNode
represents what was found. So similarly to the ConceptNode, you will find the start, end and token attributes
That's all for today !
+22 -11
View File
@@ -1,7 +1,7 @@
from core.builtin_concepts import ParserResultConcept, BuiltinConcepts
from evaluators.BaseEvaluator import OneReturnValueEvaluator
from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch
from parsers.ConceptLexerParser import ConceptNode, NonTerminalNode, ConceptMatch, UnrecognizedTokensNode
class ConceptNodeEvaluator(OneReturnValueEvaluator):
@@ -17,15 +17,22 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator):
def matches(self, context, return_value):
if not return_value.status:
return False
if not isinstance(return_value.value, ParserResultConcept):
return False
return (isinstance(return_value.value.value, ConceptNode) or
return (
isinstance(return_value.value.value, ConceptNode) or
isinstance(return_value.value.value, UnrecognizedTokensNode) or
(
hasattr(return_value.value.value, "__iter__") and
len(return_value.value.value) > 0 and
(
hasattr(return_value.value.value, "__iter__") and
len(return_value.value.value) > 0 and
isinstance(return_value.value.value[0], ConceptNode)
))
isinstance(return_value.value.value[0], ConceptNode) or
isinstance(return_value.value.value[0], UnrecognizedTokensNode)
)
)
)
def eval(self, context, return_value):
"""
@@ -38,19 +45,23 @@ class ConceptNodeEvaluator(OneReturnValueEvaluator):
nodes = [nodes]
concepts = []
error_found = False
for node in nodes:
concept = sheerka.new(node.concept.key)
concept = self.update_concept(sheerka, concept, node.underlying)
concepts.append(concept)
if isinstance(node, ConceptNode):
concept = sheerka.new(node.concept.key)
concept = self.update_concept(sheerka, concept, node.underlying)
concepts.append(concept)
else:
error_found = True
if len(concepts) == 1:
return sheerka.ret(
self.name,
True,
not error_found,
concepts[0],
parents=[return_value])
raise NotImplementedError("Not yet")
return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.NOT_FOR_ME), parents=[return_value])
def update_concept(self, sheerka, concept, underlying, init_empty_body=True):
"""
+62 -30
View File
@@ -47,7 +47,33 @@ class LexerNode(Node):
if not isinstance(other, LexerNode):
return False
return self.start == other.start and self.end == other.end
return self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.tokens == other.tokens
class UnrecognizedTokensNode(LexerNode):
    """
    A LexerNode that collects the consecutive tokens the ConceptLexerParser
    could NOT match against any registered concept grammar.

    The parser grows one of these token by token (via ``add_token``) while it
    skips over unrecognized input, then calls ``fix_source`` once the run of
    unknown tokens ends.
    """

    def __init__(self, start, end, tokens):
        # start/end are token positions; tokens is the list collected so far
        super().__init__(start, end, tokens)

    def add_token(self, token, pos):
        """Append one more unrecognized token and move the end position to pos."""
        self.tokens.append(token)
        self.end = pos

    def fix_source(self):
        """Rebuild the source text from the accumulated tokens.

        Called once the unrecognized run is complete, so the text is joined
        only once instead of after every add_token.
        """
        self.source = BaseParser.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        # Equality compares positions and the reconstructed source text,
        # deliberately NOT the token objects themselves — presumably so the
        # unit tests can build expected nodes from synthetic tokens with
        # dummy positions (see the t() helper in the tests). TODO confirm.
        # NOTE(review): __eq__ is defined without __hash__, so instances
        # become unhashable-by-default only if the base class set
        # __hash__ = None — verify if these nodes are ever put in sets/dicts.
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __repr__(self):
        return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
class ConceptNode(LexerNode):
@@ -74,13 +100,15 @@ class ConceptNode(LexerNode):
self.end == other[2] and \
self.source == other[3]
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, ConceptNode):
return False
return self.concept == other.concept and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.underlying == other.underlying
@@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode):
return name + sub_names
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, NonTerminalNode):
return False
@@ -140,8 +168,8 @@ class TerminalNode(LexerNode):
return name + f"'{self.value}'"
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, TerminalNode):
return False
@@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser):
self.reset_parser(context, text)
concepts_found = [[]]
unrecognized_tokens = None
has_unrecognized = False
# actually list of list
# The first dimension is the number of possibilities found
# The second dimension is the number of concepts found, under one possibility
@@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser):
while True:
init_pos = self.pos
res = []
for concept, grammar in self.concepts_grammars.items():
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
@@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser):
if len(res) == 0: # not recognized
self.seek(init_pos)
not_recognized = self.get_text_from_tokens(self.get_token())
self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized))
break
if unrecognized_tokens:
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
if not self.next_token(False):
break
# loop
self.seek(res[0].end)
if not self.next_token():
break
else: # some concepts are recognized
if unrecognized_tokens:
unrecognized_tokens.fix_source()
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# manage when nothing is recognized (or other error)
if self.has_error:
ret = self.sheerka.ret(
self.name,
False,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=text,
body=self.error_sink,
try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found))
self.log_result(context, text, ret)
return ret
# loop
self.seek(res[0].end)
if not self.next_token():
break
# Fix the source if we were working on unrecognized tokens
if unrecognized_tokens:
unrecognized_tokens.fix_source()
# else
# returns as many ReturnValue than choices found
@@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser):
ret.append(
self.sheerka.ret(
self.name,
True,
not has_unrecognized,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
+3
View File
@@ -248,6 +248,9 @@ class DefaultParser(BaseParser):
# Regroup the tokens by parts
first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
if first_token.type == TokenKind.EOF:
return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
# get the name
concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)
+290 -81
View File
@@ -2,8 +2,10 @@ import pytest
from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from core.tokenizer import Tokenizer, TokenKind, Token
from parsers.ConceptLexerParser import ConceptLexerParser, ConceptNode, Sequence, StrMatch, OrderedChoice, Optional, \
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore
ParsingExpressionVisitor, TerminalNode, NonTerminalNode, LexerNode, ConceptMatch, ZeroOrMore, OneOrMore, \
UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -25,6 +27,16 @@ def u(parsing_expression, start, end, children=None):
return NonTerminalNode(parsing_expression, start, end, [], children)
def t(text):
    """Test helper: build a Token whose kind is guessed from *text*.

    Quoted text ('...' or "...") becomes a STRING token, text starting with a
    space becomes a WHITESPACE token, anything else an IDENTIFIER.  Line and
    position fields are all zeroed — callers compare nodes by source text,
    not by token position.
    """
    if text.startswith("'") or text.startswith('"'):
        return Token(TokenKind.STRING, text, 0, 0, 0)
    if text.startswith(" "):
        return Token(TokenKind.WHITESPACE, text, 0, 0, 0)
    return Token(TokenKind.IDENTIFIER, text, 0, 0, 0)
@pytest.mark.parametrize("match, text", [
("foo", "foo"),
("'foo'", "'foo'"),
@@ -70,36 +82,6 @@ def test_i_can_match_multiple_concepts_in_one_input():
]
def test_i_cannot_match_an_unknown_input():
context = get_context()
parser = ConceptLexerParser() # no grammar registered
res = parser.parse(context, "foo")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "foo"
def test_i_cannot_match_when_part_of_the_input_is_unknown():
context = get_context()
one = Concept(name="one")
two = Concept(name="two")
concepts = {one: "one", two: "two"}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [
ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2))] # these two were recognized
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "three"
def test_i_can_match_sequence():
context = get_context()
foo = Concept(name="foo")
@@ -118,37 +100,6 @@ def test_i_can_match_sequence():
u("three", 4, 4)]))]
def test_wrong_sequence_is_not_matched():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three one")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [(foo, "one two three")]
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "one"
def test_i_cannot_match_sequence_if_end_of_file():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "one"
def test_i_always_choose_the_longest_match():
context = get_context()
foo = Concept(name="foo")
@@ -205,8 +156,10 @@ def test_i_can_match_ordered_choice():
res3 = parser.parse(context, "three")
assert not res3.status
assert context.sheerka.isinstance(res3.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res3.value.body[0].body == "three"
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 0, [t("three")])
]
def test_i_cannot_match_ordered_choice_with_empty_alternative():
@@ -218,6 +171,10 @@ def test_i_cannot_match_ordered_choice_with_empty_alternative():
res = parser.parse(context, "ok") # because token[0] is not "one" and not "" (it is 'two')
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("ok")])
]
def test_i_can_mix_sequences_and_ordered_choices():
@@ -248,8 +205,10 @@ def test_i_can_mix_sequences_and_ordered_choices():
res3 = parser.parse(context, "twenty one")
assert not res3.status
assert res3.value.body[0].body == "twenty"
assert res3.value.try_parsed == []
assert context.sheerka.isinstance(res2.value, BuiltinConcepts.PARSER_RESULT)
assert res3.value.value == [
UnrecognizedTokensNode(0, 2, [t("twenty"), t(" "), t("one")])
]
def test_i_can_mix_ordered_choices_and_sequences():
@@ -364,9 +323,9 @@ def test_i_cannot_parse_wrong_input_with_optional():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_use_reference():
@@ -463,7 +422,63 @@ def test_i_can_parse_when_reference():
assert res.value.body == [(foo, 0, 0, "twenty")]
def test_i_can_detect_duplicates_when_reference():
def test_i_can_parse_multiple_results():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert len(res) == 2
assert res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [(bar, 0, 2, "one two")]
assert res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [(foo, 0, 2, "one two")]
def test_i_can_parse_multiple_results_times_two():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two one two")
assert len(res) == 4
assert res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [(bar, "one two"), (bar, "one two")]
assert res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [(foo, "one two"), (bar, "one two")]
assert res[2].status
assert context.sheerka.isinstance(res[2].value, BuiltinConcepts.PARSER_RESULT)
assert res[2].value.body == [(bar, "one two"), (foo, "one two")]
assert res[3].status
assert context.sheerka.isinstance(res[3].value, BuiltinConcepts.PARSER_RESULT)
assert res[3].value.body == [(foo, "one two"), (foo, "one two")]
def test_i_can_parse_multiple_results_when_reference():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
@@ -557,17 +572,17 @@ def test_i_cannot_parse_zero_and_more_when_wrong_entry():
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)]))]
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
ConceptNode(foo, 0, 0, source="one", underlying=u(ZeroOrMore("one"), 0, 0, [u("one", 0, 0)])),
UnrecognizedTokensNode(2, 2, [t("two")])
]
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.value == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_zero_and_more_with_separator():
@@ -636,10 +651,9 @@ def test_i_can_parse_sequence_and_one_or_more():
res = parser.parse(context, "two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.try_parsed == []
assert context.sheerka.isinstance(res.value.body[0], BuiltinConcepts.UNKNOWN_CONCEPT)
assert res.value.body[0].body == "two"
assert res.value.body == [
UnrecognizedTokensNode(0, 0, [t("two")])
]
def test_i_can_parse_one_and_more_with_separator():
@@ -803,6 +817,201 @@ def test_i_can_initialize_rule_names():
assert return_value[bar].rule_name == "foo"
@pytest.mark.parametrize("text, end_position", [
    ("foo", 0),
    ("foo bar", 2)
])
def test_cannot_parser_unknown_concepts(text, end_position):
    """With no grammar registered, every token ends up in one UnrecognizedTokensNode."""
    context = get_context()
    parser = ConceptLexerParser()
    parser.initialize(context, {})  # empty grammar: nothing can be recognized
    res = parser.parse(context, text)
    # [:-1] drops the trailing EOF token emitted by the Tokenizer
    tokens = list(Tokenizer(text))[:-1]
    assert not res.status
    assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
    assert res.value.value == [UnrecognizedTokensNode(0, end_position, tokens)]
def test_i_cannot_parse_when_part_of_the_input_is_unrecognized():
context = get_context()
one = Concept(name="one")
two = Concept(name="two")
concepts = {one: "one", two: "two"}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
ConceptNode(one, 0, 0, source="one", underlying=u("one", 0, 0)),
ConceptNode(two, 2, 2, source="two", underlying=u("two", 2, 2)),
UnrecognizedTokensNode(4, 4, [t("three")])
]
def test_i_cannot_parse_when_wrong_sequence():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two three one")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
(foo, "one two three"),
UnrecognizedTokensNode(6, 6, [t("one")])
]
def test_i_cannot_parse_when_sequence_cannot_match_because_of_end_of_file():
context = get_context()
foo = Concept(name="foo")
concepts = {foo: Sequence("one", "two", "three")}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two")
assert not res.status
assert context.sheerka.isinstance(res.value, BuiltinConcepts.PARSER_RESULT)
assert res.value.value == [
UnrecognizedTokensNode(0, 2, [t("one"), t(" "), t("two")])
]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_at_the_end():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two four five")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
(bar, 0, 2, "one two"),
UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")])
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
(foo, 0, 2, "one two"),
UnrecognizedTokensNode(4, 6, [t("four"), t(" "), t("five")])
]
def test_i_cannot_parse_multiple_results_when_beginning_by_unknown_tokens():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "four five one two")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(bar, 4, 6, "one two"),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(foo, 4, 6, "one two"),
]
def test_i_cannot_parse_multiple_results_when_surrounded_by_unknown_tokens():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three"))
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "four five one two six seven")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(bar, 4, 6, "one two"),
UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
UnrecognizedTokensNode(0, 3, [t("four"), t(" "), t("five"), t(" ")]),
(foo, 4, 6, "one two"),
UnrecognizedTokensNode(8, 10, [t("six"), t(" "), t("seven")]),
]
def test_i_cannot_parse_multiple_results_when_unknown_tokens_in_the_middle():
context = get_context()
foo = Concept(name="foo")
bar = Concept(name="bar")
baz = Concept(name="baz")
concepts = {
bar: Sequence("one", "two"),
foo: Sequence("one", OrderedChoice("two", "three")),
baz: StrMatch("six"),
}
parser = ConceptLexerParser()
parser.initialize(context, concepts)
res = parser.parse(context, "one two four five six")
assert len(res) == 2
assert not res[0].status
assert context.sheerka.isinstance(res[0].value, BuiltinConcepts.PARSER_RESULT)
assert res[0].value.body == [
(bar, 0, 2, "one two"),
UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
(baz, 8, 8, "six"),
]
assert not res[1].status
assert context.sheerka.isinstance(res[1].value, BuiltinConcepts.PARSER_RESULT)
assert res[1].value.body == [
(foo, 0, 2, "one two"),
UnrecognizedTokensNode(4, 7, [t("four"), t(" "), t("five"), t(" ")]),
(baz, 8, 8, "six"),
]
#
# def test_i_can_parse_basic_arithmetic_operations_and_resolve_properties():
# context = get_context()
+5 -1
View File
@@ -5,7 +5,7 @@ from core.concept import Concept
from core.sheerka import Sheerka, ExecutionContext
from evaluators.ConceptNodeEvaluator import ConceptNodeEvaluator
from parsers.ConceptLexerParser import ConceptNode, ConceptLexerParser, Sequence, TerminalNode, \
StrMatch, Optional, OrderedChoice, ZeroOrMore
StrMatch, Optional, OrderedChoice, ZeroOrMore, UnrecognizedTokensNode
from sdp.sheerkaDataProvider import Event
@@ -37,8 +37,12 @@ def get_concept_node(context, grammar, expression):
@pytest.mark.parametrize("ret_val, expected", [
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), True),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), True),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[ConceptNode(Concept(), 0, 0)])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=ConceptNode(Concept(), 0, 0))), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=[UnrecognizedTokensNode(0, 0, [])])), False),
(ReturnValueConcept("some_name", False, ParserResultConcept(value=UnrecognizedTokensNode(0, 0, []))), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value="Not a concept node")), False),
(ReturnValueConcept("some_name", True, ParserResultConcept(value=["Not a concept node"])), False),
(ReturnValueConcept("some_name", True, [ConceptNode(Concept(), 0, 0)]), False),
+2
View File
@@ -311,6 +311,8 @@ def test_i_can_parse_is_a():
"concept",
"isa number",
"name isa",
"def",
"def concept_name"
])
def test_i_cannot_parse_invalid_entries(text):
parser = DefaultParser()
+2 -1
View File
@@ -7,6 +7,7 @@ from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept
from core.concept import Concept, PROPERTIES_TO_SERIALIZE, Property
from core.sheerka import Sheerka, ExecutionContext
from evaluators.MutipleSameSuccessEvaluator import MultipleSameSuccessEvaluator
from parsers.BaseParser import BaseParser
from parsers.ConceptLexerParser import Sequence, ZeroOrMore, StrMatch, OrderedChoice, Optional, ConceptMatch, \
ConceptLexerParser
from sdp.sheerkaDataProvider import SheerkaDataProvider, Event
@@ -291,7 +292,7 @@ def test_i_can_manage_concepts_with_the_same_key_when_values_are_the_same():
res = sheerka.evaluate_user_input("hello 'foo'")
assert len(res) == 1
assert res[0].status
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].value.body == "hello foo" # I don't know yet the one to choose
assert res[0].who == sheerka.get_evaluator_name(MultipleSameSuccessEvaluator.NAME)