ConceptLexerParser can now handle UnrecognizedTokens

This commit is contained in:
2019-12-26 15:20:45 +01:00
parent bcb2308ea5
commit 26daae4acf
8 changed files with 483 additions and 125 deletions
+62 -30
View File
@@ -47,7 +47,33 @@ class LexerNode(Node):
if not isinstance(other, LexerNode):
return False
return self.start == other.start and self.end == other.end
return self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.tokens == other.tokens
class UnrecognizedTokensNode(LexerNode):
    """Lexer node that accumulates a run of consecutive tokens which no
    concept grammar was able to recognize.

    The node grows token by token via :meth:`add_token`; once the run is
    closed, :meth:`fix_source` rebuilds the textual ``source`` from the
    collected tokens.
    """

    def __init__(self, start, end, tokens):
        super().__init__(start, end, tokens)

    def add_token(self, token, pos):
        """Append *token* to the run and advance the end marker to *pos*."""
        self.tokens.append(token)
        self.end = pos

    def fix_source(self):
        """Recompute ``self.source`` as the text of the accumulated tokens."""
        self.source = BaseParser.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        # Equality is positional and textual only; the raw token list is
        # deliberately not compared (``source`` is derived from it).
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return (self.start, self.end, self.source) == \
               (other.start, other.end, other.source)

    def __repr__(self):
        return (f"UnrecognizedTokensNode(start={self.start}, "
                f"end={self.end}, source='{self.source}')")
class ConceptNode(LexerNode):
@@ -74,13 +100,15 @@ class ConceptNode(LexerNode):
self.end == other[2] and \
self.source == other[3]
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, ConceptNode):
return False
return self.concept == other.concept and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.underlying == other.underlying
@@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode):
return name + sub_names
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, NonTerminalNode):
return False
@@ -140,8 +168,8 @@ class TerminalNode(LexerNode):
return name + f"'{self.value}'"
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, TerminalNode):
return False
@@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser):
self.reset_parser(context, text)
concepts_found = [[]]
unrecognized_tokens = None
has_unrecognized = False
# actually list of list
# The first dimension is the number of possibilities found
# The second dimension is the number of concepts found, under one possibility
@@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser):
while True:
init_pos = self.pos
res = []
for concept, grammar in self.concepts_grammars.items():
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
@@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser):
if len(res) == 0: # not recognized
self.seek(init_pos)
not_recognized = self.get_text_from_tokens(self.get_token())
self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized))
break
if unrecognized_tokens:
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
if not self.next_token(False):
break
# loop
self.seek(res[0].end)
if not self.next_token():
break
else: # some concepts are recognized
if unrecognized_tokens:
unrecognized_tokens.fix_source()
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# manage when nothing is recognized (or other error)
if self.has_error:
ret = self.sheerka.ret(
self.name,
False,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=text,
body=self.error_sink,
try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found))
self.log_result(context, text, ret)
return ret
# loop
self.seek(res[0].end)
if not self.next_token():
break
# Fix the source if we were working on unrecognized tokens
if unrecognized_tokens:
unrecognized_tokens.fix_source()
# else
# returns as many ReturnValue than choices found
@@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser):
ret.append(
self.sheerka.ret(
self.name,
True,
not has_unrecognized,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
+3
View File
@@ -248,6 +248,9 @@ class DefaultParser(BaseParser):
# Regroup the tokens by parts
first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
if first_token.type == TokenKind.EOF:
return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
# get the name
concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)