ConceptLexerParser can now handle UnrecognizedTokens
This commit is contained in:
@@ -47,7 +47,33 @@ class LexerNode(Node):
|
||||
if not isinstance(other, LexerNode):
|
||||
return False
|
||||
|
||||
return self.start == other.start and self.end == other.end
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source and \
|
||||
self.tokens == other.tokens
|
||||
|
||||
|
||||
class UnrecognizedTokensNode(LexerNode):
    """LexerNode that collects a run of tokens added one at a time.

    Tokens are appended through ``add_token``; ``source`` is only
    recomputed from the collected tokens when ``fix_source`` is called.
    """

    def __init__(self, start, end, tokens):
        """Forward ``start``, ``end`` and ``tokens`` to the LexerNode initializer."""
        super().__init__(start, end, tokens)

    def add_token(self, token, pos):
        """Append ``token`` to ``self.tokens`` and set ``self.end`` to ``pos``."""
        self.tokens.append(token)
        self.end = pos

    def fix_source(self):
        """Rebuild ``self.source`` from the tokens collected so far."""
        rebuilt = BaseParser.get_text_from_tokens(self.tokens)
        self.source = rebuilt

    def __eq__(self, other):
        """Compare by ``start``, ``end`` and ``source``; ``tokens`` is not compared.

        Anything that is not an UnrecognizedTokensNode compares unequal.
        """
        if isinstance(other, UnrecognizedTokensNode):
            return (
                self.start == other.start
                and self.end == other.end
                and self.source == other.source
            )
        return False

    def __repr__(self):
        """Return a debug representation showing start, end and source."""
        return (
            f"UnrecognizedTokensNode(start={self.start}, end={self.end}, "
            f"source='{self.source}')"
        )
|
||||
|
||||
|
||||
class ConceptNode(LexerNode):
|
||||
@@ -74,13 +100,15 @@ class ConceptNode(LexerNode):
|
||||
self.end == other[2] and \
|
||||
self.source == other[3]
|
||||
|
||||
if not super().__eq__(other):
|
||||
return False
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, ConceptNode):
|
||||
return False
|
||||
|
||||
return self.concept == other.concept and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source and \
|
||||
self.underlying == other.underlying
|
||||
|
||||
@@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode):
|
||||
return name + sub_names
|
||||
|
||||
def __eq__(self, other):
|
||||
if not super().__eq__(other):
|
||||
return False
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, NonTerminalNode):
|
||||
return False
|
||||
@@ -140,8 +168,8 @@ class TerminalNode(LexerNode):
|
||||
return name + f"'{self.value}'"
|
||||
|
||||
def __eq__(self, other):
|
||||
if not super().__eq__(other):
|
||||
return False
|
||||
# if not super().__eq__(other):
|
||||
# return False
|
||||
|
||||
if not isinstance(other, TerminalNode):
|
||||
return False
|
||||
@@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser):
|
||||
self.reset_parser(context, text)
|
||||
|
||||
concepts_found = [[]]
|
||||
unrecognized_tokens = None
|
||||
has_unrecognized = False
|
||||
|
||||
# actually list of list
|
||||
# The first dimension is the number of possibilities found
|
||||
# The second dimension is the number of concepts found, under one possibility
|
||||
@@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser):
|
||||
while True:
|
||||
init_pos = self.pos
|
||||
res = []
|
||||
|
||||
for concept, grammar in self.concepts_grammars.items():
|
||||
self.seek(init_pos)
|
||||
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
|
||||
@@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser):
|
||||
|
||||
if len(res) == 0: # not recognized
|
||||
self.seek(init_pos)
|
||||
not_recognized = self.get_text_from_tokens(self.get_token())
|
||||
self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized))
|
||||
break
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.add_token(self.get_token(), init_pos)
|
||||
else:
|
||||
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
|
||||
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
|
||||
has_unrecognized = True
|
||||
|
||||
res = self.get_bests(res) # only keep the concepts that eat the more tokens
|
||||
concepts_found = core.utils.product(concepts_found, res)
|
||||
if not self.next_token(False):
|
||||
break
|
||||
|
||||
# loop
|
||||
self.seek(res[0].end)
|
||||
if not self.next_token():
|
||||
break
|
||||
else: # some concepts are recognized
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
unrecognized_tokens = None
|
||||
res = self.get_bests(res) # only keep the concepts that eat the more tokens
|
||||
concepts_found = core.utils.product(concepts_found, res)
|
||||
|
||||
# manage when nothing is recognized (or other error)
|
||||
if self.has_error:
|
||||
ret = self.sheerka.ret(
|
||||
self.name,
|
||||
False,
|
||||
self.sheerka.new(
|
||||
BuiltinConcepts.PARSER_RESULT,
|
||||
parser=self,
|
||||
source=text,
|
||||
body=self.error_sink,
|
||||
try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found))
|
||||
self.log_result(context, text, ret)
|
||||
return ret
|
||||
# loop
|
||||
self.seek(res[0].end)
|
||||
if not self.next_token():
|
||||
break
|
||||
|
||||
# Fix the source if we were working on unrecognized tokens
|
||||
if unrecognized_tokens:
|
||||
unrecognized_tokens.fix_source()
|
||||
|
||||
# else
|
||||
# returns as many ReturnValue than choices found
|
||||
@@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser):
|
||||
ret.append(
|
||||
self.sheerka.ret(
|
||||
self.name,
|
||||
True,
|
||||
not has_unrecognized,
|
||||
self.sheerka.new(
|
||||
BuiltinConcepts.PARSER_RESULT,
|
||||
parser=self,
|
||||
|
||||
@@ -248,6 +248,9 @@ class DefaultParser(BaseParser):
|
||||
# Regroup the tokens by parts
|
||||
first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
|
||||
|
||||
if first_token.type == TokenKind.EOF:
|
||||
return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
|
||||
|
||||
# get the name
|
||||
concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user