ConceptLexerParser can now handle UnrecognizedTokens

This commit is contained in:
2019-12-26 15:20:45 +01:00
parent bcb2308ea5
commit 26daae4acf
8 changed files with 483 additions and 125 deletions
+62 -30
View File
@@ -47,7 +47,33 @@ class LexerNode(Node):
if not isinstance(other, LexerNode):
return False
return self.start == other.start and self.end == other.end
return self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.tokens == other.tokens
class UnrecognizedTokensNode(LexerNode):
    """Lexer node that accumulates a run of consecutive tokens which no
    concept grammar was able to recognize.

    The node grows token by token via :meth:`add_token`; once the run is
    closed, :meth:`fix_source` rebuilds the textual ``source`` from the
    collected tokens.
    """

    def __init__(self, start, end, tokens):
        super().__init__(start, end, tokens)

    def add_token(self, token, pos):
        """Append *token* to the run and advance the end marker to *pos*."""
        self.tokens.append(token)
        self.end = pos

    def fix_source(self):
        """Recompute ``self.source`` as the text of the accumulated tokens."""
        self.source = BaseParser.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        # Equality is positional and textual only; the raw token list is
        # deliberately not compared (``source`` is derived from it).
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return (self.start, self.end, self.source) == \
               (other.start, other.end, other.source)

    def __repr__(self):
        return (f"UnrecognizedTokensNode(start={self.start}, "
                f"end={self.end}, source='{self.source}')")
class ConceptNode(LexerNode):
@@ -74,13 +100,15 @@ class ConceptNode(LexerNode):
self.end == other[2] and \
self.source == other[3]
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, ConceptNode):
return False
return self.concept == other.concept and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.underlying == other.underlying
@@ -110,8 +138,8 @@ class NonTerminalNode(LexerNode):
return name + sub_names
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, NonTerminalNode):
return False
@@ -140,8 +168,8 @@ class TerminalNode(LexerNode):
return name + f"'{self.value}'"
def __eq__(self, other):
if not super().__eq__(other):
return False
# if not super().__eq__(other):
# return False
if not isinstance(other, TerminalNode):
return False
@@ -699,6 +727,9 @@ class ConceptLexerParser(BaseParser):
self.reset_parser(context, text)
concepts_found = [[]]
unrecognized_tokens = None
has_unrecognized = False
# actually list of list
# The first dimension is the number of possibilities found
# The second dimension is the number of concepts found, under one possibility
@@ -716,6 +747,7 @@ class ConceptLexerParser(BaseParser):
while True:
init_pos = self.pos
res = []
for concept, grammar in self.concepts_grammars.items():
self.seek(init_pos)
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
@@ -731,31 +763,31 @@ class ConceptLexerParser(BaseParser):
if len(res) == 0: # not recognized
self.seek(init_pos)
not_recognized = self.get_text_from_tokens(self.get_token())
self.add_error(self.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body=not_recognized))
break
if unrecognized_tokens:
unrecognized_tokens.add_token(self.get_token(), init_pos)
else:
unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
has_unrecognized = True
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
if not self.next_token(False):
break
# loop
self.seek(res[0].end)
if not self.next_token():
break
else: # some concepts are recognized
if unrecognized_tokens:
unrecognized_tokens.fix_source()
unrecognized_tokens = None
res = self.get_bests(res) # only keep the concepts that eat the more tokens
concepts_found = core.utils.product(concepts_found, res)
# manage when nothing is recognized (or other error)
if self.has_error:
ret = self.sheerka.ret(
self.name,
False,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=text,
body=self.error_sink,
try_parsed=concepts_found[0] if len(concepts_found) == 1 else concepts_found))
self.log_result(context, text, ret)
return ret
# loop
self.seek(res[0].end)
if not self.next_token():
break
# Fix the source if we were working on unrecognized tokens
if unrecognized_tokens:
unrecognized_tokens.fix_source()
# else
# returns as many ReturnValue than choices found
@@ -764,7 +796,7 @@ class ConceptLexerParser(BaseParser):
ret.append(
self.sheerka.ret(
self.name,
True,
not has_unrecognized,
self.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
+3
View File
@@ -248,6 +248,9 @@ class DefaultParser(BaseParser):
# Regroup the tokens by parts
first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
if first_token.type == TokenKind.EOF:
return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
# get the name
concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)