Added ZeroOrMore and OneOrMore to BNF. BNF expressions can now be captured.
This commit is contained in:
+41
-12
@@ -115,7 +115,11 @@ class BnfParser:
|
||||
|
||||
def parse(self, context: ExecutionContext, text):
|
||||
self.reset_parser(context, text)
|
||||
tree = self.parse_choice()
|
||||
tree = self.parser_outer_rule_name()
|
||||
|
||||
token = self.get_token()
|
||||
if token and token.type != TokenKind.EOF:
|
||||
self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
|
||||
|
||||
ret = self.sheerka.ret(
|
||||
self.name,
|
||||
@@ -129,12 +133,15 @@ class BnfParser:
|
||||
|
||||
return ret
|
||||
|
||||
def parser_outer_rule_name(self):
    """
    Parse a choice through ``parser_rule_name``.

    Hands the bound ``parse_choice`` method to ``parser_rule_name`` and
    returns whatever that call returns.
    """
    choice_parser = self.parse_choice
    return self.parser_rule_name(choice_parser)
|
||||
|
||||
def parse_choice(self):
|
||||
sequence = self.parse_sequence()
|
||||
|
||||
self.eat_white_space()
|
||||
token = self.get_token()
|
||||
if token is None or token.type == TokenKind.EOF or token.type != TokenKind.VBAR:
|
||||
if token is None or token.type != TokenKind.VBAR:
|
||||
return sequence
|
||||
|
||||
elements = [sequence]
|
||||
@@ -142,7 +149,7 @@ class BnfParser:
|
||||
# maybe eat the vertical bar
|
||||
self.eat_white_space()
|
||||
token = self.get_token()
|
||||
if token is None or token.type == TokenKind.EOF or token.type != TokenKind.VBAR:
|
||||
if token is None or token.type != TokenKind.VBAR:
|
||||
break
|
||||
self.next_token(skip_whitespace=True)
|
||||
|
||||
@@ -152,30 +159,33 @@ class BnfParser:
|
||||
return OrderedChoice(*elements)
|
||||
|
||||
def parse_sequence(self):
|
||||
expr_and_modifier = self.parse_expression_and_modifier()
|
||||
expr_and_modifier = self.parse_modifier()
|
||||
token = self.get_token()
|
||||
if token is None or token.type == TokenKind.EOF or \
|
||||
if token is None or \
|
||||
token.type == TokenKind.EOF or \
|
||||
token.type == TokenKind.EQUALS or \
|
||||
self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
|
||||
self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
|
||||
return expr_and_modifier
|
||||
|
||||
elements = [expr_and_modifier]
|
||||
while True:
|
||||
# maybe eat the comma
|
||||
token = self.get_token()
|
||||
if token is None or token.type == TokenKind.EOF or \
|
||||
if token is None or \
|
||||
token.type == TokenKind.EOF or \
|
||||
token.type == TokenKind.EQUALS or \
|
||||
self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
|
||||
self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
|
||||
break
|
||||
self.eat_white_space()
|
||||
|
||||
sequence = self.parse_expression_and_modifier()
|
||||
sequence = self.parse_modifier()
|
||||
elements.append(sequence)
|
||||
|
||||
return Sequence(*elements)
|
||||
|
||||
def parse_expression_and_modifier(self):
|
||||
expression = self.parse_expression()
|
||||
def parse_modifier(self):
|
||||
expression = self.parser_inner_rule_name()
|
||||
|
||||
token = self.get_token()
|
||||
|
||||
@@ -193,6 +203,9 @@ class BnfParser:
|
||||
|
||||
return expression
|
||||
|
||||
def parser_inner_rule_name(self):
    """
    Parse a single expression through ``parser_rule_name``.

    Hands the bound ``parse_expression`` method to ``parser_rule_name`` and
    returns whatever that call returns.
    """
    expression_parser = self.parse_expression
    return self.parser_rule_name(expression_parser)
|
||||
|
||||
def parse_expression(self):
|
||||
token = self.get_token()
|
||||
if token.type == TokenKind.EOF:
|
||||
@@ -207,7 +220,7 @@ class BnfParser:
|
||||
self.next_token()
|
||||
return expression
|
||||
else:
|
||||
self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token.type}'", [TokenKind.RPAR]))
|
||||
self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.RPAR]))
|
||||
return expression
|
||||
|
||||
if token.type == TokenKind.IDENTIFIER:
|
||||
@@ -224,4 +237,20 @@ class BnfParser:
|
||||
|
||||
ret = StrMatch(core.utils.strip_quotes(token.value))
|
||||
self.next_token()
|
||||
return ret
|
||||
return ret
|
||||
|
||||
def parser_rule_name(self, next_to_parse):
    """
    Run ``next_to_parse`` and attach a rule name to its result when one follows.

    After the expression is parsed, the current token is inspected:

    * no token, or a token that is not EQUALS: the expression is returned
      untouched;
    * EQUALS followed by an IDENTIFIER: both tokens are consumed, the
      identifier's value is stored in ``expression.rule_name`` and the
      expression is returned;
    * EQUALS followed by anything else: an UnexpectedTokenErrorNode is
      registered and the result of ``add_error`` is returned.

    :param next_to_parse: zero-argument callable producing the expression
    :return: the parsed expression, or the value returned by ``add_error``
    """
    parsed = next_to_parse()

    lookahead = self.get_token()
    followed_by_equals = lookahead is not None and lookahead.type == TokenKind.EQUALS
    if not followed_by_equals:
        return parsed

    self.next_token()  # eat equals
    token = self.get_token()

    if token is not None and token.type == TokenKind.IDENTIFIER:
        parsed.rule_name = token.value
        self.next_token()  # eat the identifier
        return parsed

    return self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.IDENTIFIER]))
|
||||
|
||||
@@ -212,11 +212,11 @@ class Sequence(ParsingExpression):
|
||||
if node is None:
|
||||
return None
|
||||
else:
|
||||
if node.end != -1: # because Optional returns -1 when no match
|
||||
if node.end != -1: # because returns -1 when no match
|
||||
children.append(node)
|
||||
end_pos = node.end
|
||||
|
||||
return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children or [])
|
||||
return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children)
|
||||
|
||||
def __repr__(self):
|
||||
to_str = ", ".join(repr(n) for n in self.elements)
|
||||
@@ -284,31 +284,101 @@ class Optional(ParsingExpression):
|
||||
return f"({to_str})?"
|
||||
|
||||
|
||||
class ZeroOrMore(ParsingExpression):
|
||||
class Repetition(ParsingExpression):
    """
    Common base class of the repetition-like parsing expressions (?, *, +).

    Keyword Args:
        sep: optional separator expression, kept on the instance as
            ``self.sep``; None when the keyword is not supplied.
    """

    def __init__(self, *elements, **kwargs):
        # The keyword arguments are forwarded unchanged to the base class;
        # only 'sep' is additionally remembered here.
        super().__init__(*elements, **kwargs)
        self.sep = kwargs.get('sep')
|
||||
|
||||
|
||||
class ZeroOrMore(Repetition):
    """
    ZeroOrMore will try to match the wrapped parser expression zero or more
    times. It will never fail: when nothing is matched, a node whose end is
    -1 is returned.

    When ``sep`` is set, it has to match between two kept occurrences.
    """

    def _parse(self, parser):
        """
        Repeatedly parse ``self.nodes[0]`` until it stops matching.

        :param parser: the running parser (provides ``pos``, ``seek`` and ``tokens``)
        :return: a NonTerminalNode holding the kept occurrences; its end is
                 -1 and its children are empty when nothing was matched
        """
        init_pos = parser.pos
        end_pos = -1
        children = []

        while True:
            current_pos = parser.pos

            # A separator is only expected once a first occurrence was kept.
            if self.sep and children:
                if self.sep.parse(parser) is None:
                    parser.seek(current_pos)
                    break

            # eat one occurrence of the repeated expression
            node = self.nodes[0].parse(parser)
            if node is None:
                # Roll back whatever the failed attempt (and the separator) consumed.
                parser.seek(current_pos)
                break

            if node.end == -1:  # empty match: returns -1 when no match
                # An empty match that did not move the parser would be
                # produced again on every iteration: stop instead of
                # looping forever.
                if parser.pos == current_pos:
                    break
                continue

            children.append(node)
            end_pos = node.end

        if not children:
            return NonTerminalNode(self, init_pos, -1, [], [])

        return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children)

    def __repr__(self):
        to_str = ", ".join(repr(n) for n in self.elements)
        return f"({to_str})*"
|
||||
|
||||
|
||||
class OneOrMore(ParsingExpression):
|
||||
class OneOrMore(Repetition):
    """
    OneOrMore will try to match the wrapped parser expression one or more
    times. It fails (returns None) when no occurrence is kept.

    When ``sep`` is set, it has to match between two kept occurrences.
    """

    def _parse(self, parser):
        """
        Repeatedly parse ``self.nodes[0]`` until it stops matching.

        :param parser: the running parser (provides ``pos``, ``seek`` and ``tokens``)
        :return: a NonTerminalNode holding the kept occurrences, or None
                 when not a single occurrence was kept
        """
        init_pos = parser.pos
        end_pos = -1
        children = []

        while True:
            current_pos = parser.pos

            # A separator is only expected once a first occurrence was kept.
            if self.sep and children:
                if self.sep.parse(parser) is None:
                    parser.seek(current_pos)
                    break

            # eat one occurrence of the repeated expression
            node = self.nodes[0].parse(parser)
            if node is None:
                # Roll back whatever the failed attempt (and the separator) consumed.
                parser.seek(current_pos)
                break

            if node.end == -1:  # empty match: returns -1 when no match
                # An empty match that did not move the parser would be
                # produced again on every iteration: stop instead of
                # looping forever.
                if parser.pos == current_pos:
                    break
                continue

            children.append(node)
            end_pos = node.end

        if not children:  # if nothing is found, it's an error
            return None

        return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children)

    def __repr__(self):
        to_str = ", ".join(repr(n) for n in self.elements)
        return f"({to_str})+"
|
||||
|
||||
|
||||
class UnorderedGroup(ParsingExpression):
|
||||
class UnorderedGroup(Repetition):
|
||||
"""
|
||||
Will try to match all of the parsing expression in any order.
|
||||
"""
|
||||
@@ -316,6 +386,10 @@ class UnorderedGroup(ParsingExpression):
|
||||
def _parse(self, parser):
|
||||
raise NotImplementedError()
|
||||
|
||||
# def __repr__(self):
|
||||
# to_str = ", ".join(repr(n) for n in self.elements)
|
||||
# return f"({to_str})#"
|
||||
|
||||
|
||||
class Match(ParsingExpression):
|
||||
"""
|
||||
@@ -541,11 +615,18 @@ class ConceptLexerParser(BaseParser):
|
||||
ret.ignore_case = self.ignore_case
|
||||
elif isinstance(expression, Sequence) or \
|
||||
isinstance(expression, OrderedChoice) or \
|
||||
isinstance(expression, ZeroOrMore) or \
|
||||
isinstance(expression, OneOrMore) or \
|
||||
isinstance(expression, Optional):
|
||||
ret = expression
|
||||
ret.nodes.extend([inner_get_model(e) for e in ret.elements])
|
||||
else:
|
||||
ret = self.add_error(GrammarErrorNode(f"Unrecognized grammar element '{expression}'."), False)
|
||||
|
||||
# Translate separator expression.
|
||||
if isinstance(expression, Repetition) and expression.sep:
|
||||
expression.sep = inner_get_model(expression.sep)
|
||||
|
||||
return ret
|
||||
|
||||
model = inner_get_model(concept_def)
|
||||
@@ -623,7 +704,7 @@ class ConceptLexerParser(BaseParser):
|
||||
for concept, grammar in self.concepts_grammars.items():
|
||||
self.seek(init_pos)
|
||||
node = grammar.parse(self) # a node is TerminalNode or NonTerminalNode
|
||||
if node is not None:
|
||||
if node is not None and node.end != -1:
|
||||
concept_node = ConceptNode(
|
||||
concept,
|
||||
node.start,
|
||||
|
||||
Reference in New Issue
Block a user