# Sheerka/src/parsers/BnfDefinitionParser.py
# (213 lines, 7.2 KiB, Python)
from dataclasses import dataclass
from common.global_symbols import NotFound
from core.error import ErrorObj
from parsers.parser_utils import SimpleParser, UnexpectedEof, UnexpectedToken
from parsers.peg_parser import ConceptExpression, OneOrMore, Optional, OrderedChoice, RegExMatch, Sequence, StrMatch, \
VariableExpression, ZeroOrMore
from parsers.tokenizer import TokenKind, Tokenizer
@dataclass
class UnknownConcept(ErrorObj):
    """Error reported when a concept reference cannot be resolved.

    Carries both identifiers used for the failed lookup so the message
    can show whichever one(s) the definition supplied.
    """

    # Lookup keys that failed to resolve (either may be empty/None).
    concept_id: str
    concept_name: str

    def get_error_msg(self) -> str:
        """Build the human-readable error message for this failure."""
        message = f"Cannot find concept defined by id='{self.concept_id}' and/or name '{self.concept_name}'"
        return message
class BnfDefinitionParser(SimpleParser):
    """
    Parser used to transform literal into ParsingExpression
    example :
    a | b c -> Sequence(OrderedChoice(a, b), c)
    '|' (pipe) is used for OrderedChoice
    ' ' space is used for Sequence
    '?' (question mark) is used for Optional
    '*' (star) is used for ZeroOrMore
    '+' (plus) is used for OneOrMore
    """

    def __init__(self, context, text, concept_name=None):
        """
        :param context: project context; used here only for concept lookups
            via ``context.sheerka.get_by_id`` / ``get_by_name``
        :param text: the definition text to parse
        :param concept_name: name of the concept currently being constructed,
            used to recognize recursive self-references whose id is not known yet
        """
        # Whitespace is meaningful in this grammar (it separates Sequence
        # elements), so the tokenizer must not skip it by default.
        super().__init__(text, skip_whitespace_default_behaviour=False)
        self.context = context
        self.concept_name = concept_name  # name of the concept currently being constructed
        self.nb_open_par = 0  # depth of currently open '(' groups
        self.next_token(skip_whitespace=True)  # prime the first token

    def maybe_sequence(self, first, second):
        """Return True if the current token is ``second``, or is ``first``
        immediately followed by ``second`` (one-token lookahead).

        Typically used as maybe_sequence(WHITESPACE, X) to detect X with an
        optional leading whitespace token.
        """
        return self.token.type == second or \
            self.token.type == first and self.check_next_token().type == second

    def parse(self):
        """Parse the whole text into a ParsingExpression tree.

        :return: the parsed tree, or None if any error was recorded
            (in ``self.error_sink``) during parsing.
        """
        tree = self._parse_choice()
        # Anything left over after the top-level choice is an error.
        if self.token.type != TokenKind.EOF:
            self.add_error(UnexpectedToken(self.token, TokenKind.EOF))
        return None if self.error_sink else tree

    def _parse_choice(self):
        """
        a | b | c
        <choice> := <sequence> ( '|' <sequence> )*
        :return: the single sequence when no '|' follows, otherwise an
            OrderedChoice over all '|'-separated sequences (possibly with a
            trailing '= name' rule name attached).
        """
        sequence = self._parse_sequence()
        self.eat_whitespace()
        # Fast path: no '|' means this is not a choice at all.
        if self.token.type != TokenKind.VBAR:
            return sequence
        elements = [sequence]
        while True:
            # maybe eat the vertical bar
            self.eat_whitespace()
            if self.token is None or self.token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)
            sequence = self._parse_sequence()
            elements.append(sequence)
        return self._eat_rule_name_if_needed(OrderedChoice(*elements))

    def _parse_sequence(self):
        """
        a b c
        :return: a single modifier expression when nothing follows it,
            otherwise a Sequence of all space-separated expressions.
        """
        expr_and_modifier = self._parse_modifier()
        # Stop conditions: end of input, a '= name' rule-name marker, the next
        # alternative of an enclosing choice, or — only inside parentheses —
        # the closing ')'. Each may be preceded by a whitespace token.
        if self.token.type == TokenKind.EOF or \
                self.token.type == TokenKind.EQUALS or \
                self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
            return expr_and_modifier
        elements = [expr_and_modifier]
        while True:
            # Same stop conditions as above, re-checked after each element.
            if self.token is None or \
                    self.token.type == TokenKind.EOF or \
                    self.token.type == TokenKind.EQUALS or \
                    self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                    self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
                break
            self.eat_whitespace()
            sequence = self._parse_modifier()
            elements.append(sequence)
        return self._eat_rule_name_if_needed(Sequence(*elements))

    def _parse_modifier(self):
        """
        a? | a* | a+
        :return: the inner expression, possibly wrapped in Optional ('?'),
            ZeroOrMore ('*') or OneOrMore ('+').
        """
        expression = self._parse_expression()
        if self.token.type == TokenKind.QMARK:
            self.next_token()
            return self._eat_rule_name_if_needed(Optional(expression))
        if self.token.type == TokenKind.STAR:
            self.next_token()
            return self._eat_rule_name_if_needed(ZeroOrMore(expression))
        if self.token.type == TokenKind.PLUS:
            self.next_token()
            return self._eat_rule_name_if_needed(OneOrMore(expression))
        return expression

    def _parse_expression(self):
        """Parse an atomic expression: a parenthesized group, a concept
        reference, an identifier (concept or variable), a string literal,
        a regex literal, or — as a fallback — a literal string match.
        """
        if self.token.type == TokenKind.EOF:
            # NOTE(review): an error is recorded but parsing falls through to
            # the checks below and ultimately to the StrMatch fallback on the
            # EOF token — confirm this is intentional. (Also note the "regrex"
            # typo in the expected-tokens message; it is a runtime string.)
            self.add_error(UnexpectedEof("lpar | concept | ident | string | regrex", self.token))
        if self.token.type == TokenKind.LPAR:
            # '(' <choice> ')' — nb_open_par lets _parse_sequence treat ')'
            # as a stop token only while at least one group is open.
            self.nb_open_par += 1
            self.next_token()
            expr = self._parse_choice()
            if self.token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
                return self._eat_rule_name_if_needed(expr)
            else:
                # Unbalanced group: report it but still return the inner
                # expression so parsing can continue.
                self.add_error(UnexpectedToken(self.token, TokenKind.RPAR))
            return expr
        if self.token.type == TokenKind.CONCEPT:
            # Explicit concept token: value is a (name, id) pair; the id takes
            # precedence for the lookup when present.
            concept_name, concept_id = self.token.value
            metadata = self.context.sheerka.get_by_id(concept_id) if concept_id \
                else self.context.sheerka.get_by_name(concept_name)
            if metadata is NotFound:
                self.add_error(UnknownConcept(concept_id, concept_name))
                self.next_token()
                return None
            expr = ConceptExpression(metadata.id, rule_name=metadata.name)
            self.next_token()
            return self._eat_rule_name_if_needed(expr)
        if self.token.type == TokenKind.IDENTIFIER:
            concept_name = self.token.str_value
            if concept_name == self.concept_name:
                # recursive construction, the concept id is not known yet
                expr = ConceptExpression(None, rule_name=concept_name)
            elif (metadata := self.context.sheerka.get_by_name(concept_name)) is NotFound:
                # unknown concept, it's a variable definition
                expr = VariableExpression(concept_name)
            else:
                expr = ConceptExpression(metadata.id, rule_name=concept_name)
            self.next_token()
            return self._eat_rule_name_if_needed(expr)
        if self.token.type == TokenKind.STRING:
            # Re-tokenize the string contents: a multi-word literal becomes a
            # Sequence of per-word StrMatch expressions.
            tokens = list(Tokenizer(self.token.strip_quote, yield_eof=False))
            if len(tokens) == 1:
                self.next_token()
                return self._eat_rule_name_if_needed(StrMatch(tokens[0].str_value))
            else:
                # Inner words must match back-to-back (no whitespace skipping);
                # only the last element restores whitespace skipping.
                elements = [StrMatch(t.str_value, skip_whitespace=False) for t in tokens]
                elements[-1].skip_white_space = True
                ret = Sequence(*elements)
                self.next_token()
                return self._eat_rule_name_if_needed(ret)
        if self.token.type == TokenKind.REGEX:
            ret = RegExMatch(self.token.strip_quote)
            self.next_token()
            return self._eat_rule_name_if_needed(ret)
        # Fallback: any other token is treated as a literal string match.
        ret = StrMatch(self.token.strip_quote)
        self.next_token()
        return self._eat_rule_name_if_needed(ret)

    def _eat_rule_name_if_needed(self, expression):
        """If the current token is '=', consume '= <identifier>' and attach
        the identifier to ``expression`` as its rule name.

        :return: ``expression`` (possibly with ``rule_name`` set), or the
            result of ``add_error`` (presumably None — TODO confirm) when '='
            is not followed by an identifier.
        """
        if self.token.type == TokenKind.EQUALS:
            self.next_token()  # eat equals
            if self.token.type != TokenKind.IDENTIFIER:
                return self.add_error(UnexpectedToken(self.token, TokenKind.IDENTIFIER))
            expression.rule_name = self.token.value
            self.next_token()
        return expression