from dataclasses import dataclass
from common.global_symbols import NotFound
from core.error import ErrorObj
from parsers.parser_utils import SimpleParser, UnexpectedEof, UnexpectedToken
from parsers.peg_parser import ConceptExpression, OneOrMore, Optional, OrderedChoice, RegExMatch, Sequence, StrMatch, \
    VariableExpression, ZeroOrMore
from parsers.tokenizer import TokenKind, Tokenizer


@dataclass
class UnknownConcept(ErrorObj):
    """Error raised when a concept referenced in a BNF definition cannot be resolved."""
    concept_id: str
    concept_name: str

    def get_error_msg(self) -> str:
        return f"Cannot find concept defined by id='{self.concept_id}' and/or name '{self.concept_name}'"


class BnfDefinitionParser(SimpleParser):
    """
    Parser used to transform literal into ParsingExpression
    example : a | b c -> Sequence(OrderedChoice(a, b), c)
    '|' (pipe) is used for OrderedChoice
    ' ' space is used for Sequence
    '?' (question mark) is used for Optional
    '*' (star) is used for ZeroOrMore
    '+' (plus) is used for OneOrMore

    Grammar (highest precedence last):
        choice     := sequence ( '|' sequence )*
        sequence   := modifier+
        modifier   := expression ( '?' | '*' | '+' )?
        expression := '(' choice ')' | concept | identifier | string | regex
    Any expression may be followed by '=' identifier to name the resulting rule.
    """

    def __init__(self, context, text, concept_name=None):
        """
        :param context: project context; used to resolve concept names/ids via context.sheerka
        :param text: the BNF definition literal to parse
        :param concept_name: name of the concept currently being constructed
                             (allows recursive references before its id exists)
        """
        # Whitespace is significant (it separates Sequence elements), so do not
        # skip it by default; individual calls opt in via skip_whitespace=True.
        super().__init__(text, skip_whitespace_default_behaviour=False)
        self.context = context
        self.concept_name = concept_name  # name of the concept currently being constructed
        self.nb_open_par = 0  # depth of currently open parentheses
        self.next_token(skip_whitespace=True)

    def maybe_sequence(self, first, second):
        """Return True if the current token is `second`, or is `first`
        immediately followed by `second` (one-token lookahead)."""
        return self.token.type == second or \
            self.token.type == first and self.check_next_token().type == second

    def parse(self):
        """Parse the whole text into a ParsingExpression tree.

        :return: the root ParsingExpression, or None if any error was recorded.
        """
        tree = self._parse_choice()
        if self.token.type != TokenKind.EOF:
            self.add_error(UnexpectedToken(self.token, TokenKind.EOF))
        return None if self.error_sink else tree

    def _parse_choice(self):
        """
        choice := sequence ( '|' sequence )*

        :return: the single sequence when no '|' is present, otherwise an
                 OrderedChoice over all alternatives.
        """
        sequence = self._parse_sequence()
        self.eat_whitespace()
        if self.token.type != TokenKind.VBAR:
            return sequence
        elements = [sequence]
        while True:
            # maybe eat the vertical bar
            self.eat_whitespace()
            if self.token is None or self.token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)
            sequence = self._parse_sequence()
            elements.append(sequence)
        return self._eat_rule_name_if_needed(OrderedChoice(*elements))

    def _parse_sequence(self):
        """
        sequence := modifier+  (elements separated by whitespace)

        Stops at EOF, '=', '|' (possibly after whitespace), or — inside
        parentheses — ')' (possibly after whitespace).

        :return: the single element when there is only one, otherwise a Sequence.
        """
        expr_and_modifier = self._parse_modifier()
        if self.token.type == TokenKind.EOF or \
                self.token.type == TokenKind.EQUALS or \
                self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
            return expr_and_modifier
        elements = [expr_and_modifier]
        while True:
            if self.token is None or \
                    self.token.type == TokenKind.EOF or \
                    self.token.type == TokenKind.EQUALS or \
                    self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                    self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
                break
            self.eat_whitespace()
            sequence = self._parse_modifier()
            elements.append(sequence)
        return self._eat_rule_name_if_needed(Sequence(*elements))

    def _parse_modifier(self):
        """
        modifier := expression ( '?' | '*' | '+' )?

        :return: the expression, possibly wrapped in Optional ('?'),
                 ZeroOrMore ('*') or OneOrMore ('+').
        """
        expression = self._parse_expression()
        if self.token.type == TokenKind.QMARK:
            self.next_token()
            return self._eat_rule_name_if_needed(Optional(expression))
        if self.token.type == TokenKind.STAR:
            self.next_token()
            return self._eat_rule_name_if_needed(ZeroOrMore(expression))
        if self.token.type == TokenKind.PLUS:
            self.next_token()
            return self._eat_rule_name_if_needed(OneOrMore(expression))
        return expression

    def _parse_expression(self):
        """
        expression := '(' choice ')' | concept | identifier | string | regex

        :return: the parsed ParsingExpression, or None when an error was recorded.
        """
        if self.token.type == TokenKind.EOF:
            # Fixed typo in the expected-tokens message ("regrex" -> "regex"),
            # and return early: continuing would call strip_quote/next_token on
            # the EOF token below. Returning None mirrors the UnknownConcept
            # error branch, whose None result callers already tolerate.
            self.add_error(UnexpectedEof("lpar | concept | ident | string | regex", self.token))
            return None
        if self.token.type == TokenKind.LPAR:
            self.nb_open_par += 1
            self.next_token()
            expr = self._parse_choice()
            if self.token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
                return self._eat_rule_name_if_needed(expr)
            else:
                self.add_error(UnexpectedToken(self.token, TokenKind.RPAR))
                return expr
        if self.token.type == TokenKind.CONCEPT:
            concept_name, concept_id = self.token.value
            # Resolve by id when available, falling back to name lookup.
            metadata = self.context.sheerka.get_by_id(concept_id) if concept_id \
                else self.context.sheerka.get_by_name(concept_name)
            if metadata is NotFound:
                self.add_error(UnknownConcept(concept_id, concept_name))
                self.next_token()
                return None
            expr = ConceptExpression(metadata.id, rule_name=metadata.name)
            self.next_token()
            return self._eat_rule_name_if_needed(expr)
        if self.token.type == TokenKind.IDENTIFIER:
            concept_name = self.token.str_value
            if concept_name == self.concept_name:
                # recursive construction, the concept id is not known yet
                expr = ConceptExpression(None, rule_name=concept_name)
            elif (metadata := self.context.sheerka.get_by_name(concept_name)) is NotFound:
                # unknown concept, it's a variable definition
                expr = VariableExpression(concept_name)
            else:
                expr = ConceptExpression(metadata.id, rule_name=concept_name)
            self.next_token()
            return self._eat_rule_name_if_needed(expr)
        if self.token.type == TokenKind.STRING:
            # Re-tokenize the string content: a multi-word string becomes a
            # Sequence of whitespace-insensitive StrMatch elements.
            tokens = list(Tokenizer(self.token.strip_quote, yield_eof=False))
            if len(tokens) == 1:
                self.next_token()
                return self._eat_rule_name_if_needed(StrMatch(tokens[0].str_value))
            else:
                elements = [StrMatch(t.str_value, skip_whitespace=False) for t in tokens]
                # NOTE(review): attribute name differs from the constructor
                # kwarg ('skip_white_space' vs 'skip_whitespace') — confirm
                # against StrMatch; if the attribute is 'skip_whitespace' this
                # line is a silent no-op.
                elements[-1].skip_white_space = True
                ret = Sequence(*elements)
                self.next_token()
                return self._eat_rule_name_if_needed(ret)
        if self.token.type == TokenKind.REGEX:
            ret = RegExMatch(self.token.strip_quote)
            self.next_token()
            return self._eat_rule_name_if_needed(ret)
        # Fallback: treat any other token as a literal match of its text.
        ret = StrMatch(self.token.strip_quote)
        self.next_token()
        return self._eat_rule_name_if_needed(ret)

    def _eat_rule_name_if_needed(self, expression):
        """If the next token is '=', consume '=' identifier and use the
        identifier as the expression's rule name.

        :return: the (possibly renamed) expression, or the add_error result
                 when the '=' is not followed by an identifier.
        """
        if self.token.type == TokenKind.EQUALS:
            self.next_token()  # eat equals
            if self.token.type != TokenKind.IDENTIFIER:
                return self.add_error(UnexpectedToken(self.token, TokenKind.IDENTIFIER))
            # NOTE(review): other identifier sites read token.str_value;
            # confirm .value is intended here.
            expression.rule_name = self.token.value
            self.next_token()
        return expression