"""BNF grammar-expression parser.

Transforms a BNF-like token stream into a tree of parsing-expression
nodes (OrderedChoice / Sequence / Optional / ZeroOrMore / OneOrMore /
ConceptExpression / StrMatch) via recursive descent with one token of
lookahead.
"""
from dataclasses import dataclass

import core.utils
from core.builtin_concepts import BuiltinConcepts
from core.sheerka.Sheerka import ExecutionContext
from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
from parsers.BnfNodeParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptExpression, \
    StrMatch


@dataclass()
class UnexpectedEndOfFileError(ErrorNode):
    """Error node recorded when the token stream ends mid-expression."""
    pass


class BnfParser(BaseParser):
    """
    Parser used to transform a literal into a ParsingExpression.

    example : a | b, c -> Sequence(OrderedChoice(a, b) ,c)

    '|' (pipe) is used for OrderedChoice
    ',' (comma) is used for Sequence
    '?' (question mark) is used for Optional
    '*' (star) is used for ZeroOrMore
    '+' (plus) is used for OneOrMore

    Recursive-descent parser with one token of lookahead
    (``after_current``).  Grammar, from lowest to highest precedence:
    choice -> sequence -> modifier -> expression.  Errors are not raised;
    they are appended to ``self.error_sink`` (inherited from
    ``BaseParser`` — NOTE(review): assumed, not visible here) and parsing
    continues best-effort.
    """

    def __init__(self, **kwargs):
        # "Bnf" parser name, priority 50 — NOTE(review): the meaning of the
        # positional args is defined by BaseParser.__init__, not visible here.
        super().__init__("Bnf", 50, False)
        # self.error_sink = []
        # self.name = BaseParser.PREFIX + "Bnf"
        self.lexer_iter = None      # iterator over Token objects (set by reset_parser)
        self._current = None        # token under the cursor
        self.after_current = None   # one-token lookahead buffer (peeked but not consumed)
        self.nb_open_par = 0        # depth of currently open '(' groups
        self.context = None         # ExecutionContext of the current parse
        self.source = ""            # concatenation of the raw text of every consumed token
        self.sheerka = None         # shortcut for context.sheerka

    def __eq__(self, other):
        # All BnfParser instances compare equal (the parser is stateless
        # between parses as far as identity is concerned).
        # NOTE(review): defining __eq__ without __hash__ makes instances
        # unhashable — confirm no caller puts parsers in sets/dict keys.
        if not isinstance(other, BnfParser):
            return False
        return True

    def reset_parser(self, context, text):
        """Re-initialise all per-parse state and prime the first token.

        ``text`` may be a raw string (tokenised here) or an already
        tokenised iterable of Token objects.
        """
        self.context = context
        self.sheerka = context.sheerka
        self.lexer_iter = iter(Tokenizer(text.strip())) if isinstance(text, str) else iter(text)
        self._current = None
        self.after_current = None
        self.nb_open_par = 0
        self.next_token()
        self.eat_white_space()

    def add_error(self, error, next_token=True):
        """Record ``error`` in the sink; optionally skip the offending token.

        Returns the error so callers can ``return self.add_error(...)``.
        """
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        # Current token, or None after eat_white_space() exhausted the stream.
        return self._current

    def next_token(self, skip_whitespace=False):
        """Advance the cursor by one token, draining the lookahead buffer first.

        Appends each consumed token's raw text to ``self.source``.  Once at
        EOF the cursor is pinned there (repeated calls are no-ops).  On
        exhaustion the cursor becomes a synthetic EOF token (contrast with
        eat_white_space(), which sets it to None instead).
        """
        if self._current and self._current.type == TokenKind.EOF:
            return
        try:
            self._current = self.after_current or next(self.lexer_iter)
            self.source += self._current.str_value
            self.after_current = None
            if skip_whitespace:
                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                    self._current = next(self.lexer_iter)
                    self.source += self._current.str_value
        except StopIteration:
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def next_after(self):
        """Peek one token past the cursor without consuming it.

        The peeked token is cached in ``after_current`` (not added to
        ``self.source`` until actually consumed by next_token()); at end of
        stream a synthetic EOF token is cached and returned.
        """
        if self.after_current is not None:
            return self.after_current
        try:
            self.after_current = next(self.lexer_iter)
            # self.source += self.after_current.str_value
            return self.after_current
        except StopIteration:
            self.after_current = Token(TokenKind.EOF, "", -1, -1, -1)
            return self.after_current

    def eat_white_space(self):
        """Consume the pending lookahead (if any), then skip whitespace/newlines.

        NOTE(review): on exhaustion this sets ``_current`` to None, while
        next_token() sets a synthetic EOF token — callers below check both
        (``token is None or token.type == EOF``); confirm this asymmetry is
        intentional before unifying.
        """
        if self.after_current is not None:
            self._current = self.after_current
            self.source += self._current.str_value
            self.after_current = None
        try:
            while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                self._current = next(self.lexer_iter)
                self.source += self._current.str_value
        except StopIteration:
            self._current = None

    def maybe_sequence(self, first, second):
        """True if the stream looks like ``second`` or ``first second``.

        Used to detect e.g. '|' possibly preceded by whitespace, using one
        token of lookahead.  Note the precedence: ``== second or (== first
        and lookahead == second)``.
        """
        token = self.get_token()
        return token.type == second or token.type == first and self.next_after().type == second

    def parse(self, context: ExecutionContext, parser_input):
        """Entry point: parse ``parser_input`` into an expression tree.

        Lexer errors are recorded rather than propagated.  Leftover tokens
        after a successful top-level choice are reported as unexpected.
        Returns whatever ``sheerka.ret`` wraps the result into; success is
        ``not self.has_error`` (inherited — NOTE(review): assumed defined
        by BaseParser).
        """
        tree = None
        try:
            self.reset_parser(context, parser_input)
            tree = self.parse_choice()
            token = self.get_token()
            if token and token.type != TokenKind.EOF:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, []))
        except LexerError as e:
            self.add_error(e, False)
        value = self.get_return_value_body(context.sheerka, self.source, tree, tree)
        ret = self.sheerka.ret(self.name, not self.has_error, value)
        return ret

    def parse_choice(self):
        """
        a | b | c

        Lowest-precedence rule.  A single alternative is returned as-is
        (no OrderedChoice wrapper).
        :return: parsed expression (possibly tagged with a rule name)
        """
        sequence = self.parse_sequence()
        self.eat_white_space()
        token = self.get_token()
        if token is None or token.type != TokenKind.VBAR:
            return sequence
        elements = [sequence]
        while True:
            # maybe eat the vertical bar
            self.eat_white_space()
            token = self.get_token()
            if token is None or token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)
            sequence = self.parse_sequence()
            elements.append(sequence)
        return self.eat_rule_name_if_needed(OrderedChoice(*elements))

    def parse_sequence(self):
        """
        a b c

        Collects adjacent modifier-expressions into a Sequence.  The
        sequence ends at EOF, '=', a (possibly whitespace-preceded) '|',
        or — only inside parentheses — a (possibly whitespace-preceded) ')'.
        A single element is returned unwrapped.
        :return: parsed expression (possibly tagged with a rule name)
        """
        expr_and_modifier = self.parse_modifier()
        token = self.get_token()
        if token is None or \
                token.type == TokenKind.EOF or \
                token.type == TokenKind.EQUALS or \
                self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
            return expr_and_modifier
        elements = [expr_and_modifier]
        while True:
            # Same termination condition as above, re-checked after each element.
            token = self.get_token()
            if token is None or \
                    token.type == TokenKind.EOF or \
                    token.type == TokenKind.EQUALS or \
                    self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                    self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR):
                break
            self.eat_white_space()
            sequence = self.parse_modifier()
            elements.append(sequence)
        return self.eat_rule_name_if_needed(Sequence(*elements))

    def parse_modifier(self):
        """
        a? | a* | a+

        Wraps the primary expression in Optional / ZeroOrMore / OneOrMore
        when followed by '?', '*' or '+'.
        :return: the (possibly wrapped) expression
        """
        expression = self.parse_expression()
        token = self.get_token()
        if token.type == TokenKind.QMARK:
            self.next_token()
            return self.eat_rule_name_if_needed(Optional(expression))
        if token.type == TokenKind.STAR:
            self.next_token()
            return self.eat_rule_name_if_needed(ZeroOrMore(expression))
        if token.type == TokenKind.PLUS:
            self.next_token()
            return self.eat_rule_name_if_needed(OneOrMore(expression))
        return expression

    def parse_expression(self):
        """Parse a primary expression.

        Handles, in order: parenthesised sub-choice, CONCEPT tokens,
        identifiers/keywords resolved to concepts (with special-casing for
        a concept still under construction, i.e. recursive definitions),
        and finally a literal string match.  Returns None when concept
        resolution fails (the error is recorded in the sink).
        """
        token = self.get_token()
        if token.type == TokenKind.EOF:
            # NOTE(review): records the error but then falls through to the
            # StrMatch branch on the empty EOF token — confirm intended.
            self.add_error(UnexpectedEndOfFileError(), False)
        if token.type == TokenKind.LPAR:
            self.nb_open_par += 1
            self.next_token()
            expr = self.parse_choice()
            token = self.get_token()
            if token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
                return self.eat_rule_name_if_needed(expr)
            else:
                # Unbalanced '(' — report it but keep the sub-expression.
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, [TokenKind.RPAR]))
                return expr
        if token.type == TokenKind.CONCEPT:
            self.next_token()
            # token.value is presumably a (namespace, name)-like pair — TODO confirm.
            concept = self.sheerka.new((token.value[0], token.value[1]))
            expr = ConceptExpression(concept)
            # expr = ConceptGroupExpression(concept) if self.sheerka.isaset(self.context, concept) \
            #     else ConceptExpression(concept)
            return self.eat_rule_name_if_needed(expr)
        if token.type in (TokenKind.IDENTIFIER, TokenKind.KEYWORD):
            self.next_token()
            concept_name = token.str_value
            # we are trying to match against a concept which is still under construction !
            # (for example of recursive bnf definition)
            if self.context.obj and hasattr(self.context.obj, "name"):
                if concept_name == str(self.context.obj.name):
                    # Refer to it by name only; the concept object doesn't exist yet.
                    return self.eat_rule_name_if_needed(ConceptExpression(concept_name))
            concept = self.context.get_concept(concept_name)
            if not self.sheerka.is_known(concept):
                # get_concept returned an error node — forward it.
                self.add_error(concept)
                return None
            elif hasattr(concept, "__iter__"):
                # Ambiguous lookup (multiple matches) — cannot resolve.
                self.add_error(
                    self.sheerka.new(BuiltinConcepts.CANNOT_RESOLVE_CONCEPT, body=("key", concept_name)))
                return None
            else:
                expr = ConceptExpression(concept)
                expr.rule_name = concept.name
                return self.eat_rule_name_if_needed(expr)
        # Fallback: treat any other token as a quoted literal string match.
        ret = StrMatch(core.utils.strip_quotes(token.value))
        self.next_token()
        return self.eat_rule_name_if_needed(ret)

    def eat_rule_name_if_needed(self, expression):
        """Consume an optional ``= name`` suffix and tag ``expression`` with it.

        Returns the expression unchanged when no '=' follows; returns an
        error node when '=' is not followed by an identifier.
        """
        token = self.get_token()
        if token is None or token.type != TokenKind.EQUALS:
            return expression
        self.next_token()  # eat equals
        token = self.get_token()
        if token is None or token.type != TokenKind.IDENTIFIER:
            return self.add_error(
                UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, [TokenKind.IDENTIFIER]))
        expression.rule_name = token.value
        self.next_token()
        return expression