from dataclasses import dataclass

import core.utils
from core.builtin_concepts import BuiltinConcepts
from core.sheerka import ExecutionContext
from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
from parsers.ConceptLexerParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptMatch, StrMatch


@dataclass()
class UnexpectedEndOfFileError(ErrorNode):
    """Error node emitted when the token stream ends mid-expression."""
    pass


class BnfParser:
    """
    Parser used to transform litteral into ParsingExpression
    example : a | b, c -> Sequence(OrderedChoice(a, b) ,c)
    '|' (pipe) is used for OrderedChoice
    ',' (comma) is used for Sequence
    '?' (question mark) is used for Optional
    '*' (star) is used for ZeroOrMore
    '+' (plus) is used for OneOrMore
    """

    def __init__(self):
        # Error state accumulated over a single parse() call.
        self.has_error = False
        self.error_sink = []
        self.name = BaseParser.PREFIX + "Bnf"
        # One-token lookahead machinery: `_current` is the token under the
        # cursor, `after_current` is a peeked-but-not-consumed token.
        self.lexer_iter = None
        self._current = None
        self.after_current = None
        # Depth of '(' nesting; parse_sequence only treats ')' as a
        # terminator while at least one group is open.
        self.nb_open_par = 0
        self.context = None
        # Raw text of every token consumed so far (for error reporting).
        self.source = ""
        self.sheerka = None

    def __eq__(self, other):
        # All BnfParser instances are interchangeable: equality is by type
        # only, deliberately ignoring transient parse state.
        if not isinstance(other, BnfParser):
            return False
        return True

    def reset_parser(self, context, text):
        """Re-initialise all per-parse state and prime the first token.

        `text` may be a raw string (tokenized here) or an already-built
        token iterable.
        """
        self.context = context
        self.sheerka = context.sheerka
        self.lexer_iter = iter(Tokenizer(text.strip())) if isinstance(text, str) else iter(text)
        self._current = None
        self.after_current = None
        self.nb_open_par = 0
        self.next_token()
        self.eat_white_space()

    def add_error(self, error, next_token=True):
        """Record `error`, optionally advancing past the offending token.

        Returns the error so callers can `return self.add_error(...)`.
        """
        self.has_error = True
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        """Return the token currently under the cursor (may be EOF sentinel)."""
        return self._current

    def next_token(self, skip_whitespace=False):
        """Advance the cursor by one token, consuming any pending lookahead.

        Every consumed token's text is appended to `self.source`. Once EOF
        is reached the cursor is pinned there (idempotent).
        """
        if self._current and self._current.type == TokenKind.EOF:
            return
        try:
            # Prefer the peeked token if next_after() already pulled one.
            self._current = self.after_current or next(self.lexer_iter)
            self.source += str(self._current.value)
            self.after_current = None
            if skip_whitespace:
                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                    self._current = next(self.lexer_iter)
                    self.source += str(self._current.value)
        except StopIteration:
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def next_after(self):
        """Peek one token past the cursor without consuming it.

        The peeked token is cached in `after_current`; its text is only
        added to `self.source` when it is actually consumed later.
        """
        if self.after_current is not None:
            return self.after_current
        try:
            self.after_current = next(self.lexer_iter)
            return self.after_current
        except StopIteration:
            self.after_current = Token(TokenKind.EOF, "", -1, -1, -1)
            return self.after_current

    def eat_white_space(self):
        """Consume pending lookahead, then skip whitespace/newline tokens."""
        if self.after_current is not None:
            self._current = self.after_current
            self.source += str(self._current.value)
            self.after_current = None
        try:
            while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                self._current = next(self.lexer_iter)
                self.source += str(self._current.value)
        except StopIteration:
            # BUGFIX: was `self._current = None`, which made get_token()
            # return None and crashed parse_modifier/parse_expression on
            # `token.type`. Use the same EOF sentinel as next_token(); all
            # existing `token is None or token.type != X` guards evaluate
            # identically with it.
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def maybe_sequence(self, first, second):
        """True if the cursor is at `second`, or at `first` followed by `second`.

        Used to detect separators that may be preceded by whitespace.
        """
        token = self.get_token()
        # Parentheses added for clarity; identical to the original
        # precedence (`and` binds tighter than `or`).
        return token.type == second or (token.type == first and self.next_after().type == second)

    def parse(self, context: ExecutionContext, text):
        """Parse `text` into a ParsingExpression tree.

        Returns a sheerka PARSER_RESULT whose body is either the tree or,
        on failure, the collected error sink (with the partial tree kept
        in `try_parsed`). Lexer errors are reported, not raised.
        """
        tree = None
        try:
            self.reset_parser(context, text)
            tree = self.parser_outer_rule_name()
            token = self.get_token()
            if token and token.type != TokenKind.EOF:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
        except LexerError as e:
            self.add_error(e, False)
        ret = self.sheerka.ret(
            self.name,
            not self.has_error,
            self.sheerka.new(
                BuiltinConcepts.PARSER_RESULT,
                parser=self,
                source=self.source,
                body=self.error_sink if self.has_error else tree,
                try_parsed=tree))
        return ret

    def parser_outer_rule_name(self):
        """Top-level entry: a choice, optionally named via '= identifier'."""
        return self.parser_rule_name(self.parse_choice)

    def parse_choice(self):
        """Parse `seq ('|' seq)*`; a single alternative is returned unwrapped."""
        sequence = self.parse_sequence()
        self.eat_white_space()
        token = self.get_token()
        if token is None or token.type != TokenKind.VBAR:
            return sequence
        elements = [sequence]
        while True:
            # maybe eat the vertical bar
            self.eat_white_space()
            token = self.get_token()
            if token is None or token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)
            sequence = self.parse_sequence()
            elements.append(sequence)
        return OrderedChoice(*elements)

    def parse_sequence(self):
        """Parse adjacent expressions into a Sequence; single items unwrapped.

        A sequence ends at EOF, '=', a (possibly whitespace-preceded) '|',
        or — only inside parentheses — a (possibly whitespace-preceded) ')'.
        """
        expr_and_modifier = self.parse_modifier()
        token = self.get_token()
        if token is None or \
                token.type == TokenKind.EOF or \
                token.type == TokenKind.EQUALS or \
                self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                (self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR)):
            return expr_and_modifier
        elements = [expr_and_modifier]
        while True:
            token = self.get_token()
            if token is None or \
                    token.type == TokenKind.EOF or \
                    token.type == TokenKind.EQUALS or \
                    self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
                    (self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR)):
                break
            self.eat_white_space()
            sequence = self.parse_modifier()
            elements.append(sequence)
        return Sequence(*elements)

    def parse_modifier(self):
        """Parse an expression and an optional trailing '?', '*' or '+'."""
        expression = self.parser_inner_rule_name()
        token = self.get_token()
        if token.type == TokenKind.QMARK:
            self.next_token()
            return Optional(expression)
        if token.type == TokenKind.STAR:
            self.next_token()
            return ZeroOrMore(expression)
        if token.type == TokenKind.PLUS:
            self.next_token()
            return OneOrMore(expression)
        return expression

    def parser_inner_rule_name(self):
        """An atomic expression, optionally named via '= identifier'."""
        return self.parser_rule_name(self.parse_expression)

    def parse_expression(self):
        """Parse an atom: a parenthesised choice, an identifier (ConceptMatch),
        or anything else treated as a quoted string literal (StrMatch).

        At EOF an UnexpectedEndOfFileError is recorded but parsing still
        falls through to the StrMatch branch with the empty EOF value.
        """
        token = self.get_token()
        if token.type == TokenKind.EOF:
            self.add_error(UnexpectedEndOfFileError(), False)
        if token.type == TokenKind.LPAR:
            self.nb_open_par += 1
            self.next_token()
            expression = self.parse_choice()
            token = self.get_token()
            if token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
                return expression
            else:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.RPAR]))
                return expression
        if token.type == TokenKind.IDENTIFIER:
            self.next_token()
            return ConceptMatch(token.value)
        # Removed dead commented-out concept-resolution code that referenced
        # CannotResolveConceptNode, a name not imported in this module.
        ret = StrMatch(core.utils.strip_quotes(token.value))
        self.next_token()
        return ret

    def parser_rule_name(self, next_to_parse):
        """Parse `next_to_parse()` and attach a rule name if followed by
        '= identifier'; an '=' not followed by an identifier is an error.
        """
        expression = next_to_parse()
        token = self.get_token()
        if token is None or token.type != TokenKind.EQUALS:
            return expression
        self.next_token()  # eat equals
        token = self.get_token()
        if token is None or token.type != TokenKind.IDENTIFIER:
            return self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.IDENTIFIER]))
        expression.rule_name = token.value
        self.next_token()
        return expression