from dataclasses import dataclass
|
|
|
|
import core.utils
|
|
from core.builtin_concepts import BuiltinConcepts
|
|
from core.sheerka import ExecutionContext
|
|
from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
|
|
from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
|
|
from parsers.ConceptLexerParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptMatch, StrMatch
|
|
|
|
|
|
@dataclass()
class UnexpectedEndOfFileError(ErrorNode):
    """Error node recorded when the input ends while an expression is still expected."""
    pass
|
|
|
|
|
|
class BnfParser(BaseParser):
    """
    Parser used to transform a BNF-like literal into a ParsingExpression.

    Example:
        a | b, c  ->  Sequence(OrderedChoice(a, b), c)

    Operators:
        '|' (pipe)          OrderedChoice
        ',' (comma)         Sequence
        '?' (question mark) Optional
        '*' (star)          ZeroOrMore
        '+' (plus)          OneOrMore
    """

    def __init__(self, **kwargs):
        # BaseParser presumably initializes `name`, `has_error` and
        # `error_sink` (they are used below but never assigned here) —
        # TODO confirm against BaseParser.
        super().__init__("Bnf", 50, False)

        self.lexer_iter = None      # token iterator for the current parse
        self._current = None        # token under the cursor
        self.after_current = None   # one-token lookahead buffer
        self.nb_open_par = 0        # depth of currently open '(' groups
        self.context = None         # ExecutionContext of the current parse
        self.source = ""            # raw text consumed so far (for reporting)
        self.sheerka = None         # shortcut to context.sheerka

    def __eq__(self, other):
        # All BnfParser instances are interchangeable: equality is type-only.
        return isinstance(other, BnfParser)

    def __hash__(self):
        # Defining __eq__ alone would set __hash__ to None and make the
        # parser unusable in sets / as dict keys; hash consistently with
        # the type-only equality above.
        return hash(BnfParser)

    def reset_parser(self, context, text):
        """Prepare internal state for a fresh parse of `text` (str or token iterable)."""
        self.context = context
        self.sheerka = context.sheerka

        self.lexer_iter = iter(Tokenizer(text.strip())) if isinstance(text, str) else iter(text)
        self._current = None
        self.after_current = None
        self.nb_open_par = 0

        # Prime the cursor and skip any leading whitespace/newlines.
        self.next_token()
        self.eat_white_space()

    def add_error(self, error, next_token=True):
        """Record `error` in the sink, optionally advancing past the offending token.

        Returns the error so callers can use it as a result node.
        """
        self.has_error = True
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        """Return the token currently under the cursor (None before priming, EOF at end)."""
        return self._current

    def next_token(self, skip_whitespace=False):
        """Advance the cursor one token, consuming the lookahead buffer first.

        No-op once EOF has been reached. With `skip_whitespace`, also skips
        any WHITESPACE/NEWLINE tokens that follow.
        """
        if self._current and self._current.type == TokenKind.EOF:
            return

        try:
            self._current = self.after_current or next(self.lexer_iter)
            self.source += str(self._current.value)
            self.after_current = None

            if skip_whitespace:
                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                    self._current = next(self.lexer_iter)
                    self.source += str(self._current.value)
        except StopIteration:
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def next_after(self):
        """Peek one token past the cursor without consuming it (cached)."""
        if self.after_current is None:
            try:
                self.after_current = next(self.lexer_iter)
            except StopIteration:
                self.after_current = Token(TokenKind.EOF, "", -1, -1, -1)
        return self.after_current

    def eat_white_space(self):
        """Move the cursor past any pending WHITESPACE/NEWLINE tokens."""
        if self.after_current is not None:
            # Flush the lookahead buffer into the cursor first.
            self._current = self.after_current
            self.source += str(self._current.value)
            self.after_current = None

        try:
            while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                self._current = next(self.lexer_iter)
                self.source += str(self._current.value)
        except StopIteration:
            # Bug fix: use the same EOF sentinel as next_token() instead of
            # None. Callers such as parse_modifier() dereference token.type
            # without a None check and would crash on trailing whitespace.
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def maybe_sequence(self, first, second):
        """True if the cursor is on `second`, or on `first` immediately followed by `second`."""
        token = self.get_token()
        # Parenthesized to make the original `or`/`and` precedence explicit.
        return token.type == second or (token.type == first and self.next_after().type == second)

    def parse(self, context: ExecutionContext, text):
        """Parse `text` and wrap the resulting tree (or errors) in a PARSER_RESULT."""
        tree = None
        try:
            self.reset_parser(context, text)
            tree = self.parser_outer_rule_name()

            # Anything left after a complete expression is an error.
            token = self.get_token()
            if token and token.type != TokenKind.EOF:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", []))
        except LexerError as e:
            self.add_error(e, False)

        return self.sheerka.ret(
            self.name,
            not self.has_error,
            self.sheerka.new(
                BuiltinConcepts.PARSER_RESULT,
                parser=self,
                source=self.source,
                body=self.error_sink if self.has_error else tree,
                try_parsed=tree))

    def parser_outer_rule_name(self):
        """Entry point: a choice, optionally followed by '= rule_name'."""
        return self.parser_rule_name(self.parse_choice)

    def parse_choice(self):
        """Parse 'seq | seq | ...' into an OrderedChoice, or return the bare sequence."""
        sequence = self.parse_sequence()

        self.eat_white_space()
        token = self.get_token()
        if token is None or token.type != TokenKind.VBAR:
            # Single alternative: no OrderedChoice wrapper.
            return sequence

        elements = [sequence]
        while True:
            self.eat_white_space()
            token = self.get_token()
            if token is None or token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)  # eat the '|'

            elements.append(self.parse_sequence())

        return OrderedChoice(*elements)

    def _at_sequence_end(self):
        """True when the cursor sits at a boundary that terminates a sequence.

        Boundaries: EOF, '=', a (possibly whitespace-preceded) '|', or — inside
        a parenthesized group — a (possibly whitespace-preceded) ')'.
        """
        token = self.get_token()
        return token is None or \
            token.type == TokenKind.EOF or \
            token.type == TokenKind.EQUALS or \
            self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
            (self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR))

    def parse_sequence(self):
        """Parse 'expr expr ...' into a Sequence, or return the bare expression."""
        expr_and_modifier = self.parse_modifier()
        if self._at_sequence_end():
            return expr_and_modifier

        elements = [expr_and_modifier]
        while not self._at_sequence_end():
            self.eat_white_space()
            elements.append(self.parse_modifier())

        return Sequence(*elements)

    def parse_modifier(self):
        """Parse a primary expression with an optional postfix '?', '*' or '+'."""
        expression = self.parser_inner_rule_name()

        token = self.get_token()

        if token.type == TokenKind.QMARK:
            self.next_token()
            return Optional(expression)

        if token.type == TokenKind.STAR:
            self.next_token()
            return ZeroOrMore(expression)

        if token.type == TokenKind.PLUS:
            self.next_token()
            return OneOrMore(expression)

        return expression

    def parser_inner_rule_name(self):
        """A primary expression, optionally followed by '= rule_name'."""
        return self.parser_rule_name(self.parse_expression)

    def parse_expression(self):
        """Parse a primary: '( choice )', an identifier, or a quoted literal."""
        token = self.get_token()
        if token.type == TokenKind.EOF:
            # Bug fix: previously fell through and fabricated StrMatch("")
            # from the EOF token; return the recorded error node instead.
            return self.add_error(UnexpectedEndOfFileError(), False)

        if token.type == TokenKind.LPAR:
            self.nb_open_par += 1
            self.next_token()
            expression = self.parse_choice()
            token = self.get_token()
            if token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
            else:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.RPAR]))
            return expression

        if token.type == TokenKind.IDENTIFIER:
            self.next_token()
            return ConceptMatch(token.value)

        # Anything else is treated as a (possibly quoted) string literal.
        ret = StrMatch(core.utils.strip_quotes(token.value))
        self.next_token()
        return ret

    def parser_rule_name(self, next_to_parse):
        """Parse via `next_to_parse()`, then an optional '= name' naming the rule."""
        expression = next_to_parse()
        token = self.get_token()
        if token is None or token.type != TokenKind.EQUALS:
            return expression

        self.next_token()  # eat equals
        token = self.get_token()

        if token is None or token.type != TokenKind.IDENTIFIER:
            return self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", [TokenKind.IDENTIFIER]))

        expression.rule_name = token.value
        self.next_token()
        return expression