# Sheerka-Old/src/parsers/BnfParser.py
# (287 lines, 9.5 KiB, Python — file-browser header captured during extraction)
from dataclasses import dataclass
import core.utils
from core.builtin_concepts import BuiltinConcepts
from core.sheerka.Sheerka import ExecutionContext
from core.tokenizer import Tokenizer, Token, TokenKind, LexerError
from parsers.BaseParser import BaseParser, ErrorNode, UnexpectedTokenErrorNode
from parsers.BnfNodeParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, ConceptExpression, \
StrMatch
@dataclass
class UnexpectedEndOfFileError(ErrorNode):
    """Error node reported when the BNF input ends before a complete
    expression has been parsed."""
class BnfParser(BaseParser):
    """
    Parser used to transform a literal into a ParsingExpression.

    example :
        a | b, c -> Sequence(OrderedChoice(a, b), c)

    '|' (pipe) is used for OrderedChoice
    ',' (comma) is used for Sequence
    '?' (question mark) is used for Optional
    '*' (star) is used for ZeroOrMore
    '+' (plus) is used for OneOrMore
    """

    def __init__(self, **kwargs):
        # "Bnf" name, priority 50, last flag False — semantics defined by BaseParser.
        super().__init__("Bnf", 50, False)
        self.lexer_iter = None      # token iterator over the current input
        self._current = None        # token currently under the cursor
        self.after_current = None   # single-token lookahead buffer
        self.nb_open_par = 0        # nesting depth of currently open '(' groups
        self.context = None         # ExecutionContext of the parse in progress
        self.source = ""            # raw text consumed so far
        self.sheerka = None         # shortcut to context.sheerka

    def __eq__(self, other):
        # All BnfParser instances are interchangeable: equality is by type only.
        return isinstance(other, BnfParser)

    def reset_parser(self, context, text):
        """Prepare the parser for a fresh run over *text* within *context*.

        *text* may be a raw string (tokenized here) or an already-built
        token iterable.
        """
        self.context = context
        self.sheerka = context.sheerka
        self.lexer_iter = iter(Tokenizer(text.strip())) if isinstance(text, str) else iter(text)
        self._current = None
        self.after_current = None
        self.nb_open_par = 0
        self.next_token()
        self.eat_white_space()

    def add_error(self, error, next_token=True):
        """Record *error* in the sink, optionally advancing past the offending token.

        Returns *error* so callers can ``return self.add_error(...)``.
        """
        # NOTE(review): error_sink is presumably provided by BaseParser — confirm.
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        """Return the token under the cursor without consuming it."""
        return self._current

    def next_token(self, skip_whitespace=False):
        """Advance the cursor by one token, draining the lookahead buffer first.

        On exhaustion the cursor is pinned to a synthetic EOF token; once at
        EOF further calls are no-ops.
        """
        if self._current and self._current.type == TokenKind.EOF:
            return
        try:
            self._current = self.after_current or next(self.lexer_iter)
            self.source += self._current.str_value
            self.after_current = None
            if skip_whitespace:
                while self._current.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    self._current = next(self.lexer_iter)
                    self.source += self._current.str_value
        except StopIteration:
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def next_after(self):
        """Peek one token past the cursor, caching it in the lookahead buffer.

        Does not append to ``self.source``: the peeked token is only counted
        once, when next_token()/eat_white_space() actually consumes it.
        """
        if self.after_current is not None:
            return self.after_current
        try:
            self.after_current = next(self.lexer_iter)
            return self.after_current
        except StopIteration:
            self.after_current = Token(TokenKind.EOF, "", -1, -1, -1)
            return self.after_current

    def eat_white_space(self):
        """Skip whitespace/newline tokens so the cursor rests on a useful token."""
        if self.after_current is not None:
            # A pending lookahead can only exist because the current token is
            # whitespace (see maybe_sequence), so promote it to current.
            self._current = self.after_current
            self.source += self._current.str_value
            self.after_current = None
        try:
            while self._current.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                self._current = next(self.lexer_iter)
                self.source += self._current.str_value
        except StopIteration:
            # Fixed: this used to set self._current = None, which made
            # get_token() return None and crashed None-unaware callers
            # (parse_modifier, parse_expression) with an AttributeError when
            # the input ended in whitespace. Use the same synthetic EOF token
            # next_token() produces so end-of-input handling is uniform.
            self._current = Token(TokenKind.EOF, "", -1, -1, -1)

    def maybe_sequence(self, first, second):
        """True if the cursor is on *second*, or on *first* immediately followed by *second*."""
        token = self.get_token()
        # Parenthesized for clarity; same precedence as the original expression.
        return token.type == second or (token.type == first and self.next_after().type == second)

    def parse(self, context: ExecutionContext, parser_input):
        """Parse *parser_input* into a ParsingExpression tree.

        Returns a sheerka return value wrapping the tree; failure is reported
        through the parser's error sink (``has_error``).
        """
        tree = None
        try:
            self.reset_parser(context, parser_input)
            tree = self.parse_choice()
            token = self.get_token()
            if token and token.type != TokenKind.EOF:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, []))
        except LexerError as e:
            self.add_error(e, False)
        value = self.get_return_value_body(context.sheerka, self.source, tree, tree)
        return self.sheerka.ret(
            self.name,
            not self.has_error,
            value)

    def parse_choice(self):
        """
        a | b | c
        :return: the parsed expression (an OrderedChoice when '|' is present)
        """
        sequence = self.parse_sequence()
        self.eat_white_space()
        token = self.get_token()
        if token is None or token.type != TokenKind.VBAR:
            return sequence
        elements = [sequence]
        while True:
            # maybe eat the vertical bar
            self.eat_white_space()
            token = self.get_token()
            if token is None or token.type != TokenKind.VBAR:
                break
            self.next_token(skip_whitespace=True)
            elements.append(self.parse_sequence())
        return self.eat_rule_name_if_needed(OrderedChoice(*elements))

    def _at_sequence_end(self):
        # A sequence stops at EOF, at '=' (rule-name suffix), before a choice
        # separator '|', or before a ')' that closes an open group.
        token = self.get_token()
        return token is None or \
            token.type == TokenKind.EOF or \
            token.type == TokenKind.EQUALS or \
            self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.VBAR) or \
            (self.nb_open_par > 0 and self.maybe_sequence(TokenKind.WHITESPACE, TokenKind.RPAR))

    def parse_sequence(self):
        """
        a b c
        :return: the parsed expression (a Sequence when several elements follow)
        """
        expr_and_modifier = self.parse_modifier()
        if self._at_sequence_end():
            return expr_and_modifier
        elements = [expr_and_modifier]
        while not self._at_sequence_end():
            self.eat_white_space()
            elements.append(self.parse_modifier())
        return self.eat_rule_name_if_needed(Sequence(*elements))

    def parse_modifier(self):
        """
        a? | a* | a+
        :return: the expression, wrapped when a postfix modifier is present
        """
        expression = self.parse_expression()
        token = self.get_token()
        if token.type == TokenKind.QMARK:
            self.next_token()
            return self.eat_rule_name_if_needed(Optional(expression))
        if token.type == TokenKind.STAR:
            self.next_token()
            return self.eat_rule_name_if_needed(ZeroOrMore(expression))
        if token.type == TokenKind.PLUS:
            self.next_token()
            return self.eat_rule_name_if_needed(OneOrMore(expression))
        return expression

    def parse_expression(self):
        """Parse an atomic expression: a parenthesized group, a concept, an
        identifier/keyword, or a string literal match."""
        token = self.get_token()
        if token.type == TokenKind.EOF:
            # Fixed: this used to record the error and then FALL THROUGH,
            # building a StrMatch out of the EOF token and advancing past it.
            # Bail out immediately; the error is already in the sink.
            self.add_error(UnexpectedEndOfFileError(), False)
            return None
        if token.type == TokenKind.LPAR:
            self.nb_open_par += 1
            self.next_token()
            expr = self.parse_choice()
            token = self.get_token()
            if token.type == TokenKind.RPAR:
                self.nb_open_par -= 1
                self.next_token()
                return self.eat_rule_name_if_needed(expr)
            else:
                self.add_error(UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, [TokenKind.RPAR]))
                return expr
        if token.type == TokenKind.CONCEPT:
            self.next_token()
            concept = self.sheerka.new((token.value[0], token.value[1]))
            expr = ConceptExpression(concept)
            return self.eat_rule_name_if_needed(expr)
        if token.type in (TokenKind.IDENTIFIER, TokenKind.KEYWORD):
            self.next_token()
            concept_name = token.str_value
            # we are trying to match against a concept which is still under
            # construction! (for example a recursive bnf definition)
            if self.context.obj and hasattr(self.context.obj, "name"):
                if concept_name == str(self.context.obj.name):
                    return self.eat_rule_name_if_needed(ConceptExpression(concept_name))
            concept = self.context.get_concept(concept_name)
            if not self.sheerka.is_known(concept):
                self.add_error(concept)
                return None
            elif hasattr(concept, "__iter__"):
                # get_concept returned a set of candidates: ambiguous name.
                self.add_error(
                    self.sheerka.new(BuiltinConcepts.CANNOT_RESOLVE_CONCEPT,
                                     body=("key", concept_name)))
                return None
            else:
                expr = ConceptExpression(concept)
                expr.rule_name = concept.name
                return self.eat_rule_name_if_needed(expr)
        # Anything else is treated as a literal string match.
        ret = StrMatch(core.utils.strip_quotes(token.value))
        self.next_token()
        return self.eat_rule_name_if_needed(ret)

    def eat_rule_name_if_needed(self, expression):
        """Consume an optional ``= name`` suffix, tagging *expression* with it."""
        token = self.get_token()
        if token is None or token.type != TokenKind.EQUALS:
            return expression
        self.next_token()  # eat equals
        token = self.get_token()
        if token is None or token.type != TokenKind.IDENTIFIER:
            return self.add_error(
                UnexpectedTokenErrorNode(f"Unexpected token '{token}'", token, [TokenKind.IDENTIFIER]))
        expression.rule_name = token.value
        self.next_token()
        return expression