from parsers.BaseParser import BaseParser, Node, NopNode, ErrorNode
from parsers.tokenizer import Tokenizer, TokenKind, Token, Keywords
from dataclasses import dataclass, field
import logging

log = logging.getLogger(__name__)


@dataclass()
class DefaultParserNode(Node):
    tokens: list = field(compare=False)

    def is_same(self, other):
        if type(self) != type(other):
            return False
        if hasattr(self, "value") and self.value != other.value:
            return False
        return True


@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    pass


@dataclass()
class UnexpectedTokenErrorNode(DefaultParserErrorNode):
    message: str
    expected_tokens: list

    def __post_init__(self):
        log.debug("UnexpectedToken : " + self.message)


@dataclass()
class SyntaxErrorNode(DefaultParserErrorNode):
    message: str

    def __post_init__(self):
        log.debug("SyntaxError : " + self.message)


@dataclass()
class DefConceptNode(DefaultParserNode):
    name: str
    where: Node = None
    pre: Node = None
    post: Node = None
    body: Node = field(default_factory=NopNode)  # a fresh NopNode instance, not the class itself

    def get_codes(self):
        codes = {}
        for prop in ["where", "pre", "post", "body"]:
            prop_value = getattr(self, prop)
            if hasattr(prop_value, "ast"):
                codes[prop] = prop_value.ast
        return codes


@dataclass()
class NumberNode(DefaultParserNode):
    value: object

    def __repr__(self):
        return str(self.value)


@dataclass()
class StringNode(DefaultParserNode):
    value: str
    quote: str

    def is_same(self, other):
        if not super().is_same(other):
            return False
        return self.quote == other.quote

    def __repr__(self):
        return self.quote + self.value + self.quote


@dataclass()
class VariableNode(DefaultParserNode):
    value: str

    def __repr__(self):
        return self.value


@dataclass()
class TrueNode(DefaultParserNode):
    def __repr__(self):
        return "true"


@dataclass()
class FalseNode(DefaultParserNode):
    def __repr__(self):
        return "false"


@dataclass()
class NullNode(DefaultParserNode):
    def __repr__(self):
        return "null"


@dataclass()
class BinaryNode(DefaultParserNode):
    operator: TokenKind
    left: Node
    right: Node

    def is_same(self, other):
        if not super().is_same(other):
            return False
        if self.operator != other.operator:
            return False
        if not self.left.is_same(other.left):
            return False
        return self.right.is_same(other.right)

    def __repr__(self):
        return f"({self.left} {self.operator} {self.right})"


class DefaultParser(BaseParser):
    def __init__(self, text, sub_parser):
        BaseParser.__init__(self, "Default", text)
        self.sub_parser = sub_parser
        self.lexer = Tokenizer(text)
        self.lexer_iter = iter(self.lexer)  # iterate the same tokenizer instead of building a second one
        self._current = None
        self.next_token()

    def collect_tokens(self, *args):
        result = []
        for item in args:
            if isinstance(item, Node):
                result.extend(item.tokens)
            else:
                result.append(item)
        return result

    def add_error(self, error, next_token=True):
        self.has_error = True
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        return self._current

    def next_token(self, skip_whitespace=True):
        try:
            self._current = next(self.lexer_iter)
            if skip_whitespace:
                while self._current.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                    self._current = next(self.lexer_iter)
        except StopIteration:
            self._current = None

    @staticmethod
    def get_concept_name(tokens, variables=None):
        name = ""
        first = True
        for token in tokens:
            if token.type == TokenKind.EOF:
                break
            if not first:
                name += " "
            if variables is not None and token.value in variables:
                name += "__var__" + str(variables.index(token.value))
            else:
                name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
            first = False
        return name
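    # Hedged example of get_concept_name (the token values are assumed, not
    # produced by the real Tokenizer): for tokens spelling `add "one" to a`
    # with variables=["a"], the result would be 'add one to __var__0', since
    # STRING tokens lose their quotes and known variables are index-mangled.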
    @staticmethod
    def fix_indentation(tokens):
        """
        In the following example

            def concept add one to a as:
                def func(x):
                    return x+1
                func(a)

        the indentation in front of 'def func(x):', 'return x+1' and 'func(a)'
        must be stripped to avoid a Python syntax error.

        :param tokens: the declaration tokens, starting with the part keyword
        :return: the dedented body tokens, or an ErrorNode
        """
        if tokens[1].type != TokenKind.COLON:
            return tokens[1:]
        if len(tokens) < 4:  # keyword, colon, newline and the first indent are all required
            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
        if tokens[2].type != TokenKind.NEWLINE:
            return UnexpectedTokenErrorNode([tokens[2]], "Unexpected token after colon", [TokenKind.NEWLINE])
        if tokens[3].type != TokenKind.WHITESPACE:
            return SyntaxErrorNode([tokens[3]], "Indentation not found")
        indent_size = len(tokens[3].value)
        # now fix the other indentations
        i = 4
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
                    return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
                if len(tokens[i + 1].value) < indent_size:
                    return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
            i += 1
        return tokens[4:]

    def parse(self):
        return self.parse_statement()

    def parse_statement(self):
        token = self.get_token()
        if token is not None and token.value == Keywords.DEF:
            self.next_token()
            return self.parse_def_concept()
        return self.parse_expression()

    def parse_def_concept(self):
        """
        def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
        """
        def_concept_parts = [Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST]
        tokens_found = {}  # Node.tokens is supposed to be a list, but here it will be a dict
        token = self.get_token()
        if token.value != Keywords.CONCEPT:
            return self.add_error(UnexpectedTokenErrorNode([token], "Syntax error.", [Keywords.CONCEPT]))
        self.next_token()
        token = self.get_token()
        if token.value in def_concept_parts:
            return self.add_error(UnexpectedTokenErrorNode([token], "Concept name is missing.", [""]))
        name_as_tokens = []
        while token.type != TokenKind.EOF and token.value not in def_concept_parts:
            name_as_tokens.append(token)
            self.next_token()
            token = self.get_token()
        name = self.get_concept_name(name_as_tokens)
        tokens_found["name"] = name_as_tokens

        # try to parse as, where, pre and post declarations
        tokens = {
            Keywords.AS: None,
            Keywords.WHERE: None,
            Keywords.PRE: None,
            Keywords.POST: None,
        }
        current_part = None
        while token.type != TokenKind.EOF:
            if token.value in def_concept_parts:
                keyword = token.value
                if tokens[keyword]:
                    return self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
                tokens[keyword] = [token]  # first element of the list is the keyword
                current_part = keyword
                self.next_token()
            else:
                if current_part is None:
                    return self.add_error(UnexpectedTokenErrorNode([token], "Unexpected token", def_concept_parts))
                tokens[current_part].append(token)
                self.next_token(False)
            token = self.get_token()
        for t in tokens:
            tokens_found[t.value] = tokens[t]

        asts = {
            Keywords.AS: NopNode(),
            Keywords.WHERE: NopNode(),
            Keywords.PRE: NopNode(),
            Keywords.POST: NopNode(),
        }
        # check for empty declarations
        for keyword in tokens:
            current_tokens = tokens[keyword]
            if current_tokens is not None:
                if len(current_tokens) == 1:  # only the keyword itself means an empty declaration
                    return self.add_error(SyntaxErrorNode([current_tokens[0]], "Empty declaration"), False)
                current_tokens = self.fix_indentation(current_tokens)
                if isinstance(current_tokens, ErrorNode):
                    self.add_error(current_tokens)
                    continue
                # start = current_tokens[0].index
                # end = current_tokens[-1].index + len(current_tokens[-1].value)
                sub_parser = self.sub_parser(current_tokens, source=keyword.value)
                sub_tree = sub_parser.parse()
                if isinstance(sub_tree, ErrorNode):
                    self.add_error(sub_tree, False)
                asts[keyword] = sub_tree

        def_concept_node = DefConceptNode(tokens_found,  # dict instead of list is wanted
                                          name,
                                          asts[Keywords.WHERE],
                                          asts[Keywords.PRE],
                                          asts[Keywords.POST],
                                          asts[Keywords.AS])
        log.debug(f"Found DefConcept node '{def_concept_node}'")
        return def_concept_node
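    # Hedged sketch of what parse_def_concept consumes (source text assumed):
    #
    #   def concept add one to a as:
    #       return a + 1
    #
    # The name tokens ('add one to a') stop at the first part keyword; the
    # 'as' body is dedented by fix_indentation and handed to self.sub_parser,
    # which is assumed to expose parse() and return a Node carrying an 'ast'
    # attribute that DefConceptNode.get_codes() collects later.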
    def parse_expression(self):
        return self.parse_addition()

    def parse_addition(self):
        left = self.parse_multiply()
        token = self.get_token()
        if token is None or token.type == TokenKind.EOF:
            return left
        if token.type == TokenKind.NUMBER:
            # example: '15 +5' or '15 -5' tokenizes as two NUMBERs with the
            # sign folded into the second literal, so treat it as an addition
            right = self.parse_addition()
            # 'token' is already the first token of 'right', so it is not
            # collected a second time
            return BinaryNode(self.collect_tokens(left, right), TokenKind.PLUS, left, right)
        if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
            return left
        self.next_token()
        right = self.parse_addition()
        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)

    def parse_multiply(self):
        left = self.parse_atom()
        token = self.get_token()
        if token is None or token.type == TokenKind.EOF:
            return left
        if token.type not in (TokenKind.STAR, TokenKind.SLASH):
            return left
        self.next_token()
        right = self.parse_multiply()
        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)

    def parse_atom(self):
        token = self.get_token()
        if token is None:
            return self.add_error(SyntaxErrorNode([], "Unexpected end of input"), False)
        if token.type == TokenKind.NUMBER:
            self.next_token()
            return NumberNode([token], float(token.value) if '.' in token.value else int(token.value))
        elif token.type == TokenKind.STRING:
            self.next_token()
            return StringNode([token], token.value[1:-1], token.value[0])
        elif token.type == TokenKind.IDENTIFIER:
            if token.value == "true":
                self.next_token()
                return TrueNode([token])
            elif token.value == "false":
                self.next_token()
                return FalseNode([token])
            elif token.value == "null":
                self.next_token()
                return NullNode([token])
            else:
                self.next_token()
                return VariableNode([token], token.value)
        elif token.type == TokenKind.LPAR:
            self.next_token()
            exp = self.parse_expression()
            token = self.get_token()
            self.next_token()
            if token is None or token.type != TokenKind.RPAR:
                error = UnexpectedTokenErrorNode([token] if token else [],
                                                 "Right parenthesis not found.", [TokenKind.RPAR])
                self.add_error(error)
                return error
            return exp
        else:
            error = UnexpectedTokenErrorNode([token], "Unexpected token",
                                             [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER,
                                              "true", "false", "null", TokenKind.LPAR])
            return self.add_error(error)
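
# Minimal usage sketch, assuming the sibling Tokenizer/BaseParser modules work
# as imported above; nothing below runs when the module is imported.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # No 'def concept' appears in the input, so sub_parser is never invoked
    # and passing None is safe for this expression-only demo.
    tree = DefaultParser("1 + 2 * 3", sub_parser=None).parse()
    # Expected shape: BinaryNode(+, 1, BinaryNode(*, 2, 3)); the exact text
    # printed depends on how TokenKind members format themselves.
    print(tree)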