Files
Sheerka-Old/parsers/DefaultParser.py
T
2019-10-29 18:39:51 +01:00

384 lines
12 KiB
Python

from parsers.BaseParser import BaseParser, Node, NopNode, ErrorNode
from parsers.tokenizer import Tokenizer, TokenKind, Token, Keywords
from dataclasses import dataclass, field
@dataclass()
class DefaultParserNode(Node):
    """Base class for all nodes produced by DefaultParser.

    Carries the source tokens the node was built from; `tokens` is excluded
    from dataclass equality so two nodes from different positions can compare
    equal.
    """
    tokens: list = field(compare=False)

    def is_same(self, other):
        """Structural comparison: same node class and, when the node has a
        `value` attribute, the same value."""
        if type(other) is not type(self):
            return False
        try:
            mine = self.value
        except AttributeError:
            # node kind has no payload; type match is enough
            return True
        return mine == other.value
@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    """Marker base class for all error nodes emitted by DefaultParser;
    combines the token-carrying parser node with the generic ErrorNode."""
    pass
@dataclass()
class UnexpectedTokenErrorNode(DefaultParserErrorNode):
    """Error node raised when the parser sees a token it cannot use at the
    current position."""
    message: str  # human-readable description of the problem
    expected_tokens: list  # TokenKind values / keyword strings that would have been valid here
@dataclass()
class SyntaxErrorNode(DefaultParserErrorNode):
    """Error node for structural problems (bad indentation, duplicate or
    empty declarations) where no specific expected token can be named."""
    message: str  # human-readable description of the problem
    # (removed a redundant `pass` that followed the field)
@dataclass()
class DefConceptNode(DefaultParserNode):
    """AST node for a `def concept` statement, holding the concept name and
    the parsed where/pre/post/body sections."""
    name: str
    where: Node = None
    pre: Node = None
    post: Node = None
    # BUG FIX: the default was the NopNode *class* (`body: Node = NopNode`),
    # not an instance; parse_def_concept uses NopNode() instances for empty
    # sections, so use a default_factory to match.
    body: Node = field(default_factory=NopNode)

    def get_codes(self):
        """Return a dict {section name: ast} for every section whose node
        exposes an `ast` attribute (sections parsed by the sub parser)."""
        codes = {}
        for prop in ("where", "pre", "post", "body"):
            prop_value = getattr(self, prop)
            if hasattr(prop_value, "ast"):
                codes[prop] = prop_value.ast
        return codes
@dataclass()
class NumberNode(DefaultParserNode):
    """Numeric literal; `value` is an int or a float."""
    value: object

    def __repr__(self):
        return f"{self.value}"
@dataclass()
class StringNode(DefaultParserNode):
    """String literal; remembers which quote character delimited it."""
    value: str
    quote: str

    def is_same(self, other):
        """Same as the base check, plus the quote character must match."""
        return super().is_same(other) and self.quote == other.quote

    def __repr__(self):
        return f"{self.quote}{self.value}{self.quote}"
@dataclass()
class VariableNode(DefaultParserNode):
    """Bare identifier reference; `value` is the identifier text."""
    value: str

    def __repr__(self):
        return f"{self.value}"
@dataclass()
class TrueNode(DefaultParserNode):
    """Literal `true`. (Removed a redundant `pass`: the class already has a
    method body.)"""

    def __repr__(self):
        return "true"
@dataclass()
class FalseNode(DefaultParserNode):
    """Literal `false`. (Removed a redundant `pass`: the class already has a
    method body.)"""

    def __repr__(self):
        return "false"
@dataclass()
class NullNode(DefaultParserNode):
    """Literal `null`. (Removed a redundant `pass`: the class already has a
    method body.)"""

    def __repr__(self):
        return "null"
@dataclass()
class BinaryNode(DefaultParserNode):
    """Binary operation node: `left <operator> right`."""
    operator: TokenKind
    left: Node
    right: Node

    def is_same(self, other):
        """Deep structural comparison: operator plus both operand subtrees."""
        return (
            super().is_same(other)
            and self.operator == other.operator
            and self.left.is_same(other.left)
            and self.right.is_same(other.right)
        )

    def __repr__(self):
        return f"({self.left} {self.operator} {self.right})"
class DefaultParser(BaseParser):
    """Recursive-descent parser for the default surface language:
    `def concept ...` statements and simple arithmetic expressions.
    Section bodies of a concept are delegated to `sub_parser`."""

    def __init__(self, text, sub_parser):
        """
        :param text: source text to parse
        :param sub_parser: callable(tokens, source=...) returning a parser
                           used for the where/pre/post/as section bodies
        """
        BaseParser.__init__(self, "Default", text)
        self.sub_parser = sub_parser
        self.lexer = Tokenizer(text)
        # BUG FIX: previously a *second* Tokenizer(text) was built here;
        # iterate the one we just created instead.
        self.lexer_iter = iter(self.lexer)
        self._current = None
        self.next_token()  # prime the one-token lookahead
def collect_tokens(self, *args):
result = []
for item in args:
if isinstance(item, Node):
result.extend(item.tokens)
else:
result.append(item)
return result
def add_error(self, error, next_token=True):
self.has_error = True
self.error_sink.append(error)
if next_token:
self.next_token()
return error
    def get_token(self) -> Token:
        """Return the current lookahead token (None once the stream is exhausted)."""
        return self._current
def next_token(self, skip_whitespace=True):
try:
self._current = next(self.lexer_iter)
if skip_whitespace:
while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
self._current = next(self.lexer_iter)
except StopIteration:
self._current = None
return
@staticmethod
def get_concept_name(tokens):
name = ""
first = True
for token in tokens:
if token.type == TokenKind.EOF:
break
if not first:
name += " "
name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
first = False
return name
@staticmethod
def fix_indentation(tokens):
"""
In the following example
def concept add one to a as:
def func(x):
return x+1
func(a)
indentations in front of 'def func(x)', 'return x+1' and 'func(a)' must be fixed to avoid a python syntax error
:param tokens:
:return:
"""
if tokens[1].type != TokenKind.COLON:
return tokens[1:]
if len(tokens) < 3:
return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
if tokens[2].type != TokenKind.NEWLINE:
return UnexpectedTokenErrorNode([tokens[2]], "Unexpected token after colon", [TokenKind.NEWLINE])
if tokens[3].type != TokenKind.WHITESPACE:
return SyntaxErrorNode([tokens[3]], "Indentation not found")
indent_size = len(tokens[3].value)
# now fix the other indentations
i = 4
while i < len(tokens) - 1:
if tokens[i].type == TokenKind.NEWLINE:
if tokens[i + 1].type != TokenKind.WHITESPACE:
return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
if len(tokens[i + 1].value) < indent_size:
return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
i += 1
return tokens[4:]
    def parse(self):
        """Entry point: parse one statement from the token stream."""
        return self.parse_statement()
def parse_statement(self):
token = self.get_token()
if token.value == Keywords.DEF:
self.next_token()
return self.parse_def_concept()
else:
return self.parse_expression()
def parse_def_concept(self):
"""
def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
"""
def_concept_parts = [Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST]
token = self.get_token()
if token.value != Keywords.CONCEPT:
return self.add_error(UnexpectedTokenErrorNode([token], "Syntax error.", [Keywords.CONCEPT]))
self.next_token()
token = self.get_token()
if token.value in (Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST):
return self.add_error(UnexpectedTokenErrorNode([token], "Concept name is missing.", ["<name>"]))
name_as_tokens = []
while token.type != TokenKind.EOF and token.value not in def_concept_parts:
name_as_tokens.append(token)
self.next_token()
token = self.get_token()
name = self.get_concept_name(name_as_tokens)
# try to parse as, where, pre and post declarations
tokens = {
Keywords.AS: None,
Keywords.WHERE: None,
Keywords.PRE: None,
Keywords.POST: None,
}
current_part = None
while token.type != TokenKind.EOF:
if token.value in def_concept_parts:
keyword = token.value
if tokens[keyword]:
return self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
tokens[keyword] = [token] # first element of the list is the keyword
current_part = keyword
self.next_token()
else:
if current_part is None:
return self.add_error(UnexpectedTokenErrorNode([token], "Unexpected token", def_concept_parts))
else:
tokens[current_part].append(token)
self.next_token(False)
token = self.get_token()
asts = {
Keywords.AS: NopNode(),
Keywords.WHERE: NopNode(),
Keywords.PRE: NopNode(),
Keywords.POST: NopNode(),
}
# check for empty declarations
for keyword in tokens:
current_tokens = tokens[keyword]
if current_tokens is not None:
if len(current_tokens) == 0: # only one element means empty decl
return self.add_error(SyntaxErrorNode([current_tokens[0]], "Empty declaration"), False)
else:
current_tokens = self.fix_indentation(current_tokens)
if isinstance(current_tokens, ErrorNode):
self.add_error(current_tokens)
continue
# start = current_tokens[0].index
# end = current_tokens[-1].index + len(current_tokens[-1].value)
sub_parser = self.sub_parser(current_tokens, source=keyword.value)
sub_tree = sub_parser.parse()
if isinstance(sub_tree, ErrorNode):
self.add_error(sub_tree, False)
asts[keyword] = sub_tree
return DefConceptNode([], name,
asts[Keywords.WHERE],
asts[Keywords.PRE],
asts[Keywords.POST],
asts[Keywords.AS])
    def parse_expression(self):
        """Expression := addition (lowest-precedence level of the grammar)."""
        return self.parse_addition()
def parse_addition(self):
left = self.parse_multiply()
token = self.get_token()
if token is None or token.type == TokenKind.EOF:
return left
if token.type == TokenKind.NUMBER: # example 15 +5 or 15 -5
right = self.parse_addition()
return BinaryNode(self.collect_tokens(left, token, right), TokenKind.PLUS, left, right)
if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
return left
self.next_token()
right = self.parse_addition()
return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
def parse_multiply(self):
left = self.parse_atom()
token = self.get_token()
if token is None or token.type == TokenKind.EOF:
return left
if token.type not in (TokenKind.STAR, TokenKind.SLASH):
return left
self.next_token()
right = self.parse_multiply()
return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
def parse_atom(self):
token = self.get_token()
if token.type == TokenKind.NUMBER:
self.next_token()
return NumberNode([token], float(token.value) if '.' in token.value else int(token.value))
elif token.type == TokenKind.STRING:
self.next_token()
return StringNode([token], token.value[1:-1], token.value[0])
elif token.type == TokenKind.IDENTIFIER:
if token.value == "true":
self.next_token()
return TrueNode([token])
elif token.value == "false":
self.next_token()
return FalseNode([token])
elif token.value == "null":
self.next_token()
return NullNode([token])
else:
self.next_token()
return VariableNode([token], token.value)
elif token.type == TokenKind.LPAR:
self.next_token()
exp = self.parse_expression()
token = self.get_token()
self.next_token()
if token.type != TokenKind.RPAR:
error = UnexpectedTokenErrorNode([token], "Right parenthesis not found.", [TokenKind.RPAR])
self.add_error(error)
return error
return exp
else:
error = UnexpectedTokenErrorNode([token], "Unexpected token",
[TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER, "true", "false",
"null", TokenKind.LPAR])
return self.add_error(error)