# Sheerka-Old/parsers/DefaultParser.py
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept
from core.concept import ConceptParts
from parsers.BaseParser import BaseParser, Node, NopNode, ErrorNode, NotInitializedNode
from core.tokenizer import Tokenizer, TokenKind, Token, Keywords
from dataclasses import dataclass, field
import logging

log = logging.getLogger(__name__)


@dataclass()
class DefaultParserNode(Node):
    """
    Base node for all default parser nodes
    """
    tokens: list = field(compare=False, repr=False)


@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    pass


@dataclass()
class UnexpectedTokenErrorNode(DefaultParserErrorNode):
    message: str
    expected_tokens: list

    # def __post_init__(self):
    #     log.debug("-> UnexpectedTokenErrorNode: " + self.message)


@dataclass()
class SyntaxErrorNode(DefaultParserErrorNode):
    """
    The input is recognized, but there is a syntax error
    """
    message: str

    # def __post_init__(self):
    #     log.debug("-> SyntaxErrorNode: " + self.message)


@dataclass()
class CannotHandleErrorNode(DefaultParserErrorNode):
    """
    The input is not recognized
    """
    text: str

    # def __post_init__(self):
    #     log.debug("-> CannotHandleErrorNode: " + self.text)


#
# @dataclass()
# class NumberNode(DefaultParserNode):
#     value: object
#
#     def __repr__(self):
#         return str(self.value)
#
#
# @dataclass()
# class StringNode(DefaultParserNode):
#     value: str
#     quote: str
#
#     def is_same(self, other):
#         if not super(StringNode, self).is_same(other):
#             return False
#         return self.quote == other.quote
#
#     def __repr__(self):
#         return self.quote + self.value + self.quote
#
#
# @dataclass()
# class VariableNode(DefaultParserNode):
#     value: str
#
#     def __repr__(self):
#         return self.value
#
#
# @dataclass()
# class TrueNode(DefaultParserNode):
#     pass
#
#     def __repr__(self):
#         return "true"
#
#
# @dataclass()
# class FalseNode(DefaultParserNode):
#     pass
#
#     def __repr__(self):
#         return "false"
#
#
# @dataclass()
# class NullNode(DefaultParserNode):
#     pass
#
#     def __repr__(self):
#         return "null"
#
#
# @dataclass()
# class BinaryNode(DefaultParserNode):
#     operator: TokenKind
#     left: Node
#     right: Node
#
#     def is_same(self, other):
#         if not super(BinaryNode, self).is_same(other):
#             return False
#         if self.operator != other.operator:
#             return False
#         if not self.left.is_same(other.left):
#             return False
#         return self.right.is_same(other.right)
#
#     def __repr__(self):
#         return f"({self.left} {self.operator} {self.right})"
#


def get_concept_key(tokens, variables=None):
    """
    Build a normalized lookup key from a token stream: EOF stops the scan,
    whitespace is skipped, string tokens are unquoted, and known variable
    names are replaced by positional "__var__N" placeholders.
    """
    key = ""
    first = True
    for token in tokens:
        if token.type == TokenKind.EOF:
            break
        if token.type == TokenKind.WHITESPACE:
            continue
        if not first:
            key += " "
        if variables is not None and token.value in variables:
            key += "__var__" + str(variables.index(token.value))
        else:
            key += token.value[1:-1] if token.type == TokenKind.STRING else token.value
        first = False
    return key
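

# Illustrative sketch of get_concept_key's normalization (token text shown
# informally; the real Token objects come from core.tokenizer):
#   the tokens of `add one to a`, with variables=["a"], produce the key
#   "add one to __var__0"; a quoted STRING token contributes its unquoted text.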


@dataclass()
class NameNode(DefaultParserNode):
    def get_name(self):
        name = ""
        first = True
        for token in self.tokens:
            if token.type == TokenKind.EOF:
                break
            if token.type == TokenKind.WHITESPACE:
                continue
            if not first:
                name += " "
            name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
            first = False
        return name

    def __repr__(self):
        return self.get_name()

    def __eq__(self, other):
        if not isinstance(other, NameNode):
            return False
        return self.get_name() == other.get_name()

    def __hash__(self):
        return hash(self.get_name())
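

# Note on NameNode semantics (derived from get_name above): two names compare
# equal when their normalized text matches, regardless of whitespace or string
# quoting in the underlying tokens, e.g. the tokens of `add "one"` and of
# `add   one` yield equal NameNode instances.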


@dataclass()
class DefConceptNode(DefaultParserNode):
    # default_factory avoids sharing one NotInitializedNode instance across
    # every DefConceptNode (and the dataclass mutable-default restriction)
    name: NameNode = field(default_factory=NotInitializedNode)
    where: ReturnValueConcept = field(default_factory=NotInitializedNode)
    pre: ReturnValueConcept = field(default_factory=NotInitializedNode)
    post: ReturnValueConcept = field(default_factory=NotInitializedNode)
    body: ReturnValueConcept = field(default_factory=NotInitializedNode)

    def get_codes(self):
        codes = {}
        for part_key in ConceptParts:
            prop_value = getattr(self, part_key.value)
            if hasattr(prop_value, "ast_"):
                codes[part_key] = prop_value.ast_
        return codes
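

# Illustrative sketch of get_codes (assumes ConceptParts members name the
# attributes above, e.g. ConceptParts.BODY.value == "body", and that a parsed
# part exposes its AST as `ast_`):
#   node = DefConceptNode(tokens=[])
#   node.body = parsed_body    # any object carrying an ast_ attribute
#   node.get_codes()           # -> {ConceptParts.BODY: parsed_body.ast_}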


class DefaultParser(BaseParser):
    """
    Parse Sheerka-specific grammar (like 'def concept')
    """

    def __init__(self):
        BaseParser.__init__(self, "DefaultParser")
        self.lexer_iter = None
        self._current = None
        self.context = None
        self.text = None
        self.sheerka = None

    @staticmethod
    def fix_indentation(tokens):
        """
        In the following example

            def concept add one to a as:
                def func(x):
                    return x+1
                func(a)

        the indentation in front of 'def func(x):', 'return x+1' and 'func(a)'
        must be reduced by one level to avoid a Python syntax error.
        :param tokens:
        :return: the re-indented tokens (without the leading colon, newline
                 and first indentation), or an ErrorNode
        """
        if tokens[0].type != TokenKind.COLON:
            return tokens
        if len(tokens) < 3:
            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
        if tokens[1].type != TokenKind.NEWLINE:
            return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
        if tokens[2].type != TokenKind.WHITESPACE:
            return SyntaxErrorNode([tokens[2]], "Indentation not found.")
        indent_size = len(tokens[2].value)
        # now fix the other indentations
        i = 3
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
                    return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
                if len(tokens[i + 1].value) < indent_size:
                    return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
            i += 1
        return tokens[3:]
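
    # Worked example of fix_indentation (whitespace widths shown as WS(n)):
    #   input : COLON NEWLINE WS(8) 'def func(x):' NEWLINE WS(12) 'return x+1' NEWLINE WS(8) 'func(a)'
    #   output:                'def func(x):' NEWLINE WS(4)  'return x+1' NEWLINE WS(0) 'func(a)'
    # The first indentation (here 8) is taken as the base level and subtracted
    # from every following line; tokens[3:] drops the leading COLON, NEWLINE
    # and WS tokens.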

    def reset_parser(self, context, text):
        self.context = context
        self.sheerka = context.sheerka
        self.text = text
        self.lexer_iter = iter(Tokenizer(text))
        self._current = None
        self.next_token()

    def add_error(self, error, next_token=True):
        self.has_error = True
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        return self._current

    def next_token(self, skip_whitespace=True):
        try:
            self._current = next(self.lexer_iter)
            if skip_whitespace:
                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                    self._current = next(self.lexer_iter)
        except StopIteration:
            self._current = None

    def parse(self, context, text):
        # the default parser can only handle string input
        if not isinstance(text, str):
            log.debug(f"Failed to recognize '{text}'")
            return context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=text))
        self.reset_parser(context, text)
        tree = self.parse_statement()
        # if an error was found it has been sent to error_sink;
        # tree contains whatever was recognized
        ret = self.sheerka.ret(
            self.name,
            not self.has_error,
            self.sheerka.new(
                BuiltinConcepts.PARSER_RESULT,
                parser=self,
                source=text,
                body=self.error_sink if self.has_error else tree,
                try_parsed=tree))
        self.log_result(log, text, ret)
        return ret
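
    # Minimal usage sketch (the context/sheerka runtime objects come from the
    # host application and are assumed here, not defined in this file):
    #   parser = DefaultParser()
    #   ret = parser.parse(context, "def concept add one to a as: a + 1")
    #   # ret.status is False when errors were collected in error_sink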

    def parse_statement(self):
        token = self.get_token()
        if token.value == Keywords.DEF:
            self.next_token()
            return self.parse_def_concept(token)
        else:
            return self.add_error(CannotHandleErrorNode([], self.text))

    def parse_def_concept(self, def_token):
        """
        def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
        """
        # init
        log.debug("It may be a definition of a concept")
        concept_special_tokens = [def_token]
        concept_found = DefConceptNode(concept_special_tokens)
        # the definition of a concept consists of several parts:
        # Keywords.CONCEPT gives the name of the concept
        # Keywords.AS gives the body
        # Keywords.WHERE gives the conditions used to recognize the variables
        # Keywords.PRE gives the conditions to check before the concept is evaluated
        # Keywords.POST gives the conditions to apply or verify once the concept has executed
        def_concept_parts = [Keywords.CONCEPT, Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST]
        # tokens found while trying to recognize the parts
        tokens_found_by_parts = {
            Keywords.CONCEPT: [],
            Keywords.AS: None,
            Keywords.WHERE: None,
            Keywords.PRE: None,
            Keywords.POST: None,
        }
        current_part = Keywords.CONCEPT
        token = self.get_token()
        first_token = token
        # loop through the tokens and put each one in the matching tokens_found_by_parts entry
        while token.type != TokenKind.EOF:
            if token.value in def_concept_parts:
                concept_special_tokens.append(token)  # keep track of the keywords
                keyword = token.value
                if tokens_found_by_parts[keyword]:
                    # a part is defined more than once
                    self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
                    tokens_found_by_parts[current_part].append(token)  # add the token again
                else:
                    tokens_found_by_parts[keyword] = [token]
                    current_part = keyword
                self.next_token()
            else:
                tokens_found_by_parts[current_part].append(token)
                self.next_token(False)
            token = self.get_token()
        # semantic checks
        name_first_token_index = 1
        if first_token.value != Keywords.CONCEPT:
            self.add_error(UnexpectedTokenErrorNode([first_token], "Syntax error.", [Keywords.CONCEPT]))
            name_first_token_index = 0
        # manage the name
        name_tokens = tokens_found_by_parts[Keywords.CONCEPT]
        if len(name_tokens) == name_first_token_index:
            self.add_error(SyntaxErrorNode([], "Name is mandatory"))
        if name_tokens and name_tokens[-1].type == TokenKind.NEWLINE:
            name_tokens = name_tokens[:-1]  # strip the trailing newline
        if TokenKind.NEWLINE in [t.type for t in name_tokens]:
            self.add_error(SyntaxErrorNode(tokens_found_by_parts[Keywords.CONCEPT], "Newlines are not allowed in the name."))
        concept_found.name = NameNode(name_tokens[name_first_token_index:])  # skip the keyword token
        asts_found_by_parts = {
            Keywords.AS: NotInitializedNode(),
            Keywords.WHERE: NotInitializedNode(),
            Keywords.PRE: NotInitializedNode(),
            Keywords.POST: NotInitializedNode(),
        }
        for keyword in tokens_found_by_parts:
            if keyword == Keywords.CONCEPT:
                continue  # already done
            log.debug("Processing part '" + keyword.name + "'")
            tokens = tokens_found_by_parts[keyword]
            if tokens is None:
                continue  # nothing to do
            if len(tokens) == 1:  # check for empty declarations
                self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
                continue
            tokens = self.fix_indentation(tokens[1:])  # handle multi-line declarations
            if isinstance(tokens, ErrorNode):
                self.add_error(tokens)
                continue
            # ask the other parsers whether they recognize the tokens
            new_context = self.context.push(self)
            parsing_result = self.sheerka.expect_one(new_context, self.sheerka.parse(new_context, tokens))
            if not parsing_result.status:
                self.add_error(parsing_result.value)
                continue
            asts_found_by_parts[keyword] = parsing_result
        concept_found.where = asts_found_by_parts[Keywords.WHERE]
        concept_found.pre = asts_found_by_parts[Keywords.PRE]
        concept_found.post = asts_found_by_parts[Keywords.POST]
        concept_found.body = asts_found_by_parts[Keywords.AS]
        log.debug(f"Found DefConcept node '{concept_found}'")
        return concept_found
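
    # Example of input this method accepts (grammar from the docstring above;
    # the part contents are placeholders, each re-parsed by the other
    # registered parsers via sheerka.parse()):
    #   def concept add one to a
    #       where ...
    #       pre ...
    #       as: a + 1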

    # def parse_expression(self):
    #     return self.parse_addition()
    #
    # def parse_addition(self):
    #     left = self.parse_multiply()
    #     token = self.get_token()
    #     if token is None or token.type == TokenKind.EOF:
    #         return left
    #
    #     if token.type == TokenKind.NUMBER:  # example 15 +5 or 15 -5
    #         right = self.parse_addition()
    #         return BinaryNode(self.collect_tokens(left, token, right), TokenKind.PLUS, left, right)
    #
    #     if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
    #         return left
    #
    #     self.next_token()
    #     right = self.parse_addition()
    #     return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
    #
    # def parse_multiply(self):
    #     left = self.parse_atom()
    #     token = self.get_token()
    #     if token is None or token.type == TokenKind.EOF:
    #         return left
    #
    #     if token.type not in (TokenKind.STAR, TokenKind.SLASH):
    #         return left
    #
    #     self.next_token()
    #     right = self.parse_multiply()
    #     return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
    #
    # def parse_atom(self):
    #     token = self.get_token()
    #     if token.type == TokenKind.NUMBER:
    #         self.next_token()
    #         return NumberNode([token], float(token.value) if '.' in token.value else int(token.value))
    #     elif token.type == TokenKind.STRING:
    #         self.next_token()
    #         return StringNode([token], token.value[1:-1], token.value[0])
    #     elif token.type == TokenKind.IDENTIFIER:
    #         if token.value == "true":
    #             self.next_token()
    #             return TrueNode([token])
    #         elif token.value == "false":
    #             self.next_token()
    #             return FalseNode([token])
    #         elif token.value == "null":
    #             self.next_token()
    #             return NullNode([token])
    #         else:
    #             self.next_token()
    #             return VariableNode([token], token.value)
    #     elif token.type == TokenKind.LPAR:
    #         self.next_token()
    #         exp = self.parse_expression()
    #         token = self.get_token()
    #         self.next_token()
    #
    #         if token.type != TokenKind.RPAR:
    #             error = UnexpectedTokenErrorNode([token], "Right parenthesis not found.", [TokenKind.RPAR])
    #             self.add_error(error)
    #             return error
    #
    #         return exp
    #     else:
    #         error = UnexpectedTokenErrorNode([token], "Unexpected token",
    #                                          [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER,
    #                                           "true", "false", "null", TokenKind.LPAR])
    #         return self.add_error(error)