# Sheerka-Old/parsers/DefaultParser.py
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept, ParserResultConcept
from core.concept import ConceptParts
import core.builtin_helpers
import core.tokenizer  # imported explicitly: parse() references core.tokenizer.LexerError
import core.utils
from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
from core.tokenizer import Tokenizer, TokenKind, Token, Keywords
from dataclasses import dataclass, field
from parsers.BnfParser import BnfParser
from core.sheerka import ExecutionContext


@dataclass()
class DefaultParserNode(Node):
    """
    Base node for all default parser nodes
    """
    tokens: list = field(compare=False, repr=False)


@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    pass


@dataclass()
class UnexpectedTokenErrorNode(DefaultParserErrorNode):
    message: str
    expected_tokens: list


@dataclass()
class SyntaxErrorNode(DefaultParserErrorNode):
    """
    The input is recognized, but there is a syntax error
    """
    message: str


@dataclass()
class CannotHandleErrorNode(DefaultParserErrorNode):
    """
    The input is not recognized
    """
    text: str


@dataclass()
class NameNode(DefaultParserNode):
    def get_name(self):
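        """
        Join the token values into one space-separated name, unquoting STRING
        tokens: e.g. the tokens of the (hypothetical) input `add "one"` yield
        the name 'add one'.
        """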
        name = ""
        first = True
        for token in self.tokens:
            if token.type == TokenKind.EOF:
                break
            if token.type == TokenKind.WHITESPACE:
                continue
            if not first:
                name += " "
            name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
            first = False
        return name

    def __repr__(self):
        return self.get_name()

    def __eq__(self, other):
        if not isinstance(other, NameNode):
            return False
        return self.get_name() == other.get_name()

    def __hash__(self):
        return hash(self.get_name())


@dataclass()
class DefConceptNode(DefaultParserNode):
    name: NameNode = NotInitializedNode()
    where: ReturnValueConcept = NotInitializedNode()
    pre: ReturnValueConcept = NotInitializedNode()
    post: ReturnValueConcept = NotInitializedNode()
    body: ReturnValueConcept = NotInitializedNode()
    definition: ReturnValueConcept = NotInitializedNode()

    def get_asts(self):
        asts = {}
        for part_key in ConceptParts:
            prop_value = getattr(self, part_key.value)
            if (isinstance(prop_value, ReturnValueConcept)
                    and isinstance(prop_value.body, ParserResultConcept)
                    and hasattr(prop_value.body.body, "ast_")):
                asts[part_key] = prop_value.body.body.ast_
        return asts


@dataclass()
class IsaConceptNode(DefaultParserNode):
    concept: NameNode = NotInitializedNode()
    set: NameNode = NotInitializedNode()


class DefaultParser(BaseParser):
    """
    Parse sheerka-specific grammar: 'def concept ...' and '... isa ...' statements
    """

    def __init__(self, **kwargs):
        BaseParser.__init__(self, "Default", 50)
        self.lexer_iter = None
        self._current = None
        self.context: ExecutionContext = None
        self.text = None
        self.sheerka = None

    @staticmethod
    def fix_indentation(tokens):
        """
        In the following example:

            def concept add one to a as:
                def func(x):
                    return x+1
                func(a)

        the indentation in front of 'def func(x):', 'return x+1' and 'func(a)'
        must be reduced, otherwise executing the body raises a Python syntax error.
        :param tokens: the tokens of a part, starting at the colon
        :return: the dedented body tokens, or an error node
        """
        if tokens[0].type != TokenKind.COLON:
            return tokens
        if len(tokens) < 3:
            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
        if tokens[1].type != TokenKind.NEWLINE:
            return UnexpectedTokenErrorNode([tokens[1]], "Unexpected token after colon", [TokenKind.NEWLINE])
        if tokens[2].type != TokenKind.WHITESPACE:
            return SyntaxErrorNode([tokens[2]], "Indentation not found.")
        indent_size = len(tokens[2].value)
        # now fix the other indentations
        i = 3
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
                    return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
                if len(tokens[i + 1].value) < indent_size:
                    return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
            i += 1
        return tokens[3:]

    def reset_parser(self, context, text):
        self.context = context
        self.sheerka = context.sheerka
        self.text = text
        self.lexer_iter = iter(Tokenizer(text))
        self._current = None
        self.next_token()

    def add_error(self, error, next_token=True):
        self.has_error = True
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        return self._current

    def next_token(self, skip_whitespace=True):
        try:
            self._current = next(self.lexer_iter)
            if skip_whitespace:
                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
                    self._current = next(self.lexer_iter)
        except StopIteration:
            self._current = None

    def parse(self, context, text):
        # the default parser can only handle string input
        if not isinstance(text, str):
            ret = context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=text))
            self.log_result(context, text, ret)
            return ret
        tree = None
        try:
            self.reset_parser(context, text)
            tree = self.parse_statement()
        except core.tokenizer.LexerError as e:
            self.add_error(e, False)
        # if an error is found it must be sent to error_sink;
        # tree must contain what was recognized
        if self.has_error and isinstance(self.error_sink[0], CannotHandleErrorNode):
            body = self.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=self.error_sink)
        else:
            body = self.sheerka.new(
                BuiltinConcepts.PARSER_RESULT,
                parser=self,
                source=text,
                body=self.error_sink if self.has_error else tree,
                try_parsed=tree)
        ret = self.sheerka.ret(
            self.name,
            not self.has_error,
            body)
        self.log_result(context, text, ret)
        return ret

    def parse_statement(self):
        token = self.get_token()
        if token.value == Keywords.DEF:
            self.next_token()
            self.context.log(self.verbose_log, "Keyword DEF found.", self.name)
            return self.parse_def_concept(token)
        else:
            return self.parse_isa_concept()

    def parse_def_concept(self, def_token):
        """
        def concept name [from bnf xxx] [where xxx] [pre xxx] [post xxx] [as xxx]
        """
        # init
        keywords_tokens = [def_token]
        concept_found = DefConceptNode(keywords_tokens)
        # the definition of a concept consists of several parts:
        # Keywords.CONCEPT to get the name of the concept
        # Keywords.FROM [Keywords.BNF] to get the definition of the concept
        # Keywords.AS to get the body
        # Keywords.WHERE to get the conditions the variables must satisfy to be recognized
        # Keywords.PRE to get the conditions to check before the concept is evaluated
        # Keywords.POST to get the conditions to apply or verify once the concept is executed
        #
        # Regroup the tokens by parts
        first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
        if first_token.type == TokenKind.EOF:
            return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
        # get the name
        concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)
        # get the definition
        concept_found.definition = self.get_concept_definition(tokens_found_by_parts)
        # get the ASTs for the remaining parts
        asts_found_by_parts = self.get_concept_parts(tokens_found_by_parts)
        concept_found.where = asts_found_by_parts[Keywords.WHERE]
        concept_found.pre = asts_found_by_parts[Keywords.PRE]
        concept_found.post = asts_found_by_parts[Keywords.POST]
        concept_found.body = asts_found_by_parts[Keywords.AS]
        return concept_found

    def parse_isa_concept(self):
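        """
        Parse an 'isa' statement of the form <concept name> isa <set name>,
        e.g. (hypothetically) 'rex isa dog'.
        """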
        concept_name = self.parse_concept_name()
        if isinstance(concept_name, DefaultParserErrorNode):
            return concept_name
        keyword = []
        token = self.get_token()
        if token.value != Keywords.ISA:
            return self.add_error(CannotHandleErrorNode([token], ""))
        keyword.append(token)
        self.next_token()
        set_name = self.parse_concept_name()
        return IsaConceptNode(keyword, concept_name, set_name)

    def parse_concept_name(self):
        tokens = []
        token = self.get_token()
        while not (token.type == TokenKind.EOF or token.type == TokenKind.KEYWORD):
            tokens.append(token)
            self.next_token()
            token = self.get_token()
        if len(tokens) == 0:
            return self.add_error(UnexpectedTokenErrorNode([token], "Unexpected token", []))
        else:
            return NameNode(tokens)

    def regroup_tokens_by_parts(self, keywords_tokens):
        def_concept_parts = [Keywords.CONCEPT, Keywords.FROM, Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST]
        # tokens found while trying to recognize the parts
        tokens_found_by_parts = {
            Keywords.CONCEPT: [],
            Keywords.FROM: None,
            Keywords.AS: None,
            Keywords.WHERE: None,
            Keywords.PRE: None,
            Keywords.POST: None,
        }
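        # e.g. for the (hypothetical) input 'concept add one to a as: ...' this
        # yields {CONCEPT: [concept, add, one, to, a], AS: [as, colon, ...]},
        # with the unused parts (FROM, WHERE, PRE, POST) left as None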
        current_part = Keywords.CONCEPT
        token = self.get_token()
        first_token = token
        # loop through the tokens and put each one in the correct tokens_found_by_parts entry
        while token.type != TokenKind.EOF:
            if token.value in def_concept_parts:
                keywords_tokens.append(token)  # keep track of the keywords
                keyword = token.value
                if tokens_found_by_parts[keyword]:
                    # a part is defined more than once
                    # (next_token=False: the next_token() below already advances)
                    self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."), False)
                    tokens_found_by_parts[current_part].append(token)  # keep the duplicate token with the current part
                else:
                    tokens_found_by_parts[keyword] = [token]
                    current_part = keyword
                self.next_token()
            else:
                tokens_found_by_parts[current_part].append(token)
                self.next_token(False)
            token = self.get_token()
        return first_token, tokens_found_by_parts

    def get_concept_name(self, first_token, tokens_found_by_parts):
        name_first_token_index = 1
        token = self.get_token()
        if first_token.value != Keywords.CONCEPT:
            self.add_error(UnexpectedTokenErrorNode([token], "Syntax error.", [Keywords.CONCEPT]))
            name_first_token_index = 0
        name_tokens = tokens_found_by_parts[Keywords.CONCEPT]
        if len(name_tokens) == name_first_token_index:
            # return early: indexing into an empty name below would raise an IndexError
            return self.add_error(SyntaxErrorNode([], "Name is mandatory"), False)
        if name_tokens[-1].type == TokenKind.NEWLINE:
            name_tokens = name_tokens[:-1]  # strip the trailing newline
        if TokenKind.NEWLINE in [t.type for t in name_tokens]:
            self.add_error(SyntaxErrorNode(tokens_found_by_parts[Keywords.CONCEPT], "Newlines are not allowed in a name."))
        name_node = NameNode(name_tokens[name_first_token_index:])  # skip the 'concept' keyword token when present
        return name_node

    def get_concept_definition(self, tokens_found_by_parts):
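        """
        Parse the 'from' part. Only 'from bnf <grammar>' definitions are handled,
        through the BnfParser; anything else is left uninitialized.
        """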
        if tokens_found_by_parts[Keywords.FROM] is None:
            return NotInitializedNode()
        definition_tokens = tokens_found_by_parts[Keywords.FROM]
        # guard against an empty 'from' part before indexing into it
        if len(definition_tokens) < 2 or definition_tokens[1].value != Keywords.BNF:
            return NotInitializedNode()
        tokens = core.utils.strip_tokens(definition_tokens[2:])
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
            return NotInitializedNode()
        bnf_parser = BnfParser()
        with self.context.push(self.name) as sub_context:
            parsing_result = bnf_parser.parse(sub_context, tokens)
            sub_context.add_values(return_values=parsing_result)
            if not parsing_result.status:
                self.add_error(parsing_result.value)
                return NotInitializedNode()
            return parsing_result

    def get_concept_parts(self, tokens_found_by_parts):
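        """
        Parse the remaining parts (as/where/pre/post) by handing their tokens
        to the other registered parsers via sheerka.execute.
        """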
        asts_found_by_parts = {
            Keywords.AS: NotInitializedNode(),
            Keywords.WHERE: NotInitializedNode(),
            Keywords.PRE: NotInitializedNode(),
            Keywords.POST: NotInitializedNode(),
        }
        for keyword in tokens_found_by_parts:
            if keyword == Keywords.CONCEPT or keyword == Keywords.FROM:
                continue  # already done
            tokens = tokens_found_by_parts[keyword]
            if tokens is None:
                continue  # nothing to do
            if len(tokens) == 1:  # check for empty declarations
                self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
                continue
            tokens = self.fix_indentation(tokens[1:])  # manage multi-line declarations
            if isinstance(tokens, ErrorNode):
                self.add_error(tokens)
                continue
            # ask the other parsers if they recognize the tokens
            with self.context.push(self.name, desc=f"Parsing {keyword}") as sub_context:
                sub_context.log_new(self.verbose_log)
                to_parse = self.sheerka.ret(
                    sub_context.who,
                    True,
                    self.sheerka.new(BuiltinConcepts.USER_INPUT, body=tokens))
                steps = [BuiltinConcepts.PARSING]
                parsed = self.sheerka.execute(sub_context, to_parse, steps, self.verbose_log)
                parsing_result = core.builtin_helpers.expect_one(sub_context, parsed, self.verbose_log)
                sub_context.add_values(return_values=parsing_result)
                if not parsing_result.status:
                    self.add_error(parsing_result.value)
                    continue
                asts_found_by_parts[keyword] = parsing_result
        return asts_found_by_parts
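

# Typical use (sketch; the ExecutionContext / sheerka wiring is assumed to be
# provided by the host application):
#     parser = DefaultParser()
#     ret = parser.parse(context, "def concept add one to a as:\n    a + 1")
#     # ret.status is False when errors were collected in parser.error_sink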