Refactored parsers: introduced BaseCustomGrammarParser and renamed DefaultParser to DefConceptParser.

This commit is contained in:
2020-10-02 04:45:47 +02:00
parent d100b7e8b3
commit e8f2705dcf
28 changed files with 1411 additions and 872 deletions
+250
View File
@@ -0,0 +1,250 @@
from dataclasses import dataclass, field
import core.utils
from core.tokenizer import Keywords, TokenKind, Tokenizer
from parsers.BaseParser import BaseParser, Node, ErrorNode, UnexpectedEofNode, UnexpectedTokenErrorNode
@dataclass()
class CustomGrammarParserNode(Node):
    """
    Base node for all custom grammar parser nodes.
    """
    # Raw tokens the node was built from; excluded from equality and repr
    # because two equivalent nodes may come from different token runs.
    tokens: list = field(compare=False, repr=False)
@dataclass()
class SyntaxErrorNode(CustomGrammarParserNode, ErrorNode):
    """
    The input is recognized, but there is a syntax error.

    Equality compares the message, and the tokens only when the other side
    carries some; the hash is message-only so equal nodes hash equal.
    """
    message: str

    def __eq__(self, other):
        # Identity short-circuit.
        if self is other:
            return True
        if not isinstance(other, SyntaxErrorNode):
            return False
        same_message = self.message == other.message
        # Tokens participate only when the other node carries some.
        tokens_match = other.tokens is None or self.tokens == other.tokens
        return same_message and tokens_match

    def __hash__(self):
        return hash(self.message)
@dataclass()
class KeywordNotFound(CustomGrammarParserNode, ErrorNode):
    """
    Error node reported when none of the expected keywords is found.
    """
    # Keyword string values that would have been valid at this position.
    keywords: list

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if not isinstance(other, KeywordNotFound):
            return False
        if self.keywords != other.keywords:
            return False
        # Tokens participate only when the other node carries some.
        if other.tokens is not None and self.tokens != other.tokens:
            return False
        return True

    def __hash__(self):
        # BUG FIX: ``keywords`` is a list, which is unhashable —
        # ``hash(self.keywords)`` raised TypeError. Hash an immutable copy.
        return hash(tuple(self.keywords))
class BaseCustomGrammarParser(BaseParser):
    """
    Base class for sheerka specific grammars.

    Provides the shared helpers used by concrete grammar parsers:
    * :meth:`get_body` reads an indented body introduced by a colon
    * :meth:`get_parts` splits the parser input into keyword-delimited parts
    """
    # Column width assigned to a tab character when measuring indentation.
    DEFAULT_TAB_SIZE = 4

    def __init__(self, name, priority: int, enabled=True):
        super().__init__(name, priority, enabled=enabled)

    @staticmethod
    def skip_white_spaces(tokens):
        """Return the index of the first non-whitespace token (``len(tokens)`` if none)."""
        i = 0
        while i < len(tokens) and tokens[i].type == TokenKind.WHITESPACE:
            i += 1
        return i

    def get_body(self, tokens):
        """
        Get the body of a keyword definition.
        It manages colon body, but the colon must be stripped first.
        :param tokens: tokens following the colon
        :return: the re-indented body tokens, or None on error (the error is
                 reported through ``add_error``)
        """
        def get_tab_size(default_tab_size, text):
            # BUG FIX: the previous test ``isinstance(c, str)`` is always true
            # for the characters of a string, so a tab was counted with a
            # width of 1 and ``default_tab_size`` was dead code. Count a tab
            # as ``default_tab_size`` columns, anything else as one.
            return sum(default_tab_size if c == '\t' else 1 for c in text)

        pos = self.skip_white_spaces(tokens)
        # A body needs at least NEWLINE + WHITESPACE + one real token.
        if len(tokens) - pos < 3:
            self.add_error(SyntaxErrorNode(tokens, "Body is empty or too short."))
            return None
        if tokens[pos].type != TokenKind.NEWLINE:
            self.add_error(UnexpectedTokenErrorNode("New line not found.", tokens[pos], [TokenKind.NEWLINE]))
            return None
        pos += 1
        if tokens[pos].type != TokenKind.WHITESPACE:
            self.add_error(UnexpectedTokenErrorNode("Indentation not found.", tokens[pos], [TokenKind.WHITESPACE]))
            return None
        # Indentation of the first body line defines the reference level.
        indent_size = get_tab_size(self.DEFAULT_TAB_SIZE, tokens[pos].value)
        pos += 1
        i = pos
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
                    self.add_error(UnexpectedTokenErrorNode("Indentation not found.",
                                                            tokens[i + 1],
                                                            [TokenKind.WHITESPACE]))
                    return None
                if get_tab_size(self.DEFAULT_TAB_SIZE, tokens[i + 1].value) < indent_size:
                    self.add_error(SyntaxErrorNode([tokens[i + 1]], "Invalid indentation."))
                    return None
                # Re-indent relative to the body; clone first because the
                # token may be shared with the parser input.
                tokens[i + 1] = tokens[i + 1].clone()
                tokens[i + 1].value = " " * (get_tab_size(self.DEFAULT_TAB_SIZE, tokens[i + 1].value) - indent_size)
            i += 1
        return tokens[pos:]

    def get_parts(self, keywords, expected_first_token=None):
        """
        Reads Parser Input and groups the tokens by keywords
        ex:
            tokens = Tokenizer("as a b c pre u v w where x y z")
            keywords = ["as", "pre", "where"]
            assert get_parts(keywords) == {
                Keyword("as"): [Token("a"), Token(<ws>), Token("b"), Token(<ws>), Token("c"), Token(<ws>)],
                Keyword("pre"): [Token("u"), Token(<ws>), Token("v"), Token(<ws>), Token("w"), Token(<ws>)],
                Keyword("where"): [Token("x"), Token(<ws>), Token("y"), Token(<ws>), Token("z"), Token(<ws>)]}
        * The order of appearance of the keywords is not important
          "as w pre y where z" and "where z pre y as w" will produce the same dictionary
        * I can use double quote to protect keyword
          where "x y" will produce the entry Keyword("where"): [Token("x"), Token(<ws>), Token("y"), Token(<ws>)]
          where 'x y' will produce the entry Keyword("where"): [Token("'x y'")]
        :param keywords: keyword string values that delimit the parts
        :param expected_first_token: it must be a Keyword; when given, the input must start with it
        :return: dict mapping Keywords to their token lists (keyword token first), or None on error
        """
        def new_part(t, cma, p):
            """
            Does token *t* open a new keyword part?
            :param t: token
            :param cma: colon_mode_activated
            :param p: previous token
            :return: True when *t* starts a new part
            """
            if t.value not in keywords:
                return False
            if not cma or not p:
                return True
            # In colon mode a keyword only counts when it starts a new line.
            return p.line != t.line

        if self.parser_input.token is None:
            self.add_error(KeywordNotFound([], keywords))
            return None
        if self.parser_input.token.type == TokenKind.WHITESPACE:
            self.parser_input.next_token()
        token = self.parser_input.token
        if expected_first_token and token.value != expected_first_token.value:
            self.add_error(UnexpectedTokenErrorNode(f"'{expected_first_token.value}' keyword not found.",
                                                    token,
                                                    [expected_first_token]))
            return None
        if token.value not in keywords:
            self.add_error(KeywordNotFound([token], keywords))
            return None
        colon_mode_activated = False  # if activated, use keyword + colon to start a new keyword definition
        previous_token = None
        res = {}
        # More explanations on colon_mode_activated
        # You can use the pattern
        #   def concept <name> as:
        #       <tab> xxx
        #       <tab> yyy
        #       ...
        #
        # It improves readability and allows usage of other keywords inside the block.
        # Example
        #   def concept give the date as:
        #       from datetime import date  # I can use the 'from' keyword !!!
        #       return date.today()
        #
        # Note that I can choose to use colon or not
        #
        #   def concept in x days as:
        #       from datetime import date
        #       return date.today() - x
        #   where x > 0
        #
        # is a valid declaration
        # loop thru the tokens, and put them in the correct parts entry
        while True:
            if new_part(token, colon_mode_activated, previous_token):
                keyword = Keywords(token.value)
                if keyword in res:
                    # a part is defined more than once
                    self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
                    break
                res[keyword] = [token]  # to keep track of when it starts
                colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
                if not self.parser_input.next_token():
                    self.add_error(UnexpectedEofNode(f"While parsing keyword '{keyword.value}'."))
                    break
            else:
                res[keyword].append(token)
                if not self.parser_input.next_token(skip_whitespace=False):
                    break
            previous_token = token
            token = self.parser_input.token
        # Post process the result if needed
        for k, v in res.items():
            stripped = core.utils.strip_tokens(v[1:])
            # manage colon first, to be sure that what is protected by the quotes remains protected
            if len(stripped) > 0 and stripped[0].type == TokenKind.COLON:
                body = self.get_body(stripped[1:])
                if body:
                    res[k] = v[0:1] + body
            # replace double quoted strings by their content
            elif len(stripped) == 1 and stripped[0].type == TokenKind.STRING and stripped[0].value[0] == '"':
                res[k] = v[0:1] + list(Tokenizer(stripped[0].strip_quote, yield_eof=False))
        return res
+2 -14
View File
@@ -6,8 +6,7 @@ from typing import Set
import core.utils
from core.builtin_concepts import BuiltinConcepts
from core.concept import VARIABLE_PREFIX, Concept, DEFINITION_TYPE_BNF, ConceptParts
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import TokenKind, LexerError, Token
from core.tokenizer import TokenKind, Token
from parsers.BaseParser import Node, BaseParser, ErrorNode
DEBUG_COMPILED = True
@@ -718,7 +717,7 @@ class BaseNodeParser(BaseParser):
"""
def __init__(self, name, priority, **kwargs):
super().__init__(name, priority)
super().__init__(name, priority, yield_eof=True)
if 'sheerka' in kwargs:
sheerka = kwargs.get("sheerka")
self.concepts_by_first_keyword = sheerka.resolved_concepts_by_first_keyword
@@ -745,17 +744,6 @@ class BaseNodeParser(BaseParser):
concepts_by_first_keyword = self.get_concepts_by_first_token(context, concepts).body
self.concepts_by_first_keyword = self.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword).body
def reset_parser(self, context, parser_input: ParserInput):
self.context = context
self.sheerka = context.sheerka
self.parser_input = parser_input
try:
self.parser_input.reset(False)
except LexerError as e:
self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
return False
return True
def get_concepts(self, token, to_keep, custom=None, to_map=None, strip_quotes=False):
"""
Tries to find if there are concepts that match the value of the token
+25 -29
View File
@@ -1,7 +1,7 @@
import logging
from dataclasses import dataclass
from typing import Union
import core.utils
from core.builtin_concepts import BuiltinConcepts, ParserResultConcept
from core.concept import Concept
from core.sheerka.ExecutionContext import ExecutionContext
@@ -57,7 +57,7 @@ class ErrorNode(Node):
@dataclass()
class UnexpectedTokenErrorNode(ErrorNode):
message: str
token: Token
token: Union[Token, str]
expected_tokens: list
def __eq__(self, other):
@@ -70,31 +70,25 @@ class UnexpectedTokenErrorNode(ErrorNode):
if self.message != other.message:
return False
if self.token.type != other.token.type or self.token.value != other.token.value:
to_compare = self.token.repr_value if isinstance(other.token, str) else self.token
if to_compare != other.token:
return False
if len(self.expected_tokens) != len(other.expected_tokens):
return False
for i, t in enumerate(self.expected_tokens):
if t != other.expected_tokens[i]:
return False
return True
return self.expected_tokens == other.expected_tokens
def __hash__(self):
return hash((self.message, self.token, self.expected_tokens))
@dataclass()
class UnexpectedEof(ErrorNode):
class UnexpectedEofNode(ErrorNode):
message: str
class BaseParser:
PREFIX = "parsers."
def __init__(self, name, priority: int, enabled=True):
def __init__(self, name, priority: int, enabled=True, yield_eof=False):
self.log = get_logger("parsers." + self.__class__.__name__)
self.init_log = get_logger("init." + self.PREFIX + self.__class__.__name__)
self.verbose_log = get_logger("verbose." + self.PREFIX + self.__class__.__name__)
@@ -107,6 +101,7 @@ class BaseParser:
self.context: ExecutionContext = None
self.sheerka = None
self.parser_input: ParserInput = None
self.yield_eof = yield_eof
def __eq__(self, other):
if not isinstance(other, self.__class__):
@@ -126,10 +121,9 @@ class BaseParser:
self.error_sink.clear()
try:
self.parser_input.reset(False)
self.parser_input.next_token()
self.parser_input.reset(self.yield_eof)
except LexerError as e:
self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
self.add_error(e, False)
return False
return True
@@ -165,12 +159,12 @@ class BaseParser:
value = context.return_value_to_str(r)
context.log(f" Recognized '{value}'", self.name)
def get_return_value_body(self, sheerka, source, tree, try_parse):
def get_return_value_body(self, sheerka, source, parsed, try_parse):
"""
All parsers must return their result in a standard way
:param sheerka:
:param source:
:param tree:
:param parsed:
:param try_parse:
:return:
"""
@@ -178,17 +172,19 @@ class BaseParser:
return self.error_sink[0]
if self.has_error:
return sheerka.new(
BuiltinConcepts.ERROR,
body=self.error_sink
)
if parsed is None:
return sheerka.new(BuiltinConcepts.NOT_FOR_ME,
body=source,
reason=self.error_sink)
else:
return sheerka.new(BuiltinConcepts.ERROR,
body=self.error_sink)
return sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=source,
body=tree,
try_parsed=try_parse)
return sheerka.new(BuiltinConcepts.PARSER_RESULT,
parser=self,
source=source,
body=parsed,
try_parsed=try_parse)
@staticmethod
def get_input_as_lexer_nodes(parser_input, expected_parser=None):
@@ -242,7 +238,7 @@ class BaseParser:
tokens = [tokens]
switcher = {
# TokenKind.CONCEPT: lambda t: core.utils.str_concept(t.value),
# TokenKind.CONCEPT: lambda t: core.utils.str_concept(t.value),
}
if custom_switcher:
+274
View File
@@ -0,0 +1,274 @@
from dataclasses import dataclass, field
import core.builtin_helpers
import core.utils
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept, ParserResultConcept
from core.concept import ConceptParts, DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF
from core.sheerka.services.SheerkaExecute import ParserInput, SheerkaExecute
from core.tokenizer import TokenKind, Keywords
from parsers.BaseCustomGrammarParser import BaseCustomGrammarParser, SyntaxErrorNode
from parsers.BaseParser import Node, ErrorNode, NotInitializedNode, UnexpectedTokenErrorNode
from parsers.BnfParser import BnfParser
class ParsingException(Exception):
    """Exception wrapper carrying a parser error node up the call stack."""

    def __init__(self, error):
        # Forward the payload to Exception so str(exc) and exc.args carry
        # useful information (the original __init__ left them empty).
        super().__init__(error)
        # The error node produced where parsing failed.
        self.error = error
@dataclass()
class DefaultParserNode(Node):
    """
    Base node for all default parser nodes
    """
    # Raw tokens the node was built from; excluded from equality and repr.
    tokens: list = field(compare=False, repr=False)
@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    # Marker base class shared by all default-parser error nodes.
    pass
@dataclass()
class CannotHandleErrorNode(DefaultParserErrorNode):
    """
    The input is not recognized
    """
    # The input text that could not be handled by this parser.
    text: str
@dataclass()
class NameNode(DefaultParserNode):
    """
    Node whose tokens form a concept name.

    Equality and hashing are based on the rendered name, not on the raw
    token objects.
    """

    def get_name(self):
        # Render the tokens as one space-separated name: stop at EOF,
        # skip whitespace, unquote STRING tokens.
        words = []
        for token in self.tokens:
            if token.type == TokenKind.EOF:
                break
            if token.type == TokenKind.WHITESPACE:
                continue
            if token.type == TokenKind.STRING:
                words.append(token.value[1:-1])
            else:
                words.append(str(token.value))
        return " ".join(words)

    def __repr__(self):
        return self.get_name()

    def __eq__(self, other):
        return isinstance(other, NameNode) and self.get_name() == other.get_name()

    def __hash__(self):
        return hash(self.get_name())
@dataclass()
class DefConceptNode(DefaultParserNode):
    """
    Result of parsing a 'def concept' declaration: one attribute per part.
    """
    # NOTE(review): these NotInitializedNode() defaults are evaluated once at
    # class-definition time and shared by every instance — fine only if the
    # sentinel carries no mutable state; confirm NotInitializedNode.
    name: NameNode = NotInitializedNode()
    where: ReturnValueConcept = NotInitializedNode()
    pre: ReturnValueConcept = NotInitializedNode()
    post: ReturnValueConcept = NotInitializedNode()
    body: ReturnValueConcept = NotInitializedNode()
    ret: ReturnValueConcept = NotInitializedNode()
    definition: ReturnValueConcept = NotInitializedNode()
    definition_type: str = None

    def get_asts(self):
        """
        Return the parts whose parsed body carries an AST (an ``ast_`` attribute).
        :return: dict mapping ConceptParts entries to their ReturnValueConcept
        """
        asts = {}
        for part_key in ConceptParts:
            # part_key.value names the attribute on this node (e.g. 'where').
            prop_value = getattr(self, part_key.value)
            if isinstance(prop_value, ReturnValueConcept) and \
                    isinstance(prop_value.body, ParserResultConcept) and \
                    hasattr(prop_value.body.body, "ast_"):
                asts[part_key] = prop_value
        return asts
@dataclass()
class IsaConceptNode(DefaultParserNode):
    """
    Node holding a concept name and a set name (presumably for an
    'is a' declaration — semantics handled by callers, not visible here).
    """
    concept: NameNode = NotInitializedNode()
    # 'set' shadows the builtin, but renaming would break existing callers.
    set: NameNode = NotInitializedNode()
class DefConceptParser(BaseCustomGrammarParser):
    """
    Parse sheerka specific grammar (like def concept).

    Grammar (parts after the name may appear in any order):
        def concept <name> [from [bnf|def] xxx] [as xxx] [where xxx]
                           [pre xxx] [post xxx] [ret xxx]
    """
    KEYWORDS = [Keywords.CONCEPT, Keywords.FROM, Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST, Keywords.RET]
    KEYWORDS_VALUES = [k.value for k in KEYWORDS]

    def __init__(self, **kwargs):
        # kwargs accepted for registry compatibility but unused here.
        BaseCustomGrammarParser.__init__(self, "DefConcept", 60)

    def parse(self, context, parser_input: ParserInput):
        """
        Entry point: parse a 'def concept' declaration from text input.
        :param context: execution context (provides sheerka and logging)
        :param parser_input: input to parse; token inputs are rejected
        :return: a sheerka return value (PARSER_RESULT, NOT_FOR_ME, ERROR or IS_EMPTY)
        """
        # default parser can only manage string text
        if parser_input.from_tokens:
            ret = context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input))
            self.log_result(context, parser_input, ret)
            return ret
        # BUG FIX: the message used to say 'FunctionParser' (copy/paste
        # leftover); report the actual parser name instead.
        context.log(f"Parsing '{parser_input}' with {self.name} parser", self.name)
        sheerka = context.sheerka
        if parser_input.is_empty():
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.IS_EMPTY))
        if not self.reset_parser(context, parser_input):
            return self.sheerka.ret(self.name,
                                    False,
                                    context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
        self.parser_input.next_token()
        node = self.parse_def_concept()
        body = self.get_return_value_body(sheerka, parser_input.as_text(), node, node)
        ret = sheerka.ret(self.name, not self.has_error, body)
        self.log_result(context, parser_input.as_text(), ret)
        return ret

    def parse_def_concept(self):
        """
        def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
        :return: a DefConceptNode, or None when the declaration is invalid
        """
        token = self.parser_input.token
        if token.value != Keywords.DEF.value:
            self.add_error(UnexpectedTokenErrorNode("'def' keyword not found.", token, [Keywords.DEF]))
            return None
        self.context.log("Keyword DEF found.", self.name)
        keywords_found = [token]
        self.parser_input.next_token()
        # ## the definition of a concept consists of several parts
        # Keywords.CONCEPT to get the name of the concept
        # Keywords.FROM [Keywords.BNF] | [Keywords.DEF] to get the definition of the concept
        # Keywords.AS to get the body
        # Keywords.WHERE to get the conditions to recognize for the variables
        # Keywords.PRE to know if the conditions to evaluate the concept
        # Keywords.POST to apply or verify once the concept is executed
        # Keywords.RET to transform the concept into another concept
        parts = self.get_parts(self.KEYWORDS_VALUES, expected_first_token=Keywords.CONCEPT)
        if parts is None:
            return None
        keywords_found.extend([t[0] for t in parts.values()])  # keep track of all keywords found
        node = DefConceptNode(keywords_found)
        # get the name
        node.name = self.get_concept_name(parts[Keywords.CONCEPT])
        # get definition
        node.definition_type, node.definition = self.get_concept_definition(node, parts)
        # get the bodies
        node.body = self.get_ast(Keywords.AS, parts)
        node.where = self.get_ast(Keywords.WHERE, parts)
        node.pre = self.get_ast(Keywords.PRE, parts)
        node.post = self.get_ast(Keywords.POST, parts)
        node.ret = self.get_ast(Keywords.RET, parts)
        return node

    def get_concept_name(self, tokens):
        """
        Extract the name following the 'concept' keyword.
        :param tokens: tokens of the CONCEPT part (keyword token first)
        :return: a NameNode, or None on error
        """
        name_tokens = core.utils.strip_tokens(tokens[1:])
        if len(name_tokens) == 0:
            self.add_error(SyntaxErrorNode([], "Name is mandatory"))
            return None
        for token in name_tokens:
            if token.type == TokenKind.NEWLINE:
                self.add_error(SyntaxErrorNode([token], "Newline are not allowed in name."))
                return None
        name_node = NameNode(name_tokens)  # skip the first token
        return name_node

    def get_concept_definition(self, current_concept_def, parts):
        """
        Read the optional FROM part of the declaration.
        :return: (definition_type, definition); (None, NotInitializedNode()) when absent or invalid
        """
        if Keywords.FROM not in parts:
            return None, NotInitializedNode()
        tokens = parts[Keywords.FROM]
        if len(tokens) == 1:
            self.add_error(SyntaxErrorNode([], f"Empty '{tokens[0].value}' declaration."), False)
            return None, NotInitializedNode()
        if tokens[1].value == Keywords.BNF.value:
            return self.get_concept_bnf_definition(current_concept_def, core.utils.strip_tokens(tokens[2:]))
        return self.get_concept_simple_definition(core.utils.strip_tokens(tokens[0:]))

    def get_concept_bnf_definition(self, current_concept_def, tokens):
        """
        Parse a 'from bnf ...' definition by delegating to BnfParser.
        :return: (DEFINITION_TYPE_BNF, parsing_result) or (None, NotInitializedNode())
        """
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([], "Empty 'bnf' declaration"), False)
            return None, NotInitializedNode()
        if tokens[0].type == TokenKind.COLON:
            tokens = self.get_body(tokens[1:])
            # BUG FIX: get_body returns None on a malformed body (the error is
            # already reported); do not hand None to BnfParser.
            if tokens is None:
                return None, NotInitializedNode()
        bnf_regex_parser = BnfParser()
        desc = f"Resolving BNF {current_concept_def.definition}"
        with self.context.push(BuiltinConcepts.INIT_BNF,
                               current_concept_def,
                               who=self.name,
                               obj=current_concept_def,
                               desc=desc) as sub_context:
            parsing_result = bnf_regex_parser.parse(sub_context, tokens)
            sub_context.add_values(return_values=parsing_result)
        if not parsing_result.status:
            self.add_error(parsing_result.value)
            return None, NotInitializedNode()
        return DEFINITION_TYPE_BNF, parsing_result

    def get_concept_simple_definition(self, tokens):
        """
        Parse a plain definition: from [def] <tokens> | from [def]: <body>.
        :return: (DEFINITION_TYPE_DEF, NameNode) or (None, NotInitializedNode())
        """
        # BUG FIX: 'from' followed only by whitespace strips down to a single
        # token; indexing tokens[1] unconditionally raised IndexError.
        start = 2 if len(tokens) > 1 and tokens[1].value == Keywords.DEF.value else 1
        tokens = core.utils.strip_tokens(tokens[start:])
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([], "Empty 'from' declaration."), False)
            return None, NotInitializedNode()
        if tokens[0].type == TokenKind.COLON:
            tokens = self.get_body(tokens[1:])
            # BUG FIX: same None-guard as the bnf variant; NameNode(None)
            # would fail later in get_name().
            if tokens is None:
                return None, NotInitializedNode()
        return DEFINITION_TYPE_DEF, NameNode(tokens)

    def get_ast(self, keyword, parts):
        """
        Parse the tokens of one part with the other registered parsers.
        :param keyword: the Keywords entry naming the part
        :param parts: dict produced by get_parts
        :return: the parsed value, NotInitializedNode() when absent, None on error
        """
        if keyword not in parts:
            return NotInitializedNode()
        tokens = parts[keyword]
        if len(tokens) == 1:
            self.add_error(SyntaxErrorNode(tokens, f"Empty '{tokens[0].value}' declaration."))
            return None
        source = self.sheerka.services[SheerkaExecute.NAME].get_parser_input(None, tokens[1:])
        parsed = core.builtin_helpers.parse_unrecognized(self.context,
                                                         source,
                                                         parsers="all",
                                                         who=self.name,
                                                         prop=keyword,
                                                         filter_func=core.builtin_helpers.expect_one)
        if not parsed.status:
            self.add_error(parsed.value)
            return None
        return parsed
-509
View File
@@ -1,509 +0,0 @@
from dataclasses import dataclass, field
import core.builtin_helpers
import core.utils
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept, ParserResultConcept
from core.concept import ConceptParts, DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF
from core.sheerka.services.SheerkaExecute import ParserInput, SheerkaExecute
from core.tokenizer import Tokenizer, TokenKind, Keywords
from parsers.BaseParser import BaseParser, Node, ErrorNode, NotInitializedNode
from parsers.BnfParser import BnfParser
class ParsingException(Exception):
    """Exception wrapper carrying a parser error node up the call stack."""

    def __init__(self, error):
        # Error node read by callers via ``except ParsingException as ex: ex.error``.
        self.error = error
@dataclass()
class DefaultParserNode(Node):
    """
    Base node for all default parser nodes
    """
    # Raw tokens the node was built from; excluded from equality and repr.
    tokens: list = field(compare=False, repr=False)
@dataclass()
class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
    # Marker base class shared by all DefaultParser error nodes.
    pass
@dataclass()
class UnexpectedTokenErrorNode(DefaultParserErrorNode):
    # Human-readable description of the failure.
    message: str
    # Tokens or token kinds that would have been valid at this position.
    expected_tokens: list
@dataclass()
class SyntaxErrorNode(DefaultParserErrorNode):
    """
    The input is recognized, but there is a syntax error
    """
    # Human-readable description of the syntax error.
    message: str
@dataclass()
class CannotHandleErrorNode(DefaultParserErrorNode):
    """
    The input is not recognized
    """
    # The input text this parser could not handle.
    text: str
@dataclass()
class NameNode(DefaultParserNode):
    """Node whose tokens form a concept name; compared by rendered name."""

    def get_name(self):
        # Render the tokens as one space-separated name:
        # stop at EOF, skip whitespace, unquote STRING tokens.
        name = ""
        first = True
        for token in self.tokens:
            if token.type == TokenKind.EOF:
                break
            if token.type == TokenKind.WHITESPACE:
                continue
            if not first:
                name += " "
            name += token.value[1:-1] if token.type == TokenKind.STRING else str(token.value)
            first = False
        return name

    def __repr__(self):
        return self.get_name()

    def __eq__(self, other):
        # Equality is based on the rendered name, not the raw tokens.
        if not isinstance(other, NameNode):
            return False
        return self.get_name() == other.get_name()

    def __hash__(self):
        return hash(self.get_name())
@dataclass()
class DefConceptNode(DefaultParserNode):
    """
    Result of parsing a 'def concept' declaration: one attribute per part.
    """
    # NOTE(review): these NotInitializedNode() defaults are evaluated once and
    # shared by every instance — safe only if the sentinel is stateless.
    name: NameNode = NotInitializedNode()
    where: ReturnValueConcept = NotInitializedNode()
    pre: ReturnValueConcept = NotInitializedNode()
    post: ReturnValueConcept = NotInitializedNode()
    body: ReturnValueConcept = NotInitializedNode()
    ret: ReturnValueConcept = NotInitializedNode()
    definition: ReturnValueConcept = NotInitializedNode()
    definition_type: str = None

    def get_asts(self):
        """
        Return the parts whose parsed body carries an AST (an ``ast_`` attribute).
        :return: dict mapping ConceptParts entries to their ReturnValueConcept
        """
        asts = {}
        for part_key in ConceptParts:
            # part_key.value names the attribute on this node (e.g. 'where').
            prop_value = getattr(self, part_key.value)
            if isinstance(prop_value, ReturnValueConcept) and \
                    isinstance(prop_value.body, ParserResultConcept) and \
                    hasattr(prop_value.body.body, "ast_"):
                asts[part_key] = prop_value
        return asts
@dataclass()
class IsaConceptNode(DefaultParserNode):
    """
    Node holding a concept name and a set name (presumably for an
    'is a' declaration — semantics handled by callers, not visible here).
    """
    concept: NameNode = NotInitializedNode()
    # 'set' shadows the builtin, but renaming would break existing callers.
    set: NameNode = NotInitializedNode()
class DefaultParser(BaseParser):
"""
Parse sheerka specific grammar (like def concept)
"""
    def __init__(self, **kwargs):
        # Fixed parser name and priority; kwargs accepted for registry
        # compatibility but unused here.
        BaseParser.__init__(self, "Default", 60)
    @staticmethod
    def fix_indentation(tokens):
        """
        In the following example
            def concept add one to a as:
                def func(x):
                    return x+1
                func(a)
        indentations in front of 'def func(x)', 'return x+1' and 'func(a)'
        must be fixed to avoid a python syntax error
        :param tokens: tokens starting at the optional colon
        :return: the body tokens with the leading indentation level removed
        :raises ParsingException: when the header (colon/newline/indent) is malformed
        """
        if len(tokens) == 0:
            return tokens
        tokens = tokens.copy()  # do not modify ParserInput.tokens
        # No colon means no indented body: nothing to fix.
        if tokens[0].type != TokenKind.COLON:
            return tokens
        if len(tokens) < 3:
            raise ParsingException(UnexpectedTokenErrorNode(tokens[0:2],
                                                            "Unexpected end of file",
                                                            [TokenKind.NEWLINE]))
        pos = DefaultParser.eat_white_space(tokens, 1)
        if tokens[pos].type != TokenKind.NEWLINE:
            raise ParsingException(UnexpectedTokenErrorNode([tokens[pos]],
                                                            "Unexpected token after colon",
                                                            [TokenKind.NEWLINE]))
        pos += 1
        if tokens[pos].type != TokenKind.WHITESPACE:
            raise ParsingException(SyntaxErrorNode([tokens[pos]],
                                                   "Indentation not found."))
        # Indentation of the first body line defines the reference level.
        indent_size = len(tokens[pos].value)
        pos += 1
        # now fix the other indentations
        # KSI 23/05/2020 Not quite sure this 'fixing' stuff is still relevant,
        # as I now have an editor in interactive mode
        i = pos
        while i < len(tokens) - 1:
            if tokens[i].type == TokenKind.NEWLINE:
                if tokens[i + 1].type != TokenKind.WHITESPACE:
                    # NOTE(review): the two returns below hand back an error
                    # node instead of raising ParsingException like the header
                    # checks above — callers expect a token list here; confirm
                    # this inconsistency is intentional.
                    return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
                if len(tokens[i + 1].value) < indent_size:
                    return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
                # Clone before mutating: the token may be shared.
                tokens[i + 1] = tokens[i + 1].clone()
                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
            i += 1
        return tokens[pos:]
@staticmethod
def eat_white_space(tokens, index):
if index >= len(tokens):
return index
while index < len(tokens) and tokens[index].type == TokenKind.WHITESPACE:
index += 1
return index
    def reset_parser(self, context, parser_input):
        """
        Bind this parser to the execution context and rewind its input to the
        first token. Lexer errors propagate to the caller (parse() catches
        core.tokenizer.LexerError around this call).
        """
        self.context = context
        self.sheerka = context.sheerka
        self.parser_input = parser_input
        self.parser_input.reset()
        self.parser_input.next_token()
    def parse(self, context, parser_input: ParserInput):
        """
        Entry point: parse the input and wrap the result in a sheerka
        return value (PARSER_RESULT, NOT_FOR_ME or ERROR).
        :param context: execution context (provides sheerka and logging)
        :param parser_input: input to parse; token inputs are rejected
        """
        # default parser can only manage string text
        if parser_input.from_tokens:
            ret = context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input))
            self.log_result(context, parser_input, ret)
            return ret
        try:
            self.reset_parser(context, parser_input)
            tree = self.parse_statement()
        except core.tokenizer.LexerError as e:
            # Tokenization failed: report the lexer error as-is.
            return self.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.ERROR, body=[e]))
        # If an error is found it must be sent to error_sink;
        # tree must contain what was recognized.
        if self.has_error and isinstance(self.error_sink[0], CannotHandleErrorNode):
            # Not our grammar at all: let another parser try.
            body = self.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=self.error_sink)
        else:
            body = self.get_return_value_body(context.sheerka, parser_input.as_text(), tree, tree)
        ret = self.sheerka.ret(
            self.name,
            not self.has_error,
            body)
        self.log_result(context, parser_input.as_text(), ret)
        return ret
    def parse_statement(self):
        """
        Dispatch on the first token: only 'def ...' statements are handled;
        anything else is reported as CannotHandleErrorNode.
        """
        token = self.parser_input.token
        if token.value == Keywords.DEF.value:
            self.parser_input.next_token()
            self.context.log("Keyword DEF found.", self.name)
            return self.parse_def_concept(token)
        return self.add_error(CannotHandleErrorNode([token], ""))
    def parse_def_concept(self, def_token):
        """
        def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
        :param def_token: the already-consumed 'def' keyword token
        :return: the DefConceptNode that was recognized (possibly partial)
        """
        # init
        keywords_tokens = [def_token]
        concept_found = DefConceptNode(keywords_tokens)
        # ##
        # ## the definition of a concept consists of several parts
        # ## Keywords.CONCEPT to get the name of the concept
        # ## Keywords.FROM [Keywords.BNF] | [Keywords.DEF] to get the definition of the concept
        # ## Keywords.AS to get the body
        # ## Keywords.WHERE to get the conditions to recognize for the variables
        # ## Keywords.PRE to know if the conditions to evaluate the concept
        # ## Keywords.POST to apply or verify once the concept is executed
        # Regroup the tokens by parts
        first_token, tokens_found_by_parts = self.regroup_tokens_by_parts(keywords_tokens)
        if first_token.type == TokenKind.EOF:
            return self.add_error(UnexpectedTokenErrorNode([first_token], "Unexpected end of file", [Keywords.CONCEPT]))
        # get the name
        concept_found.name = self.get_concept_name(first_token, tokens_found_by_parts)
        # get the definition
        def_type, def_value = self.get_concept_definition(concept_found, tokens_found_by_parts)
        concept_found.definition_type = def_type
        concept_found.definition = def_value
        # get the ASTs for the remaining parts
        asts_found_by_parts = self.get_concept_parts(tokens_found_by_parts)
        concept_found.where = asts_found_by_parts[Keywords.WHERE]
        concept_found.pre = asts_found_by_parts[Keywords.PRE]
        concept_found.post = asts_found_by_parts[Keywords.POST]
        concept_found.body = asts_found_by_parts[Keywords.AS]
        concept_found.ret = asts_found_by_parts[Keywords.RET]
        return concept_found
    def regroup_tokens_by_parts(self, keywords_tokens):
        """
        Walk the parser input and bucket the tokens by the keyword part they
        belong to ('concept', 'from', 'as', 'where', 'pre', 'post', 'ret').
        :param keywords_tokens: list accumulating every keyword token seen
                                (mutated in place)
        :return: (first significant token, dict keyword -> token list)
        """
        def new_part(t, cma, p):
            """
            Does token *t* open a new keyword part?
            :param t: token
            :param cma: concept_mode_activated
            :param p: previous token
            :return: True when *t* starts a new part
            """
            if not t.value in def_concept_parts:
                return False
            if not cma or not p:
                return True
            # In colon mode a keyword only counts when it starts a new line.
            return p.line != t.line

        def_concept_parts = [Keywords.CONCEPT.value,
                             Keywords.FROM.value,
                             Keywords.AS.value,
                             Keywords.WHERE.value,
                             Keywords.PRE.value,
                             Keywords.POST.value,
                             Keywords.RET.value]
        # tokens found, when trying to recognize the parts
        tokens_found_by_parts = {
            Keywords.CONCEPT: [],
            Keywords.FROM: None,
            Keywords.AS: None,
            Keywords.WHERE: None,
            Keywords.PRE: None,
            Keywords.POST: None,
            Keywords.RET: None,
        }
        current_part = Keywords.CONCEPT
        token = self.parser_input.token
        first_token = token
        colon_mode_activated = False  # if activated, use keyword + colon to start a new keyword definition
        previous_token = None
        # more explanation on colon_mode_activated
        # You can use the pattern
        #   def concept <name> as:
        #       <tab> xxx
        #       <tab> yyy
        #       ...
        #
        # It improves readability and allows usage of other keywords inside the block.
        # Example
        #   def concept give the date as:
        #       from datetime import date
        #       return date.today()
        #
        # 'from datetime' will not be considered as a keyword because it is led by a tab
        # whereas in
        #   def concept in x days as:
        #       from datetime import date
        #       return date.today() - x
        #   where x > 0
        #
        # 'where' will be recognized as the keyword because it is the first word of the line
        # loop thru the tokens, and put them in the correct tokens_found_by_parts entry
        while token.type != TokenKind.EOF:
            if new_part(token, colon_mode_activated, previous_token):
                keywords_tokens.append(token)  # keep track of the keywords
                keyword = Keywords(token.value)
                if tokens_found_by_parts[keyword]:
                    # a part is defined more than once
                    self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
                    tokens_found_by_parts[current_part].append(token)  # adds the token again
                else:
                    tokens_found_by_parts[keyword] = [token]
                    current_part = keyword
                    colon_mode_activated = self.parser_input.the_token_after().type == TokenKind.COLON
                self.parser_input.next_token()
            else:
                tokens_found_by_parts[current_part].append(token)
                self.parser_input.next_token(False)
            previous_token = token
            token = self.parser_input.token
        return first_token, tokens_found_by_parts
    def get_concept_name(self, first_token, tokens_found_by_parts):
        """
        Build the NameNode for the CONCEPT part.
        :param first_token: first significant token after 'def'
        :param tokens_found_by_parts: buckets from regroup_tokens_by_parts
        :return: a NameNode (errors are reported but parsing continues)
        """
        name_first_token_index = 1
        token = self.parser_input.token
        if first_token.value != Keywords.CONCEPT.value:
            # Tolerate a missing 'concept' keyword: the name starts at index 0.
            self.add_error(UnexpectedTokenErrorNode([token], "Syntax error.", [Keywords.CONCEPT]))
            name_first_token_index = 0
        name_tokens = tokens_found_by_parts[Keywords.CONCEPT]
        if len(name_tokens) == name_first_token_index:
            self.add_error(SyntaxErrorNode([], "Name is mandatory"))
        # NOTE(review): when name_tokens is empty the next line raises
        # IndexError instead of reporting the error above — confirm reachable.
        if name_tokens[-1].type == TokenKind.NEWLINE:
            name_tokens = name_tokens[:-1]  # strip trailing newlines
        if TokenKind.NEWLINE in [t.type for t in name_tokens]:
            self.add_error(SyntaxErrorNode(tokens_found_by_parts[Keywords.CONCEPT], "Newline are not allowed in name."))
        tokens = name_tokens[name_first_token_index:]
        stripped = core.utils.strip_tokens(tokens)
        # A fully double-quoted name is re-tokenized without its quotes.
        if len(stripped) == 1 and stripped[0].type == TokenKind.STRING and stripped[0].value[0] == '"':
            tokens = list(Tokenizer(stripped[0].strip_quote, yield_eof=False))
        name_node = NameNode(tokens)  # skip the first token
        return name_node
    def get_concept_definition(self, current_concept_def, tokens_found_by_parts):
        """
        Read the optional FROM part of the declaration.
        :return: (definition_type, definition); (None, NotInitializedNode())
                 when the part is absent or invalid
        """
        if tokens_found_by_parts[Keywords.FROM] is None:
            return None, NotInitializedNode()
        definition_tokens = tokens_found_by_parts[Keywords.FROM]
        # Only the keyword token itself: the declaration is empty.
        if len(definition_tokens) == 1:
            self.add_error(SyntaxErrorNode([], "Empty declaration"), False)
            return None, NotInitializedNode()
        if definition_tokens[1].value == Keywords.BNF.value:
            return self.get_concept_bnf_definition(current_concept_def, definition_tokens)
        return self.get_concept_simple_definition(definition_tokens)
    def get_concept_bnf_definition(self, current_concept_def, definition_tokens):
        """
        Parse a 'from bnf ...' definition by delegating to BnfParser.
        :return: (DEFINITION_TYPE_BNF, parsing_result) or (None, NotInitializedNode())
        """
        try:
            # Drop 'from bnf' and normalize the body indentation.
            tokens = self.fix_indentation(core.utils.strip_tokens(definition_tokens[2:]))
        except ParsingException as ex:
            self.add_error(ex.error)
            return None, NotInitializedNode()
        if len(tokens) == 0:
            self.add_error(SyntaxErrorNode([definition_tokens[1]], "Empty declaration"), False)
            return None, NotInitializedNode()
        bnf_regex_parser = BnfParser()
        desc = f"Resolving BNF {current_concept_def.definition}"
        with self.context.push(BuiltinConcepts.INIT_BNF,
                               current_concept_def,
                               who=self.name,
                               obj=current_concept_def,
                               desc=desc) as sub_context:
            parsing_result = bnf_regex_parser.parse(sub_context, tokens)
            sub_context.add_values(return_values=parsing_result)
        if not parsing_result.status:
            self.add_error(parsing_result.value)
            return None, NotInitializedNode()
        return DEFINITION_TYPE_BNF, parsing_result
def get_concept_simple_definition(self, definition_tokens):
    """
    Build a plain (non-BNF) concept definition from the 'from' tokens.

    An optional 'def' keyword right after 'from' is skipped.

    :param definition_tokens: tokens of the 'from' part, including the keywords
    :return: (DEFINITION_TYPE_DEF, NameNode) on success,
             (None, NotInitializedNode()) on error
    """
    if definition_tokens[1].value == Keywords.DEF.value:
        body_start = 2  # skip the optional 'def' keyword as well
    else:
        body_start = 1
    try:
        stripped = core.utils.strip_tokens(definition_tokens[body_start:])
        body_tokens = self.fix_indentation(stripped)
    except ParsingException as ex:
        self.add_error(ex.error)
        return None, NotInitializedNode()
    if not body_tokens:
        # nothing left after the keyword(s)
        self.add_error(SyntaxErrorNode([definition_tokens[body_start]], "Empty declaration"), False)
        return None, NotInitializedNode()
    return DEFINITION_TYPE_DEF, NameNode(body_tokens)
def get_concept_parts(self, tokens_found_by_parts):
    """
    Parse every secondary part of a concept declaration (as/where/pre/post/ret).

    The CONCEPT and FROM parts are handled elsewhere and skipped here. Each
    remaining part is re-indented and handed to the other registered parsers;
    parts that are absent or fail to parse keep their NotInitializedNode
    placeholder, with any error recorded on this parser.

    :param tokens_found_by_parts: mapping keyword -> token list (or None when absent)
    :return: mapping keyword -> parsed result (NotInitializedNode when absent/failed)
    """
    asts_found_by_parts = {
        Keywords.AS: NotInitializedNode(),
        Keywords.WHERE: NotInitializedNode(),
        Keywords.PRE: NotInitializedNode(),
        Keywords.POST: NotInitializedNode(),
        Keywords.RET: NotInitializedNode()
    }
    # iterate items() directly instead of re-indexing the dict per key
    for keyword, tokens in tokens_found_by_parts.items():
        if keyword in (Keywords.CONCEPT, Keywords.FROM):
            continue  # already done
        if tokens is None:
            continue  # nothing to do
        if len(tokens) == 1:  # check for empty declarations (keyword alone)
            self.add_error(SyntaxErrorNode([tokens[0]], "Empty declaration"), False)
            continue
        try:
            tokens = self.fix_indentation(tokens[1:])  # manage multi-lines declarations
        except ParsingException as ex:
            self.add_error(ex.error)
            continue
        # ask the other parsers if they recognize the tokens
        source = self.sheerka.services[SheerkaExecute.NAME].get_parser_input(None, tokens)
        parsed = core.builtin_helpers.parse_unrecognized(self.context,
                                                         source,
                                                         parsers="all",
                                                         who=self.name,
                                                         prop=keyword,
                                                         filter_func=core.builtin_helpers.expect_one)
        if not parsed.status:
            self.add_error(parsed.value)
            continue
        asts_found_by_parts[keyword] = parsed
    return asts_found_by_parts
+5 -4
View File
@@ -5,7 +5,7 @@ from core.builtin_concepts import BuiltinConcepts
from core.concept import Concept
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import LexerError, TokenKind, Token
from parsers.BaseParser import Node, BaseParser, UnexpectedTokenErrorNode, UnexpectedEof, ErrorNode
from parsers.BaseParser import Node, BaseParser, UnexpectedTokenErrorNode, UnexpectedEofNode, ErrorNode
class ExprNode(Node):
@@ -189,7 +189,7 @@ class ExpressionParser(BaseParser):
"""
def __init__(self, **kwargs):
super().__init__("Expression", 50, False)
super().__init__("Expression", 50, False, yield_eof=True)
def parse(self, context, parser_input: ParserInput):
"""
@@ -215,6 +215,7 @@ class ExpressionParser(BaseParser):
False,
context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
self.parser_input.next_token()
tree = self.parse_or()
token = self.parser_input.token
if token and token.type != TokenKind.EOF:
@@ -240,7 +241,7 @@ class ExpressionParser(BaseParser):
self.parser_input.next_token()
expr = self.parse_and()
if expr is None:
self.add_error(UnexpectedEof("When parsing 'or'"))
self.add_error(UnexpectedEofNode("When parsing 'or'"))
return OrNode(*parts)
parts.append(expr)
token = self.parser_input.token
@@ -258,7 +259,7 @@ class ExpressionParser(BaseParser):
self.parser_input.next_token()
expr = self.parse_names()
if expr is None:
self.add_error(UnexpectedEof("When parsing 'and'"))
self.add_error(UnexpectedEofNode("When parsing 'and'"))
return AndNode(*parts)
parts.append(expr)
token = self.parser_input.token
+132
View File
@@ -0,0 +1,132 @@
from dataclasses import dataclass
from core.builtin_concepts import BuiltinConcepts, ReturnValueConcept
from core.builtin_helpers import parse_unrecognized, expect_one
from core.sheerka.services.SheerkaExecute import ParserInput, SheerkaExecute
from core.tokenizer import Keywords
from core.utils import strip_tokens
from parsers.BaseCustomGrammarParser import BaseCustomGrammarParser, KeywordNotFound
from parsers.BaseParser import BaseParser, Node
@dataclass
class FormatAstNode:
    """Base class for nodes of the internal format-description AST."""
    pass
@dataclass
class FormatAstRawText(FormatAstNode):
    """Format AST leaf holding a literal chunk of text, kept verbatim."""
    text: str
@dataclass
class FormatRuleNode(Node):
    """Result of parsing a 'when xxx print yyy' formatting rule."""
    # mapping keyword -> token list, as produced by get_parts()
    tokens: dict
    # parsed 'when' condition; stays None until get_when() succeeds
    rule: ReturnValueConcept = None
    # parsed 'print' part; stays None until get_print() succeeds
    format_ast: FormatAstNode = None
class FormatRuleParser(BaseCustomGrammarParser):
    """
    Class that will parse formatting rules definitions
    when xxx print yyy
    where xxx will be evaluated in the context of BuiltinConcepts.EVAL_QUESTION_REQUESTED
    and yyy is an internal way to describe a format (yet another one)
    """
    KEYWORDS = [Keywords.WHEN, Keywords.PRINT]
    KEYWORDS_VALUES = [k.value for k in KEYWORDS]

    def __init__(self, **kwargs):
        BaseCustomGrammarParser.__init__(self, "FormatRule", 60)

    def parse(self, context, parser_input: ParserInput):
        """
        Parse a 'when ... print ...' formatting rule from text input.

        :param context: current execution context
        :param parser_input: the input to parse; must be a text-based ParserInput
        :return: a sheerka return value wrapping the FormatRuleNode, or an
                 error / NOT_FOR_ME / IS_EMPTY result; None when the input
                 is not a ParserInput at all
        """
        if not isinstance(parser_input, ParserInput):
            return None
        if parser_input.from_tokens:
            # this parser only handles raw text input
            ret = context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input))
            self.log_result(context, parser_input, ret)
            return ret
        # fixed: message previously said 'FunctionParser' (copy/paste leftover)
        context.log(f"Parsing '{parser_input}' with FormatRuleParser", self.name)
        sheerka = context.sheerka
        if parser_input.is_empty():
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.IS_EMPTY))
        if not self.reset_parser(context, parser_input):
            # use the local sheerka: self.sheerka may not be set when reset fails
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
        self.parser_input.next_token()
        rule = self.parse_rule()
        body = self.get_return_value_body(sheerka, parser_input.as_text(), rule, rule)
        ret = sheerka.ret(self.name, not self.has_error, body)
        self.log_result(context, parser_input.as_text(), ret)
        return ret

    def parse_rule(self):
        """
        Split the input on the rule keywords and validate both parts.

        :return: a FormatRuleNode (possibly partially filled when a sub-part
                 fails), or None when the keywords cannot be located at all
        """
        parts = self.get_parts(self.KEYWORDS_VALUES)
        if parts is None:
            return None
        node = FormatRuleNode(parts)
        try:
            res = self.get_when(parts[Keywords.WHEN])
            if res is None:
                return node  # 'when' failed; error already recorded
            node.rule = res
            parsed = self.get_print(parts[Keywords.PRINT])
            if parsed is None:
                return node  # 'print' failed; error already recorded
            node.format_ast = parsed
        except KeyError as e:
            # one of the mandatory keywords is missing from the parts mapping
            self.add_error(KeywordNotFound([], [e.args[0].value]))
            return None
        return node

    def get_when(self, tokens):
        """
        Validate the when part of the rule.

        The tokens after the 'when' keyword are handed to all other
        registered parsers; exactly one of them must recognize them.

        :param tokens: tokens of the 'when' part, keyword included
        :return: the parsing result, or None on failure (error recorded)
        """
        source = self.sheerka.services[SheerkaExecute.NAME].get_parser_input(None, strip_tokens(tokens[1:]))
        parsed = parse_unrecognized(self.context,
                                    source,
                                    parsers="all",
                                    who=self.name,
                                    prop=Keywords.WHEN,
                                    filter_func=expect_one)
        if not parsed.status:
            self.add_error(parsed.value)
            return None
        return parsed

    def get_print(self, tokens):
        """
        Validate the print part.

        Currently the text after 'print' is kept verbatim as a raw-text
        format AST leaf.

        :param tokens: tokens of the 'print' part, keyword included
        :return: a FormatAstRawText wrapping the remaining text
        """
        source = BaseParser.get_text_from_tokens(strip_tokens(tokens[1:]))
        return FormatAstRawText(source)
+12 -9
View File
@@ -7,7 +7,7 @@ from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import TokenKind, Token
from core.utils import get_n_clones
from parsers.BaseNodeParser import SourceCodeNode, SourceCodeWithConceptNode, UnrecognizedTokensNode
from parsers.BaseParser import BaseParser, UnexpectedTokenErrorNode, UnexpectedEof, Node
from parsers.BaseParser import BaseParser, UnexpectedTokenErrorNode, UnexpectedEofNode, Node
from parsers.PythonWithConceptsParser import PythonWithConceptsParser
# No need to check for Python code as the source code node will resolve to python code anyway
@@ -143,7 +143,7 @@ class FunctionParser(BaseParser):
so 'twenty one' will resolve to [[c:twenty one:]], not [[c:twenty one:], [c:twenty:, c:one:]]
:param kwargs:
"""
super().__init__("Function", 55, True)
super().__init__("Function", 55)
self.sep = sep
self.longest_concepts_only = longest_concepts_only
self.record_errors = True
@@ -179,6 +179,7 @@ class FunctionParser(BaseParser):
False,
context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
self.parser_input.next_token()
node = self.parse_function()
if self.parser_input.next_token():
@@ -219,7 +220,7 @@ class FunctionParser(BaseParser):
return None
if not self.parser_input.next_token():
self.add_error(UnexpectedEof(f"Unexpected EOF while parsing left parenthesis"))
self.add_error(UnexpectedEofNode(f"Unexpected EOF while parsing left parenthesis"))
return None
token = self.parser_input.token
@@ -231,7 +232,7 @@ class FunctionParser(BaseParser):
start_node = NamesNode(start, start + 1, self.parser_input.tokens[start:start + 2])
if not self.parser_input.next_token():
self.add_error(UnexpectedEof(f"Unexpected EOF after left parenthesis"))
self.add_error(UnexpectedEofNode(f"Unexpected EOF after left parenthesis"))
return FunctionNode(start_node, None, None)
params = self.parse_parameters()
@@ -239,7 +240,7 @@ class FunctionParser(BaseParser):
return FunctionNode(start_node, None, params)
token = self.parser_input.token
if token.type != TokenKind.RPAR:
if not token or token.type != TokenKind.RPAR:
self.add_error(UnexpectedTokenErrorNode(f"Right parenthesis not found",
token,
[TokenKind.RPAR]))
@@ -261,7 +262,7 @@ class FunctionParser(BaseParser):
token = self.parser_input.token
if token.type == TokenKind.EOF:
self.add_error(UnexpectedEof(f"Unexpected EOF while parsing parameters"))
self.add_error(UnexpectedEofNode(f"Unexpected EOF while parsing parameters"))
return None
if token.type == TokenKind.RPAR:
@@ -269,10 +270,12 @@ class FunctionParser(BaseParser):
if token.value == self.sep:
sep_pos = self.parser_input.pos
self.parser_input.next_token()
has_next = self.parser_input.next_token() # it's before add_sep() to capture trailing whitespace
function_parameter.add_sep(sep_pos,
self.parser_input.pos - 1,
self.parser_input.tokens[sep_pos: self.parser_input.pos])
if not has_next:
break
return nodes
@@ -292,8 +295,8 @@ class FunctionParser(BaseParser):
tokens = []
while True:
token = self.parser_input.token
# if token is None:
# break
if token is None:
break
if token.value == self.sep or token.type == TokenKind.RPAR:
break