Implemented FunctionParser

This commit is contained in:
2020-09-17 14:11:09 +02:00
parent 8a866880bc
commit 177a6b1d5f
40 changed files with 1752 additions and 561 deletions
+407
View File
@@ -0,0 +1,407 @@
from dataclasses import dataclass
from typing import List
from core.builtin_concepts import BuiltinConcepts
from core.builtin_helpers import get_lexer_nodes_from_unrecognized, update_compiled
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import TokenKind, Token
from core.utils import get_n_clones
from parsers.BaseNodeParser import SourceCodeNode, SourceCodeWithConceptNode, UnrecognizedTokensNode
from parsers.BaseParser import BaseParser, UnexpectedTokenErrorNode, UnexpectedEof, Node
from parsers.PythonWithConceptsParser import PythonWithConceptsParser
# No need to check for Python code as the source code node will resolve to python code anyway
# I only look for concepts, so only the concept-producing parsers are listed here.
PARSERS = ["BnfNode", "SyaNode", "AtomNode"]
@dataclass
class FunctionParserNode(Node):
    """Common base class for every node produced by the FunctionParser."""
@dataclass()
class NamesNode(FunctionParserNode):
    """A contiguous run of tokens together with its position in the input."""

    start: int  # index of the first token
    end: int  # index of the last token
    tokens: List[Token]

    def __repr__(self):
        # BUG FIX: the repr used to say 'NameNode', a class that does not
        # exist; use the actual class name so debug output is not misleading.
        return f"NamesNode('{self.str_value()}')"

    def str_value(self):
        """Concatenate the string values of the tokens, or None when tokens is None."""
        if self.tokens is None:
            return None
        return "".join([t.str_value for t in self.tokens])

    def to_unrecognized(self):
        """Convert back to an UnrecognizedTokensNode with its source fixed."""
        return UnrecognizedTokensNode(self.start, self.end, self.tokens).fix_source()
@dataclass()
class FunctionParameter:
    """
    Class that represents the result of parsing a single parameter.
    """

    value: NamesNode  # parsed value
    separator: NamesNode = None  # holds the value and the position of the separator

    def add_sep(self, start, end, tokens):
        """Record the separator tokens that followed this parameter."""
        self.separator = NamesNode(start, end, tokens)

    def value_to_unrecognized(self):
        # Delegate to NamesNode.to_unrecognized instead of duplicating its
        # conversion logic here (kept in one place for consistency).
        return self.value.to_unrecognized()

    def separator_to_unrecognized(self):
        # Same delegation; None when this parameter had no trailing separator.
        if self.separator is None:
            return None
        return self.separator.to_unrecognized()
@dataclass
class FunctionNode(FunctionParserNode):
    """Parse result for a whole function call expression 'name(params...)'."""
    first: NamesNode  # beginning of the function (it should represent the name of the function)
    last: NamesNode  # last part of the function (it should be the trailing parenthesis)
    parameters: list  # list of FunctionParameter (may be None when parsing stopped early)
class FN(FunctionNode):
    """
    Test class only
    It matches with FunctionNode but with less constraints
    Thereby,
    FN("first", "last", ["param1," ...]) can be compared to
    FunctionNode(NamesNode("first"), NamesNode("second"), [FunctionParameter(NamesNodes("param1"), NamesNodes(", ")])
    Note that FunctionParameter can easily be defined with a single string
    * "param" -> FunctionParameter(NamesNode("param"), None)
    * "param, " -> FunctionParameter(NamesNode("param"), NamesNode(", "))
    For more complicated situations, you can use a tuple (value, sep) to define the value part and the separator part
    """

    def __init__(self, first, last, parameters):
        self.first = first
        self.last = last
        # Normalize every parameter to a (value, separator) pair.
        self.parameters = []
        for param in parameters:
            if isinstance(param, tuple):
                # already an explicit (value, separator) pair
                self.parameters.append(param)
            elif isinstance(param, str) and (pos := param.find(",")) != -1:
                # split "param, " into ("param", ", ")
                self.parameters.append((param[:pos], param[pos:]))
            else:
                self.parameters.append((param, None))

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, FN):
            return self.first == other.first and self.last == other.last and self.parameters == other.parameters
        if isinstance(other, FunctionNode):
            if self.first != other.first.str_value() or self.last != other.last.str_value():
                return False
            if len(self.parameters) != len(other.parameters):
                return False
            for self_parameter, other_parameter in zip(self.parameters, other.parameters):
                # Compare by string value only when the expected side is a string.
                value = other_parameter.value.str_value() if isinstance(self_parameter[0],
                                                                        str) else other_parameter.value
                sep = other_parameter.separator.str_value() if other_parameter.separator else None
                if self_parameter[0] != value or self_parameter[1] != sep:
                    return False
            return True
        return False

    def __hash__(self):
        # BUG FIX: self.parameters is a list, which is unhashable, so hashing
        # an FN instance used to raise TypeError. Convert it to a tuple first.
        return hash((self.first, self.last, tuple(self.parameters)))
class FunctionParser(BaseParser):
    """
    The parser will be used to parse func(x, y, z)
    where x, y and z can be source code, concepts or other functions
    It will return a SourceCodeNode or SourceCodeNodeWithConcept
    """

    def __init__(self, sep=",", longest_concepts_only=True, **kwargs):
        """
        :param sep: token value used as the parameter separator
        :param longest_concepts_only: When multiples concepts are found, only keep the longest one
        so 'twenty one' will resolve to [[c:twenty one:]], not [[c:twenty one:], [c:twenty:, c:one:]]
        :param kwargs: unused here; accepted so callers can pass a common kwargs set
        """
        super().__init__("Function", 55, True)
        self.sep = sep
        self.longest_concepts_only = longest_concepts_only
        # When False, add_error() is a no-op; toggled off while speculatively
        # probing for nested functions (see parse_parameter_value).
        self.record_errors = True

    def add_error(self, error, next_token=True):
        # Errors are dropped while record_errors is False (speculative parse).
        if not self.record_errors:
            return
        return super().add_error(error, next_token)

    def parse(self, context, parser_input: ParserInput):
        """
        Parse the whole input as a single function call.

        :param context: execution context (provides sheerka and logging)
        :param parser_input: the tokens to parse
        :return: a sheerka return value, or a list of them when several
                 concept combinations are possible, or None when the input
                 is not a ParserInput
        """
        if not isinstance(parser_input, ParserInput):
            return None
        context.log(f"Parsing '{parser_input}' with FunctionParser", self.name)
        sheerka = context.sheerka
        if parser_input.is_empty():
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.IS_EMPTY))
        if not self.reset_parser(context, parser_input):
            # NOTE(review): this method mixes self.sheerka and context.sheerka --
            # presumably BaseParser/reset_parser sets self.sheerka from the
            # context; confirm and unify.
            return self.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
        node = self.parse_function()
        # Any token left after the function is an error: a single function
        # call must consume the whole input.
        if self.parser_input.next_token():
            self.add_error(UnexpectedTokenErrorNode("Only one function supported",
                                                    self.parser_input.token,
                                                    [TokenKind.EOF]))
        if self.has_error:
            if node is None:
                # Not even 'name(' was matched: the input is not for this parser.
                body = context.sheerka.new(BuiltinConcepts.NOT_FOR_ME,
                                           body=parser_input.as_text(),
                                           reason=self.error_sink)
            else:
                # A function was partially parsed, so this is a real error.
                body = context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)
            return self.sheerka.ret(self.name, False, body)
        # One source code node per possible concept combination.
        source_code_nodes = self.to_source_code_node(node)
        res = []
        for source_code_node in source_code_nodes:
            value = self.get_return_value_body(context.sheerka,
                                               self.parser_input.as_text(),
                                               source_code_node,
                                               source_code_node)
            # Status is success only when the node resolved to valid Python.
            res.append(self.sheerka.ret(self.name, source_code_node.python_node is not None, value))
        return res[0] if len(res) == 1 else res

    def parse_function(self):
        """
        Parse 'identifier ( parameters )' starting at the current token.

        :return: a FunctionNode (possibly partial when an error was recorded),
                 or None when not even 'identifier (' could be matched
        """
        start = self.parser_input.pos
        token = self.parser_input.token
        if token.type != TokenKind.IDENTIFIER:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a identifier",
                                                    token,
                                                    [TokenKind.IDENTIFIER]))
            return None
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF while parsing left parenthesis"))
            return None
        token = self.parser_input.token
        if token.type != TokenKind.LPAR:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a left parenthesis",
                                                    token,
                                                    [TokenKind.LPAR]))
            return None
        # start_node covers two tokens: the identifier and the '('
        # (end index appears to be inclusive here -- see the start+1 / start+2 pair).
        start_node = NamesNode(start, start + 1, self.parser_input.tokens[start:start + 2])
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF after left parenthesis"))
            return FunctionNode(start_node, None, None)
        params = self.parse_parameters()
        if self.has_error:
            return FunctionNode(start_node, None, params)
        token = self.parser_input.token
        if token.type != TokenKind.RPAR:
            self.add_error(UnexpectedTokenErrorNode(f"Right parenthesis not found",
                                                    token,
                                                    [TokenKind.RPAR]))
            return FunctionNode(start_node, None, params)
        # The last NamesNode holds only the ')': start == end (inclusive end).
        return FunctionNode(start_node,
                            NamesNode(self.parser_input.pos, self.parser_input.pos, [token]),
                            params)

    def parse_parameters(self):
        """
        Parse a self.sep-separated parameter list; stops on ')' (left as the
        current token) or when no further value can be read.

        :return: a list of FunctionParameter, or None on unexpected EOF
        """
        nodes = []
        while True:
            param_value = self.parse_parameter_value()
            if not param_value:
                break
            function_parameter = FunctionParameter(param_value)
            nodes.append(function_parameter)
            token = self.parser_input.token
            if token.type == TokenKind.EOF:
                self.add_error(UnexpectedEof(f"Unexpected EOF while parsing parameters"))
                return None
            if token.type == TokenKind.RPAR:
                break
            if token.value == self.sep:
                # Keep the separator tokens so the original source can be
                # reconstructed verbatim later.
                sep_pos = self.parser_input.pos
                self.parser_input.next_token()
                function_parameter.add_sep(sep_pos,
                                           self.parser_input.pos - 1,
                                           self.parser_input.tokens[sep_pos: self.parser_input.pos])
        return nodes

    def parse_parameter_value(self):
        """
        Parse one parameter value: either a nested function call or a plain
        run of tokens up to the next separator / ')'.
        """
        # check if the parameter is a function; errors from this speculative
        # parse are suppressed via record_errors
        start_pos = self.parser_input.pos
        self.record_errors = False
        func = self.parse_function()
        self.record_errors = True
        if func:
            self.parser_input.next_token()
            return func
        # otherwise, rewind and eat tokens until the separator or ')'
        self.parser_input.seek(start_pos)
        self.record_errors = True
        tokens = []
        while True:
            token = self.parser_input.token
            # if token is None:
            #     break
            if token.value == self.sep or token.type == TokenKind.RPAR:
                break
            tokens.append(token)
            # keep whitespace tokens: they are part of the parameter's text
            if not self.parser_input.next_token(skip_whitespace=False):
                break
        return NamesNode(start_pos, self.parser_input.pos - 1, tokens) if len(tokens) else None

    def to_source_code_node(self, function_node: FunctionNode):
        """
        Turn a FunctionNode into source code nodes -- one per combination of
        recognized concepts -- and validate each combination as Python.
        """
        python_parser = PythonWithConceptsParser()
        if len(function_node.parameters) == 0:
            # validate the source: no parameters, so just 'name(' + ')'
            nodes_to_parse = [function_node.first.to_unrecognized(), function_node.last.to_unrecognized()]
            python_parsing_res = python_parser.parse_nodes(self.context, nodes_to_parse)
            python_node = python_parsing_res.body.body if python_parsing_res.status else None
            return [SourceCodeNode(start=function_node.first.start,
                                   end=function_node.last.end,
                                   tokens=function_node.first.tokens + function_node.last.tokens,
                                   python_node=python_node,
                                   return_value=python_parsing_res)]

        def update_source_code_node(scn, nodes, sep):
            # Append a node, or a sequence of nodes, plus the parameter's
            # trailing separator (if any) to the source code node.
            if hasattr(nodes, "__iter__"):
                for n in nodes:
                    scn.add_node(n)
            else:
                scn.add_node(nodes)
            if sep:
                scn.add_node(sep.to_unrecognized())

        res = [SourceCodeWithConceptNode(function_node.first.to_unrecognized(), function_node.last.to_unrecognized())]
        for param in function_node.parameters:
            if isinstance(param.value, NamesNode):
                unrecognized = param.value.to_unrecognized()
                # try to recognize concepts
                nodes_sequences = get_lexer_nodes_from_unrecognized(self.context,
                                                                    unrecognized,
                                                                    PARSERS)
            else:
                # the parameter is also a function
                nodes_sequences = self.to_source_code_node(param.value)
            if self.longest_concepts_only:
                nodes_sequences = self.get_longest_concepts(nodes_sequences)
            if nodes_sequences is None:
                # no concept found
                # NOTE(review): 'unrecognized' is only assigned in the
                # NamesNode branch above; if a nested-function parameter ever
                # reaches this branch, this uses an unbound or stale value --
                # confirm that to_source_code_node never returns None.
                for source_code_node in res:
                    update_source_code_node(source_code_node, unrecognized, param.separator)
            elif len(nodes_sequences) == 1:
                # only one result
                # It is the same code than when there are multiple results
                # But here, we save the creation of the tmp_res object (not sure it worth it)
                for source_code_node in res:
                    update_source_code_node(source_code_node, nodes_sequences[0], param.separator)
            else:
                # multiple result, make the cartesian product
                tmp_res = []
                for source_code_node in res:
                    instances = get_n_clones(source_code_node, len(nodes_sequences))
                    tmp_res.extend(instances)
                    for instance, node_sequence in zip(instances, nodes_sequences):
                        update_source_code_node(instance, node_sequence, param.separator)
                res = tmp_res
        # check if it is a valid source code
        for source_code_node in res:
            source_code_node.fix_all_pos()
            source_code_node.pseudo_fix_source()
            python_parsing_res = python_parser.parse_nodes(self.context, source_code_node.get_all_nodes())
            if python_parsing_res.status:
                source_code_node.python_node = python_parsing_res.body.body
                source_code_node.return_value = python_parsing_res
                # make sure that concepts found can be evaluated
                errors = []
                for c in source_code_node.python_node.concepts.values():
                    update_compiled(self.context, c, errors)
        return res

    @staticmethod
    def get_longest_concepts(nodes_sequences):
        """
        The longest sequences are the ones that have the less number of concepts
        For example
        'twenty one' resolves to
        [c:twenty one:]
        [c:twenty:, c:one:]
        [c:twenty one:] has only one concept, so it's the longest one (two tokens against one token twice)
        :param nodes_sequences:
        :return:
        """
        if nodes_sequences is None:
            return None
        res = []
        min_len = -1
        for current_sequence in nodes_sequences:
            # awful hack to remove when NodeSequence and ConceptSequence will be implemented
            current_len = len(current_sequence) if hasattr(current_sequence, "__len__") else 1
            if len(res) == 0:
                # first sequence seen: it sets the initial minimum
                res.append(current_sequence)
                min_len = current_len
            elif current_len == min_len:
                # tie: keep all sequences of minimal length
                res.append(current_sequence)
            elif current_len < min_len:
                # new minimum: discard everything collected so far
                res.clear()
                res.append(current_sequence)
                min_len = current_len
        return res