Commit: Implemented FunctionParser (new file, 407 lines added).
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.builtin_helpers import get_lexer_nodes_from_unrecognized, update_compiled
|
||||
from core.sheerka.services.SheerkaExecute import ParserInput
|
||||
from core.tokenizer import TokenKind, Token
|
||||
from core.utils import get_n_clones
|
||||
from parsers.BaseNodeParser import SourceCodeNode, SourceCodeWithConceptNode, UnrecognizedTokensNode
|
||||
from parsers.BaseParser import BaseParser, UnexpectedTokenErrorNode, UnexpectedEof, Node
|
||||
from parsers.PythonWithConceptsParser import PythonWithConceptsParser
|
||||
|
||||
# No need to check for Python code as the source code node will resolve to python code anyway
# I only look for concepts, so only the concept-producing parsers are listed here
# (used by get_lexer_nodes_from_unrecognized in FunctionParser.to_source_code_node).
PARSERS = ["BnfNode", "SyaNode", "AtomNode"]
|
||||
|
||||
|
||||
@dataclass
class FunctionParserNode(Node):
    """Base class for every node produced by FunctionParser."""
    pass
|
||||
|
||||
|
||||
@dataclass()
class NamesNode(FunctionParserNode):
    """
    A span of consecutive tokens [start, end] holding a name-like fragment
    (function name, raw parameter value, separator, ...).
    """
    start: int  # index of the first token
    end: int  # index of the last token
    tokens: List[Token]

    def __repr__(self):
        # Bug fix: previously printed "NameNode(...)" although the class is
        # named NamesNode, which made debug output misleading.
        return f"NamesNode('{self.str_value()}')"

    def str_value(self):
        """Concatenate the tokens' string values; None when tokens is None."""
        if self.tokens is None:
            return None

        return "".join(t.str_value for t in self.tokens)

    def to_unrecognized(self):
        """Wrap this span into an UnrecognizedTokensNode with fixed source."""
        return UnrecognizedTokensNode(self.start, self.end, self.tokens).fix_source()
|
||||
|
||||
|
||||
@dataclass()
class FunctionParameter:
    """
    Result of parsing a single function parameter: the parsed value plus the
    separator (if any) that followed it.
    """
    value: NamesNode  # value parsed (a NamesNode, or a nested FunctionNode)
    separator: NamesNode = None  # holds the value and the position of the separator

    def add_sep(self, start, end, tokens):
        """Record the separator span that followed this parameter."""
        self.separator = NamesNode(start, end, tokens)

    def value_to_unrecognized(self):
        """Convert the value span into an UnrecognizedTokensNode."""
        # Delegate to NamesNode.to_unrecognized instead of re-implementing the
        # same UnrecognizedTokensNode construction inline (consistency).
        return self.value.to_unrecognized()

    def separator_to_unrecognized(self):
        """Convert the separator span, or None when there is no separator."""
        if self.separator is None:
            return None
        return self.separator.to_unrecognized()
|
||||
|
||||
|
||||
@dataclass
class FunctionNode(FunctionParserNode):
    """Parsed function call: leading name + '(', trailing ')', and parameters."""
    first: NamesNode  # beginning of the function (it should represent the name of the function)
    last: NamesNode  # last part of the function (it should be the trailing parenthesis)
    parameters: list  # list of FunctionParameter (may be None on a partial parse)
|
||||
|
||||
|
||||
class FN(FunctionNode):
    """
    Test class only
    It matches with FunctionNode but with less constraints

    Thereby,
    FN("first", "last", ["param1," ...]) can be compared to
    FunctionNode(NamesNode("first"), NamesNode("second"), [FunctionParameter(NamesNodes("param1"), NamesNodes(", ")])

    Note that FunctionParameter can easily be defined with a single string
    * "param" -> FunctionParameter(NamesNode("param"), None)
    * "param, " -> FunctionParameter(NamesNode("param"), NamesNode(", "))
    For more complicated situations, you can use a tuple (value, sep) to define the value part and the separator part
    """

    def __init__(self, first, last, parameters):
        # Intentionally does not call super().__init__: FN stores plain
        # strings/tuples instead of the NamesNode instances FunctionNode uses.
        self.first = first
        self.last = last
        self.parameters = []
        for param in parameters:
            if isinstance(param, tuple):
                # explicit (value, separator) pair
                self.parameters.append(param)
            elif isinstance(param, str) and (pos := param.find(",")) != -1:
                # "param, " -> ("param", ", ")
                self.parameters.append((param[:pos], param[pos:]))
            else:
                # bare value, no separator
                self.parameters.append((param, None))

    def __eq__(self, other):
        """Compare against another FN, or structurally against a FunctionNode."""
        if id(self) == id(other):
            return True

        if isinstance(other, FN):
            return self.first == other.first and self.last == other.last and self.parameters == other.parameters

        if isinstance(other, FunctionNode):
            if self.first != other.first.str_value() or self.last != other.last.str_value():
                return False
            if len(self.parameters) != len(other.parameters):
                return False
            for self_parameter, other_parameter in zip(self.parameters, other.parameters):
                # When the expected value is a string, compare against the
                # FunctionParameter's string form; otherwise compare raw values.
                value = other_parameter.value.str_value() if isinstance(self_parameter[0],
                                                                        str) else other_parameter.value
                sep = other_parameter.separator.str_value() if other_parameter.separator else None
                if self_parameter[0] != value or self_parameter[1] != sep:
                    return False

            return True

        return False

    def __hash__(self):
        # Bug fix: self.parameters is a list, which is unhashable, so the
        # previous hash((self.first, self.last, self.parameters)) always
        # raised TypeError. Hash an immutable tuple snapshot instead.
        return hash((self.first, self.last, tuple(self.parameters)))
|
||||
|
||||
|
||||
class FunctionParser(BaseParser):
    """
    The parser will be used to parse func(x, y, z)
    where x, y and z can be source code, concepts or other functions
    It will return a SourceCodeNode or SourceCodeNodeWithConcept
    """

    def __init__(self, sep=",", longest_concepts_only=True, **kwargs):
        """
        :param sep: token value separating two parameters (default ",")
        :param longest_concepts_only: When multiples concepts are found, only keep the longest one
            so 'twenty one' will resolve to [[c:twenty one:]], not [[c:twenty one:], [c:twenty:, c:one:]]
        :param kwargs: unused here; accepted for signature compatibility
        """
        super().__init__("Function", 55, True)
        self.sep = sep
        self.longest_concepts_only = longest_concepts_only
        # Toggled off during speculative look-ahead (see parse_parameter_value)
        # so failed attempts do not pollute the error sink.
        self.record_errors = True

    def add_error(self, error, next_token=True):
        """Forward to BaseParser.add_error unless error recording is muted."""
        if not self.record_errors:
            return

        return super().add_error(error, next_token)

    def parse(self, context, parser_input: ParserInput):
        """
        Parse one function call out of parser_input.

        :param context: execution context providing sheerka and logging
        :param parser_input: ParserInput wrapping the token stream
        :return: one sheerka return value, or a list of them when several
            concept combinations are possible; None when parser_input has
            the wrong type
        """

        if not isinstance(parser_input, ParserInput):
            return None

        context.log(f"Parsing '{parser_input}' with FunctionParser", self.name)
        sheerka = context.sheerka

        if parser_input.is_empty():
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.IS_EMPTY))

        # NOTE(review): `self.sheerka` and `context.sheerka` are used
        # interchangeably below -- presumably reset_parser stores
        # context.sheerka on self; confirm they are the same object.
        if not self.reset_parser(context, parser_input):
            return self.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))

        node = self.parse_function()

        # Anything left after the function is an error: only one top-level
        # function call is supported per input.
        if self.parser_input.next_token():
            self.add_error(UnexpectedTokenErrorNode("Only one function supported",
                                                    self.parser_input.token,
                                                    [TokenKind.EOF]))

        if self.has_error:
            if node is None:
                # Nothing was recognized at all: answer NOT_FOR_ME so other
                # parsers may take over.
                body = context.sheerka.new(BuiltinConcepts.NOT_FOR_ME,
                                           body=parser_input.as_text(),
                                           reason=self.error_sink)
            else:
                # Partially parsed: genuine error.
                body = context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)
            return self.sheerka.ret(self.name, False, body)

        # One FunctionNode may expand into several candidate source-code
        # nodes (one per concept combination).
        source_code_nodes = self.to_source_code_node(node)

        res = []
        for source_code_node in source_code_nodes:
            value = self.get_return_value_body(context.sheerka,
                                               self.parser_input.as_text(),
                                               source_code_node,
                                               source_code_node)

            # Success only when the candidate resolved to actual python code.
            res.append(self.sheerka.ret(self.name, source_code_node.python_node is not None, value))

        return res[0] if len(res) == 1 else res

    def parse_function(self):
        """
        Parse `identifier ( parameters )` starting at the current token.

        :return: a FunctionNode (possibly partial when an error was recorded),
            or None when not even `identifier (` could be matched
        """

        start = self.parser_input.pos
        token = self.parser_input.token
        if token.type != TokenKind.IDENTIFIER:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a identifier",
                                                    token,
                                                    [TokenKind.IDENTIFIER]))
            return None

        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF while parsing left parenthesis"))
            return None

        token = self.parser_input.token
        if token.type != TokenKind.LPAR:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a left parenthesis",
                                                    token,
                                                    [TokenKind.LPAR]))
            return None

        # start_node covers both the identifier and the left parenthesis.
        start_node = NamesNode(start, start + 1, self.parser_input.tokens[start:start + 2])
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF after left parenthesis"))
            return FunctionNode(start_node, None, None)

        params = self.parse_parameters()
        if self.has_error:
            # Partial node: `last` stays None so callers can tell it is broken.
            return FunctionNode(start_node, None, params)

        token = self.parser_input.token
        if token.type != TokenKind.RPAR:
            self.add_error(UnexpectedTokenErrorNode(f"Right parenthesis not found",
                                                    token,
                                                    [TokenKind.RPAR]))
            return FunctionNode(start_node, None, params)

        return FunctionNode(start_node,
                            NamesNode(self.parser_input.pos, self.parser_input.pos, [token]),
                            params)

    def parse_parameters(self):
        """
        Parse a separator-delimited parameter list up to (excluding) the
        closing parenthesis.

        :return: list of FunctionParameter, or None on unexpected EOF
        """
        nodes = []
        while True:
            param_value = self.parse_parameter_value()
            if not param_value:
                # empty parameter list, or nothing left before ')' / separator
                break

            function_parameter = FunctionParameter(param_value)
            nodes.append(function_parameter)

            token = self.parser_input.token
            if token.type == TokenKind.EOF:
                self.add_error(UnexpectedEof(f"Unexpected EOF while parsing parameters"))
                return None

            if token.type == TokenKind.RPAR:
                break

            if token.value == self.sep:
                # Consume the separator and attach its span (separator plus any
                # following whitespace tokens) to the parameter it terminates.
                sep_pos = self.parser_input.pos
                self.parser_input.next_token()
                function_parameter.add_sep(sep_pos,
                                           self.parser_input.pos - 1,
                                           self.parser_input.tokens[sep_pos: self.parser_input.pos])

        return nodes

    def parse_parameter_value(self):
        """
        Parse one parameter value: either a nested function call or a raw run
        of tokens up to the next separator / right parenthesis.

        :return: FunctionNode, NamesNode, or None when no token was consumed
        """
        # check if the parameter is a function (speculative: errors are muted
        # so a failed attempt leaves no trace in the error sink)
        start_pos = self.parser_input.pos
        self.record_errors = False
        func = self.parse_function()
        self.record_errors = True
        if func:
            self.parser_input.next_token()
            return func

        # otherwise, rewind and eat until LPAR or separator
        self.parser_input.seek(start_pos)
        self.record_errors = True
        tokens = []
        while True:
            token = self.parser_input.token

            if token.value == self.sep or token.type == TokenKind.RPAR:
                break

            tokens.append(token)
            # keep whitespace tokens: they are part of the raw value
            if not self.parser_input.next_token(skip_whitespace=False):
                break

        return NamesNode(start_pos, self.parser_input.pos - 1, tokens) if len(tokens) else None

    def to_source_code_node(self, function_node: FunctionNode):
        """
        Expand a FunctionNode into candidate source-code nodes, resolving each
        parameter either as recognized concepts or as plain source code.

        :param function_node: the fully parsed function
        :return: list of SourceCodeNode / SourceCodeWithConceptNode candidates
        """
        python_parser = PythonWithConceptsParser()

        if len(function_node.parameters) == 0:
            # validate the source: no parameters, just `name()`
            nodes_to_parse = [function_node.first.to_unrecognized(), function_node.last.to_unrecognized()]
            python_parsing_res = python_parser.parse_nodes(self.context, nodes_to_parse)
            python_node = python_parsing_res.body.body if python_parsing_res.status else None

            return [SourceCodeNode(start=function_node.first.start,
                                   end=function_node.last.end,
                                   tokens=function_node.first.tokens + function_node.last.tokens,
                                   python_node=python_node,
                                   return_value=python_parsing_res)]

        def update_source_code_node(scn, nodes, sep):
            # Append one node (or an iterable of nodes) plus the optional
            # separator to the given source-code node.
            if hasattr(nodes, "__iter__"):
                for n in nodes:
                    scn.add_node(n)
            else:
                scn.add_node(nodes)

            if sep:
                scn.add_node(sep.to_unrecognized())

        res = [SourceCodeWithConceptNode(function_node.first.to_unrecognized(), function_node.last.to_unrecognized())]
        for param in function_node.parameters:
            if isinstance(param.value, NamesNode):
                unrecognized = param.value.to_unrecognized()
                # try to recognize concepts
                nodes_sequences = get_lexer_nodes_from_unrecognized(self.context,
                                                                    unrecognized,
                                                                    PARSERS)
            else:
                # the parameter is also a function
                nodes_sequences = self.to_source_code_node(param.value)

            if self.longest_concepts_only:
                nodes_sequences = self.get_longest_concepts(nodes_sequences)

            if nodes_sequences is None:
                # no concept found
                # NOTE(review): `unrecognized` is only bound in the NamesNode
                # branch above; if a nested-function parameter could reach this
                # branch, this raises UnboundLocalError -- confirm.
                for source_code_node in res:
                    update_source_code_node(source_code_node, unrecognized, param.separator)

            elif len(nodes_sequences) == 1:
                # only one result
                # It is the same code than when there are multiple results
                # But here, we save the creation of the tmp_res object (not sure it worth it)
                for source_code_node in res:
                    update_source_code_node(source_code_node, nodes_sequences[0], param.separator)
            else:
                # multiple result, make the cartesian product
                tmp_res = []
                for source_code_node in res:
                    instances = get_n_clones(source_code_node, len(nodes_sequences))
                    tmp_res.extend(instances)
                    for instance, node_sequence in zip(instances, nodes_sequences):
                        update_source_code_node(instance, node_sequence, param.separator)
                res = tmp_res

        # check if it is a valid source code
        for source_code_node in res:
            source_code_node.fix_all_pos()
            source_code_node.pseudo_fix_source()

            python_parsing_res = python_parser.parse_nodes(self.context, source_code_node.get_all_nodes())
            if python_parsing_res.status:
                source_code_node.python_node = python_parsing_res.body.body
                source_code_node.return_value = python_parsing_res

                # make sure that concepts found can be evaluated
                errors = []
                for c in source_code_node.python_node.concepts.values():
                    update_compiled(self.context, c, errors)

        return res

    @staticmethod
    def get_longest_concepts(nodes_sequences):
        """
        The longest sequences are the ones that have the less number of concepts
        For example
        'twenty one' resolves to
        [c:twenty one:]
        [c:twenty:, c:one:]
        [c:twenty one:] has only one concept, so it's the longest one (two tokens against one token twice)
        :param nodes_sequences: candidate concept sequences, or None
        :return: the sub-list of sequences of minimal length, or None
        """
        if nodes_sequences is None:
            return None

        res = []
        min_len = -1
        for current_sequence in nodes_sequences:
            # awful hack to remove when NodeSequence and ConceptSequence will be implemented
            current_len = len(current_sequence) if hasattr(current_sequence, "__len__") else 1
            if len(res) == 0:
                # first sequence seen: it defines the initial minimum
                res.append(current_sequence)
                min_len = current_len
            elif current_len == min_len:
                res.append(current_sequence)
            elif current_len < min_len:
                # strictly shorter: restart the result list
                res.clear()
                res.append(current_sequence)
                min_len = current_len

        return res
|
||||
Reference in New Issue
Block a user