Sheerka-Old/src/parsers/FunctionParser.py

from dataclasses import dataclass

from core.builtin_concepts import BuiltinConcepts
from core.builtin_helpers import get_lexer_nodes_from_unrecognized, update_compiled
from core.concept import Concept
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import TokenKind
from core.utils import get_n_clones
from parsers.BaseNodeParser import SourceCodeNode, SourceCodeWithConceptNode, UnrecognizedTokensNode
from parsers.BaseParser import BaseParser, UnexpectedTokenParsingError, UnexpectedEofParsingError, Node
from parsers.BnfNodeParser import BnfNodeParser
from parsers.PythonWithConceptsParser import PythonWithConceptsParser
from parsers.RuleParser import RuleParser
from parsers.SequenceNodeParser import SequenceNodeParser
from parsers.SyaNodeParser import SyaNodeParser
from parsers.expressions import NameExprNode

# Names of the parsers handed to get_lexer_nodes_from_unrecognized()
# when FunctionParser tries to recognize concepts inside a parameter.
PARSERS = [
    RuleParser.NAME,
    SequenceNodeParser.NAME,
    BnfNodeParser.NAME,
    SyaNodeParser.NAME,
]


@dataclass
class FunctionParserNode(Node):
    """Base class of the nodes defined in this module (see FunctionNode)."""


@dataclass
class FunctionParameter:
    """
    Result of the parsing of one parameter of a function.
    """
    # what was parsed for the parameter
    value: NameExprNode
    # tokens and position of the separator that follows the value (None when there is none)
    separator: NameExprNode = None

    def add_sep(self, start, end, tokens):
        """
        Record the separator found after the value.

        :param start: position of the first token of the separator
        :param end: position of the last token of the separator
        :param tokens: tokens of the separator
        """
        self.separator = NameExprNode(start, end, tokens)

    def value_to_unrecognized(self):
        """
        Build an UnrecognizedTokensNode from the value.

        :return: what fix_source() returns for the new node
        """
        val = self.value
        node = UnrecognizedTokensNode(val.start, val.end, val.tokens)
        return node.fix_source()

    def separator_to_unrecognized(self):
        """
        Build an UnrecognizedTokensNode from the separator.

        :return: None when there is no separator,
        otherwise what fix_source() returns for the new node
        """
        sep = self.separator
        if sep is not None:
            return UnrecognizedTokensNode(sep.start, sep.end, sep.tokens).fix_source()
        return None


@dataclass
class FunctionNode(FunctionParserNode):
    """
    Result of the parsing of a function call.
    """
    # beginning of the function: FunctionParser.parse_function() stores here
    # the name of the function together with the left parenthesis
    first: NameExprNode
    # last part of the function (the trailing parenthesis);
    # parse_function() leaves it to None when the parsing fails
    last: NameExprNode
    # list of FunctionParameter; parse_function() may set it to None when the parsing fails
    parameters: list


class FN(FunctionNode):
    """
    Test class only
    It matches with FunctionNode but with fewer constraints

    Thereby,
    FN("first", "last", ["param1, ", ...]) can be compared to
    FunctionNode(NameExprNode("first"),
                 NameExprNode("last"),
                 [FunctionParameter(NameExprNode("param1"), NameExprNode(", ")), ...])
    (the strings are compared with the `value` attribute of the NameExprNode)

    Note that a parameter can easily be defined with a single string
        * "param" -> ("param", None), which matches FunctionParameter(NameExprNode("param"), None)
        * "param, " -> ("param", ", "), which matches
          FunctionParameter(NameExprNode("param"), NameExprNode(", "))
    For more complicated situations, you can use a tuple (value, sep) to define the value part and the separator part
    """

    def __init__(self, first, last, parameters):
        """
        :param first: expected value of FunctionNode.first
        :param last: expected value of FunctionNode.last
        :param parameters: iterable of strings or (value, sep) tuples, see the class docstring
        """
        self.first = first
        self.last = last
        self.parameters = []
        for param in parameters:
            if isinstance(param, tuple):
                # already in the (value, sep) form
                self.parameters.append(param)
            elif isinstance(param, str) and (pos := param.find(",")) != -1:
                # everything from the first comma onwards is the separator
                self.parameters.append((param[:pos], param[pos:]))
            else:
                self.parameters.append((param, None))

    def __eq__(self, other):
        if self is other:
            return True

        if isinstance(other, FN):
            return self.first == other.first and self.last == other.last and self.parameters == other.parameters

        if isinstance(other, FunctionNode):
            # a FunctionNode coming from a failed parsing has no `last`
            # and may have no `parameters`: compare instead of crashing
            other_last = other.last.value if other.last is not None else None
            if self.first != other.first.value or self.last != other_last:
                return False
            if other.parameters is None or len(self.parameters) != len(other.parameters):
                return False
            for self_parameter, other_parameter in zip(self.parameters, other.parameters):
                # a string is compared with the text of the node,
                # anything else (a nested FN for instance) with the node itself
                value = other_parameter.value.value if isinstance(self_parameter[0], str) else other_parameter.value
                sep = other_parameter.separator.value if other_parameter.separator else None
                if self_parameter[0] != value or self_parameter[1] != sep:
                    return False

            return True

        return False

    def __hash__(self):
        # self.parameters is a list, which is not hashable: hash a tuple snapshot of it
        return hash((self.first, self.last, tuple(self.parameters)))


class FunctionParser(BaseParser):
    """
    Parser for function calls such as func(x, y, z),
    where x, y and z can be source code, concepts or other functions.
    It produces a SourceCodeNode or a SourceCodeWithConceptNode.
    """

    def __init__(self, sep=",", longest_concepts_only=True, **kwargs):
        """
        Create the parser.

        :param sep: value of the token that separates two parameters
        :param longest_concepts_only: When multiple concepts are found, only keep the longest one
        so 'twenty one' will resolve to [[c:twenty one:]], not [[c:twenty one:], [c:twenty:, c:one:]]
        :param kwargs: accepted, but not read by this constructor
        """
        super().__init__("Function", 55)
        # add_error() drops the errors while this flag is False
        self.record_errors = True
        self.longest_concepts_only = longest_concepts_only
        self.sep = sep

    def add_error(self, error, next_token=True):
        if not self.record_errors:
            return

        return super().add_error(error, next_token)

    def parse(self, context, parser_input: ParserInput):
        """
        Parse the input as one single function call.

        :param context: execution context (gives access to sheerka and to the log)
        :param parser_input: input to parse
        :return: None when parser_input is not a ParserInput. Otherwise what sheerka.ret()
        returns: one value, or a list of them when to_source_code_node() does not yield
        exactly one source code node. The status given to sheerka.ret() is False when the
        input is empty, when the parser cannot be reset or when a parsing error was recorded.
        """
        if not isinstance(parser_input, ParserInput):
            return None

        context.log(f"Parsing '{parser_input}' with FunctionParser", self.name)
        sheerka = context.sheerka

        if parser_input.is_empty():
            empty = sheerka.new(BuiltinConcepts.IS_EMPTY)
            return sheerka.ret(self.name, False, empty)

        if not self.reset_parser(context, parser_input):
            failure = context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)
            return self.sheerka.ret(self.name, False, failure)

        self.parser_input.next_token()
        node = self.parse_function()

        # nothing is allowed after the function
        if self.parser_input.next_token():
            trailing = UnexpectedTokenParsingError("Only one function supported",
                                                   self.parser_input.token,
                                                   [TokenKind.EOF])
            self.add_error(trailing)

        if self.has_error:
            if node is not None:
                # the input started like a function, but the function is not valid
                body = context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)
            else:
                # the input does not even look like a function
                body = context.sheerka.new(BuiltinConcepts.NOT_FOR_ME,
                                           body=parser_input.as_text(),
                                           reason=self.error_sink)
            return self.sheerka.ret(self.name, False, body)

        results = []
        for scn in self.to_source_code_node(node):
            value = self.get_return_value_body(context.sheerka,
                                               self.parser_input.as_text(),
                                               scn,
                                               scn)
            # the status is True only when a python node is attached to the source code node
            results.append(self.sheerka.ret(self.name, scn.python_node is not None, value))

        if len(results) == 1:
            return results[0]
        return results

    def parse_function(self):
        """
        Parse `name(parameters)`, starting at the current token.

        Errors are reported with add_error().

        :return: None when the input does not start with an identifier followed by a
        left parenthesis, otherwise a FunctionNode. When the parsing fails after the
        left parenthesis, the `last` attribute of the FunctionNode is None
        (and its `parameters` may be None as well).
        """
        start = self.parser_input.pos
        token = self.parser_input.token
        if token.type != TokenKind.IDENTIFIER:
            self.add_error(UnexpectedTokenParsingError(f"{token.repr_value} is not a identifier",
                                                       token,
                                                       [TokenKind.IDENTIFIER]))
            return None

        if not self.parser_input.next_token():
            self.add_error(UnexpectedEofParsingError("Unexpected EOF while parsing left parenthesis"))
            return None

        token = self.parser_input.token
        if token.type != TokenKind.LPAR:
            self.add_error(UnexpectedTokenParsingError(f"{token.repr_value} is not a left parenthesis",
                                                       token,
                                                       [TokenKind.LPAR]))
            return None

        # next_token() may have skipped tokens (whitespace) between the name and the
        # parenthesis, so use the real position of the parenthesis rather than start + 1.
        # When the parenthesis directly follows the name, both are the same.
        lpar_pos = self.parser_input.pos
        start_node = NameExprNode(start, lpar_pos, self.parser_input.tokens[start:lpar_pos + 1])
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEofParsingError("Unexpected EOF after left parenthesis"))
            return FunctionNode(start_node, None, None)

        params = self.parse_parameters()
        if self.has_error:
            return FunctionNode(start_node, None, params)

        token = self.parser_input.token
        if not token or token.type != TokenKind.RPAR:
            self.add_error(UnexpectedTokenParsingError("Right parenthesis not found",
                                                       token,
                                                       [TokenKind.RPAR]))
            return FunctionNode(start_node, None, params)

        rpar_pos = self.parser_input.pos
        return FunctionNode(start_node,
                            NameExprNode(rpar_pos, rpar_pos, [token]),
                            params)

    def parse_parameters(self):
        """
        Parse the parameters of a function, starting at the current token.

        The parsing stops on the right parenthesis (which is not consumed),
        when no value can be read, or when the input ends right after a separator.

        :return: list of FunctionParameter (possibly empty), or None when the end
        of the input is found right after the value of a parameter
        (the error is reported with add_error())
        """
        nodes = []
        while True:
            param_value = self.parse_parameter_value()
            if not param_value:
                break

            function_parameter = FunctionParameter(param_value)
            nodes.append(function_parameter)

            token = self.parser_input.token
            # the other methods of this parser allow for a None token at the end of
            # the input, so treat it like an EOF token instead of failing on token.type
            if token is None or token.type == TokenKind.EOF:
                self.add_error(UnexpectedEofParsingError("Unexpected EOF while parsing parameters"))
                return None

            if token.type == TokenKind.RPAR:
                break

            if token.value == self.sep:
                sep_pos = self.parser_input.pos
                has_next = self.parser_input.next_token()  # it's before add_sep() to capture trailing whitespace
                function_parameter.add_sep(sep_pos,
                                           self.parser_input.pos - 1,
                                           self.parser_input.tokens[sep_pos: self.parser_input.pos])
                if not has_next:
                    break

        return nodes

    def parse_parameter_value(self):
        """
        Parse the value of one parameter, starting at the current token.

        A nested function is tried first. When there is no complete function,
        the raw tokens are consumed up to the next separator or right parenthesis.

        :return: a FunctionNode when the parameter is a function, a NameExprNode
        for raw tokens, or None when no token can be consumed
        """
        start_pos = self.parser_input.pos

        # check if the parameter is a function
        # It is only an attempt, so the errors are not recorded. The flag gets its previous
        # value back (and not simply True) because this method is re-entered through
        # parse_function(): an inner attempt must not switch the recording back on
        # for the attempt that is still running around it.
        previous_record_errors = self.record_errors
        self.record_errors = False
        try:
            func = self.parse_function()
        finally:
            self.record_errors = previous_record_errors

        # As the errors are muted, a function that cannot be fully parsed comes back as a
        # FunctionNode without `last`. It is not a function: fall back to the raw tokens
        # rather than returning a node that to_source_code_node() cannot use.
        if func and func.last is not None:
            self.parser_input.next_token()
            return func

        # otherwise, eat until the separator or RPAR
        self.parser_input.seek(start_pos)
        tokens = []
        while True:
            token = self.parser_input.token
            if token is None:
                break

            if token.value == self.sep or token.type == TokenKind.RPAR:
                break

            tokens.append(token)
            # whitespace is kept: it is part of the value
            if not self.parser_input.next_token(skip_whitespace=False):
                break

        return NameExprNode(start_pos, self.parser_input.pos - 1, tokens) if tokens else None

    def to_source_code_node(self, function_node: FunctionNode):
        """
        Transform a FunctionNode into a list of source code nodes.

        * Without parameters, `first` and `last` are given to PythonWithConceptsParser
          and a list with one single SourceCodeNode is returned
          (its python_node is None when the python parsing fails)
        * With parameters, the list starts with one SourceCodeWithConceptNode.
          Each parameter is resolved either with get_lexer_nodes_from_unrecognized()
          or, when it is itself a function, with a recursive call.
          When a parameter resolves to several sequences, every node of the list is
          cloned once per sequence, so the list can grow with each parameter.

        :param function_node: function to transform; its `last` and `parameters` are read
        without a None check
        :return: list of SourceCodeNode / SourceCodeWithConceptNode
        """
        python_parser = PythonWithConceptsParser()

        if len(function_node.parameters) == 0:
            # validate the source
            nodes_to_parse = [function_node.first.to_unrecognized(), function_node.last.to_unrecognized()]
            python_parsing_res = python_parser.parse_nodes(self.context, nodes_to_parse)
            python_node = python_parsing_res.body.body if python_parsing_res.status else None

            # return_value is set even when the python parsing failed
            return [SourceCodeNode(start=function_node.first.start,
                                   end=function_node.last.end,
                                   tokens=function_node.first.tokens + function_node.last.tokens,
                                   python_node=python_node,
                                   return_value=python_parsing_res)]

        def update_source_code_node(scn, nodes, sep):
            # add one node, or every node of an iterable, to scn
            # then add the separator (when there is one) as unrecognized tokens
            if hasattr(nodes, "__iter__"):
                for n in nodes:
                    scn.add_node(n)
            else:
                scn.add_node(nodes)

            if sep:
                scn.add_node(sep.to_unrecognized())

        res = [SourceCodeWithConceptNode(function_node.first.to_unrecognized(), function_node.last.to_unrecognized())]

        # try to recognize every parameter, one by one
        for param in function_node.parameters:
            if isinstance(param.value, NameExprNode):
                # try to recognize concepts
                unrecognized = param.value.to_unrecognized()
                nodes_sequences = get_lexer_nodes_from_unrecognized(self.context,
                                                                    unrecognized,
                                                                    PARSERS)
            else:
                # the parameter is also a function
                nodes_sequences = self.to_source_code_node(param.value)

            if self.longest_concepts_only:
                nodes_sequences = self.get_longest_concepts(nodes_sequences)

            if nodes_sequences is None:
                # no concept found
                # `unrecognized` is only assigned in the NameExprNode branch above;
                # the recursive branch returns a list, so it does not lead here
                for source_code_node in res:
                    update_source_code_node(source_code_node, unrecognized, param.separator)

            elif len(nodes_sequences) == 1:
                # only one result
                # It is the same code than when there are multiple results
                # But here, we save the creation of the tmp_res object (not sure it worth it)
                for source_code_node in res:
                    update_source_code_node(source_code_node, nodes_sequences[0], param.separator)
            else:
                # multiple result, make the cartesian product
                # NOTE(review): an empty nodes_sequences also lands here and empties res - confirm
                # that get_lexer_nodes_from_unrecognized() never returns an empty list
                tmp_res = []
                for source_code_node in res:
                    instances = get_n_clones(source_code_node, len(nodes_sequences))
                    tmp_res.extend(instances)
                    for instance, node_sequence in zip(instances, nodes_sequences):
                        update_source_code_node(instance, node_sequence, param.separator)
                res = tmp_res

        # check if it is a valid source code
        for source_code_node in res:
            source_code_node.fix_all_pos()
            source_code_node.pseudo_fix_source()

            python_parsing_res = python_parser.parse_nodes(self.context, source_code_node.get_all_nodes())
            if python_parsing_res.status:
                # python_node and return_value are only set when the python parsing succeeds
                source_code_node.python_node = python_parsing_res.body.body
                source_code_node.return_value = python_parsing_res

                # make sure that concepts found can be evaluated
                # NOTE(review): `errors` is filled by update_compiled() but never read afterwards
                errors = []
                for c in [c for c in source_code_node.python_node.objects.values() if isinstance(c, Concept)]:
                    update_compiled(self.context, c, errors)

        return res

    @staticmethod
    def get_longest_concepts(nodes_sequences):
        """
        The longest sequences are the ones that have the less number of concepts
        For example
        'twenty one' resolves to
            [c:twenty one:]
            [c:twenty:, c:one:]
        [c:twenty one:] has only one concept, so it's the longest one (two tokens against one token twice)
        :param nodes_sequences:
        :return:
        """
        if nodes_sequences is None:
            return None

        res = []
        min_len = -1
        for current_sequence in nodes_sequences:
            # awful hack to remove when NodeSequence and ConceptSequence will be implemented
            current_len = len(current_sequence) if hasattr(current_sequence, "__len__") else 1
            if len(res) == 0:
                res.append(current_sequence)
                min_len = current_len
            elif current_len == min_len:
                res.append(current_sequence)
            elif current_len < min_len:
                res.clear()
                res.append(current_sequence)
                min_len = current_len

        return res