Implemented FunctionParser

This commit is contained in:
2020-09-17 14:11:09 +02:00
parent 8a866880bc
commit 177a6b1d5f
40 changed files with 1752 additions and 561 deletions
+407
View File
@@ -0,0 +1,407 @@
from dataclasses import dataclass
from typing import List
from core.builtin_concepts import BuiltinConcepts
from core.builtin_helpers import get_lexer_nodes_from_unrecognized, update_compiled
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import TokenKind, Token
from core.utils import get_n_clones
from parsers.BaseNodeParser import SourceCodeNode, SourceCodeWithConceptNode, UnrecognizedTokensNode
from parsers.BaseParser import BaseParser, UnexpectedTokenErrorNode, UnexpectedEof, Node
from parsers.PythonWithConceptsParser import PythonWithConceptsParser
# No need to check for Python code as the source code node will resolve to python code anyway
# I only look for concepts, so only the concept-producing parsers are listed here.
PARSERS = ["BnfNode", "SyaNode", "AtomNode"]
@dataclass
class FunctionParserNode(Node):
    """Common base class for every node produced by the FunctionParser."""
@dataclass()
class NamesNode(FunctionParserNode):
    """A contiguous run of tokens together with its position in the input."""

    start: int  # index of the first token
    end: int  # index of the last token
    tokens: List[Token]

    def __repr__(self):
        # BUG FIX: the repr used to say 'NameNode', a class that does not
        # exist; use the actual class name so debug output is not misleading.
        return f"NamesNode('{self.str_value()}')"

    def str_value(self):
        """Concatenate the string values of the tokens, or None when tokens is None."""
        if self.tokens is None:
            return None
        return "".join([t.str_value for t in self.tokens])

    def to_unrecognized(self):
        """Convert back to an UnrecognizedTokensNode with its source fixed."""
        return UnrecognizedTokensNode(self.start, self.end, self.tokens).fix_source()
@dataclass()
class FunctionParameter:
    """
    Class that represents the result of parsing a single parameter.
    """

    value: NamesNode  # parsed value
    separator: NamesNode = None  # holds the value and the position of the separator

    def add_sep(self, start, end, tokens):
        """Record the separator tokens that followed this parameter."""
        self.separator = NamesNode(start, end, tokens)

    def value_to_unrecognized(self):
        # Delegate to NamesNode.to_unrecognized instead of duplicating its
        # conversion logic here (kept in one place for consistency).
        return self.value.to_unrecognized()

    def separator_to_unrecognized(self):
        # Same delegation; None when this parameter had no trailing separator.
        if self.separator is None:
            return None
        return self.separator.to_unrecognized()
@dataclass
class FunctionNode(FunctionParserNode):
    """Parse result for a whole function call expression 'name(params...)'."""
    first: NamesNode  # beginning of the function (it should represent the name of the function)
    last: NamesNode  # last part of the function (it should be the trailing parenthesis)
    parameters: list  # list of FunctionParameter (may be None when parsing stopped early)
class FN(FunctionNode):
    """
    Test class only
    It matches with FunctionNode but with less constraints
    Thereby,
    FN("first", "last", ["param1," ...]) can be compared to
    FunctionNode(NamesNode("first"), NamesNode("second"), [FunctionParameter(NamesNodes("param1"), NamesNodes(", ")])
    Note that FunctionParameter can easily be defined with a single string
    * "param" -> FunctionParameter(NamesNode("param"), None)
    * "param, " -> FunctionParameter(NamesNode("param"), NamesNode(", "))
    For more complicated situations, you can use a tuple (value, sep) to define the value part and the separator part
    """

    def __init__(self, first, last, parameters):
        self.first = first
        self.last = last
        # Normalize every parameter to a (value, separator) pair.
        self.parameters = []
        for param in parameters:
            if isinstance(param, tuple):
                # already an explicit (value, separator) pair
                self.parameters.append(param)
            elif isinstance(param, str) and (pos := param.find(",")) != -1:
                # split "param, " into ("param", ", ")
                self.parameters.append((param[:pos], param[pos:]))
            else:
                self.parameters.append((param, None))

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, FN):
            return self.first == other.first and self.last == other.last and self.parameters == other.parameters
        if isinstance(other, FunctionNode):
            if self.first != other.first.str_value() or self.last != other.last.str_value():
                return False
            if len(self.parameters) != len(other.parameters):
                return False
            for self_parameter, other_parameter in zip(self.parameters, other.parameters):
                # Compare by string value only when the expected side is a string.
                value = other_parameter.value.str_value() if isinstance(self_parameter[0],
                                                                        str) else other_parameter.value
                sep = other_parameter.separator.str_value() if other_parameter.separator else None
                if self_parameter[0] != value or self_parameter[1] != sep:
                    return False
            return True
        return False

    def __hash__(self):
        # BUG FIX: self.parameters is a list, which is unhashable, so hashing
        # an FN instance used to raise TypeError. Convert it to a tuple first.
        return hash((self.first, self.last, tuple(self.parameters)))
class FunctionParser(BaseParser):
    """
    The parser will be used to parse func(x, y, z)
    where x, y and z can be source code, concepts or other functions
    It will return a SourceCodeNode or SourceCodeNodeWithConcept
    """

    def __init__(self, sep=",", longest_concepts_only=True, **kwargs):
        """
        :param sep: token value used as the parameter separator
        :param longest_concepts_only: When multiples concepts are found, only keep the longest one
        so 'twenty one' will resolve to [[c:twenty one:]], not [[c:twenty one:], [c:twenty:, c:one:]]
        :param kwargs: unused here; accepted so callers can pass a common kwargs set
        """
        super().__init__("Function", 55, True)
        self.sep = sep
        self.longest_concepts_only = longest_concepts_only
        # When False, add_error() is a no-op; toggled off while speculatively
        # probing for nested functions (see parse_parameter_value).
        self.record_errors = True

    def add_error(self, error, next_token=True):
        # Errors are dropped while record_errors is False (speculative parse).
        if not self.record_errors:
            return
        return super().add_error(error, next_token)

    def parse(self, context, parser_input: ParserInput):
        """
        Parse the whole input as a single function call.

        :param context: execution context (provides sheerka and logging)
        :param parser_input: the tokens to parse
        :return: a sheerka return value, or a list of them when several
                 concept combinations are possible, or None when the input
                 is not a ParserInput
        """
        if not isinstance(parser_input, ParserInput):
            return None
        context.log(f"Parsing '{parser_input}' with FunctionParser", self.name)
        sheerka = context.sheerka
        if parser_input.is_empty():
            return sheerka.ret(self.name,
                               False,
                               sheerka.new(BuiltinConcepts.IS_EMPTY))
        if not self.reset_parser(context, parser_input):
            # NOTE(review): this method mixes self.sheerka and context.sheerka --
            # presumably BaseParser/reset_parser sets self.sheerka from the
            # context; confirm and unify.
            return self.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
        node = self.parse_function()
        # Any token left after the function is an error: a single function
        # call must consume the whole input.
        if self.parser_input.next_token():
            self.add_error(UnexpectedTokenErrorNode("Only one function supported",
                                                    self.parser_input.token,
                                                    [TokenKind.EOF]))
        if self.has_error:
            if node is None:
                # Not even 'name(' was matched: the input is not for this parser.
                body = context.sheerka.new(BuiltinConcepts.NOT_FOR_ME,
                                           body=parser_input.as_text(),
                                           reason=self.error_sink)
            else:
                # A function was partially parsed, so this is a real error.
                body = context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)
            return self.sheerka.ret(self.name, False, body)
        # One source code node per possible concept combination.
        source_code_nodes = self.to_source_code_node(node)
        res = []
        for source_code_node in source_code_nodes:
            value = self.get_return_value_body(context.sheerka,
                                               self.parser_input.as_text(),
                                               source_code_node,
                                               source_code_node)
            # Status is success only when the node resolved to valid Python.
            res.append(self.sheerka.ret(self.name, source_code_node.python_node is not None, value))
        return res[0] if len(res) == 1 else res

    def parse_function(self):
        """
        Parse 'identifier ( parameters )' starting at the current token.

        :return: a FunctionNode (possibly partial when an error was recorded),
                 or None when not even 'identifier (' could be matched
        """
        start = self.parser_input.pos
        token = self.parser_input.token
        if token.type != TokenKind.IDENTIFIER:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a identifier",
                                                    token,
                                                    [TokenKind.IDENTIFIER]))
            return None
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF while parsing left parenthesis"))
            return None
        token = self.parser_input.token
        if token.type != TokenKind.LPAR:
            self.add_error(UnexpectedTokenErrorNode(f"{token.repr_value} is not a left parenthesis",
                                                    token,
                                                    [TokenKind.LPAR]))
            return None
        # start_node covers two tokens: the identifier and the '('
        # (end index appears to be inclusive here -- see the start+1 / start+2 pair).
        start_node = NamesNode(start, start + 1, self.parser_input.tokens[start:start + 2])
        if not self.parser_input.next_token():
            self.add_error(UnexpectedEof(f"Unexpected EOF after left parenthesis"))
            return FunctionNode(start_node, None, None)
        params = self.parse_parameters()
        if self.has_error:
            return FunctionNode(start_node, None, params)
        token = self.parser_input.token
        if token.type != TokenKind.RPAR:
            self.add_error(UnexpectedTokenErrorNode(f"Right parenthesis not found",
                                                    token,
                                                    [TokenKind.RPAR]))
            return FunctionNode(start_node, None, params)
        # The last NamesNode holds only the ')': start == end (inclusive end).
        return FunctionNode(start_node,
                            NamesNode(self.parser_input.pos, self.parser_input.pos, [token]),
                            params)

    def parse_parameters(self):
        """
        Parse a self.sep-separated parameter list; stops on ')' (left as the
        current token) or when no further value can be read.

        :return: a list of FunctionParameter, or None on unexpected EOF
        """
        nodes = []
        while True:
            param_value = self.parse_parameter_value()
            if not param_value:
                break
            function_parameter = FunctionParameter(param_value)
            nodes.append(function_parameter)
            token = self.parser_input.token
            if token.type == TokenKind.EOF:
                self.add_error(UnexpectedEof(f"Unexpected EOF while parsing parameters"))
                return None
            if token.type == TokenKind.RPAR:
                break
            if token.value == self.sep:
                # Keep the separator tokens so the original source can be
                # reconstructed verbatim later.
                sep_pos = self.parser_input.pos
                self.parser_input.next_token()
                function_parameter.add_sep(sep_pos,
                                           self.parser_input.pos - 1,
                                           self.parser_input.tokens[sep_pos: self.parser_input.pos])
        return nodes

    def parse_parameter_value(self):
        """
        Parse one parameter value: either a nested function call or a plain
        run of tokens up to the next separator / ')'.
        """
        # check if the parameter is a function; errors from this speculative
        # parse are suppressed via record_errors
        start_pos = self.parser_input.pos
        self.record_errors = False
        func = self.parse_function()
        self.record_errors = True
        if func:
            self.parser_input.next_token()
            return func
        # otherwise, rewind and eat tokens until the separator or ')'
        self.parser_input.seek(start_pos)
        self.record_errors = True
        tokens = []
        while True:
            token = self.parser_input.token
            # if token is None:
            #     break
            if token.value == self.sep or token.type == TokenKind.RPAR:
                break
            tokens.append(token)
            # keep whitespace tokens: they are part of the parameter's text
            if not self.parser_input.next_token(skip_whitespace=False):
                break
        return NamesNode(start_pos, self.parser_input.pos - 1, tokens) if len(tokens) else None

    def to_source_code_node(self, function_node: FunctionNode):
        """
        Turn a FunctionNode into source code nodes -- one per combination of
        recognized concepts -- and validate each combination as Python.
        """
        python_parser = PythonWithConceptsParser()
        if len(function_node.parameters) == 0:
            # validate the source: no parameters, so just 'name(' + ')'
            nodes_to_parse = [function_node.first.to_unrecognized(), function_node.last.to_unrecognized()]
            python_parsing_res = python_parser.parse_nodes(self.context, nodes_to_parse)
            python_node = python_parsing_res.body.body if python_parsing_res.status else None
            return [SourceCodeNode(start=function_node.first.start,
                                   end=function_node.last.end,
                                   tokens=function_node.first.tokens + function_node.last.tokens,
                                   python_node=python_node,
                                   return_value=python_parsing_res)]

        def update_source_code_node(scn, nodes, sep):
            # Append a node, or a sequence of nodes, plus the parameter's
            # trailing separator (if any) to the source code node.
            if hasattr(nodes, "__iter__"):
                for n in nodes:
                    scn.add_node(n)
            else:
                scn.add_node(nodes)
            if sep:
                scn.add_node(sep.to_unrecognized())

        res = [SourceCodeWithConceptNode(function_node.first.to_unrecognized(), function_node.last.to_unrecognized())]
        for param in function_node.parameters:
            if isinstance(param.value, NamesNode):
                unrecognized = param.value.to_unrecognized()
                # try to recognize concepts
                nodes_sequences = get_lexer_nodes_from_unrecognized(self.context,
                                                                    unrecognized,
                                                                    PARSERS)
            else:
                # the parameter is also a function
                nodes_sequences = self.to_source_code_node(param.value)
            if self.longest_concepts_only:
                nodes_sequences = self.get_longest_concepts(nodes_sequences)
            if nodes_sequences is None:
                # no concept found
                # NOTE(review): 'unrecognized' is only assigned in the
                # NamesNode branch above; if a nested-function parameter ever
                # reaches this branch, this uses an unbound or stale value --
                # confirm that to_source_code_node never returns None.
                for source_code_node in res:
                    update_source_code_node(source_code_node, unrecognized, param.separator)
            elif len(nodes_sequences) == 1:
                # only one result
                # It is the same code than when there are multiple results
                # But here, we save the creation of the tmp_res object (not sure it worth it)
                for source_code_node in res:
                    update_source_code_node(source_code_node, nodes_sequences[0], param.separator)
            else:
                # multiple result, make the cartesian product
                tmp_res = []
                for source_code_node in res:
                    instances = get_n_clones(source_code_node, len(nodes_sequences))
                    tmp_res.extend(instances)
                    for instance, node_sequence in zip(instances, nodes_sequences):
                        update_source_code_node(instance, node_sequence, param.separator)
                res = tmp_res
        # check if it is a valid source code
        for source_code_node in res:
            source_code_node.fix_all_pos()
            source_code_node.pseudo_fix_source()
            python_parsing_res = python_parser.parse_nodes(self.context, source_code_node.get_all_nodes())
            if python_parsing_res.status:
                source_code_node.python_node = python_parsing_res.body.body
                source_code_node.return_value = python_parsing_res
                # make sure that concepts found can be evaluated
                errors = []
                for c in source_code_node.python_node.concepts.values():
                    update_compiled(self.context, c, errors)
        return res

    @staticmethod
    def get_longest_concepts(nodes_sequences):
        """
        The longest sequences are the ones that have the less number of concepts
        For example
        'twenty one' resolves to
        [c:twenty one:]
        [c:twenty:, c:one:]
        [c:twenty one:] has only one concept, so it's the longest one (two tokens against one token twice)
        :param nodes_sequences:
        :return:
        """
        if nodes_sequences is None:
            return None
        res = []
        min_len = -1
        for current_sequence in nodes_sequences:
            # awful hack to remove when NodeSequence and ConceptSequence will be implemented
            current_len = len(current_sequence) if hasattr(current_sequence, "__len__") else 1
            if len(res) == 0:
                # first sequence seen: it sets the initial minimum
                res.append(current_sequence)
                min_len = current_len
            elif current_len == min_len:
                # tie: keep all sequences of minimal length
                res.append(current_sequence)
            elif current_len < min_len:
                # new minimum: discard everything collected so far
                res.clear()
                res.append(current_sequence)
                min_len = current_len
        return res