Files
Sheerka-Old/src/parsers/ExactConceptParser.py
T
2020-05-25 18:09:12 +02:00

182 lines
6.6 KiB
Python

import itertools
import logging

import core.builtin_helpers
from core.builtin_concepts import ReturnValueConcept, BuiltinConcepts
from core.concept import VARIABLE_PREFIX
from core.sheerka.services.SheerkaExecute import ParserInput
from core.tokenizer import Keywords, TokenKind, LexerError
from core.utils import str_concept
from parsers.BaseParser import BaseParser
class ExactConceptParser(BaseParser):
    """
    Tries to recognize a single concept

    The parser tokenizes the input, then tries every combination of the
    words where subsets of positions are replaced by variable placeholders,
    and asks the sheerka instance whether a concept with that key exists.
    """
    # Hard ceiling on the number of input words: the combination search grows
    # exponentially with the word count, so longer inputs are rejected early.
    # Can be overridden per instance via the max_word_size constructor argument.
    MAX_WORDS_SIZE = 6
def __init__(self, max_word_size=None, **kwargs):
BaseParser.__init__(self, "ExactConcept", 80)
self.max_word_size = max_word_size
def parse(self, context, parser_input: ParserInput):
"""
text can be string, but text can also be an list of tokens
:param context:
:param parser_input:
:return:
"""
context.log(f"Parsing '{parser_input}'", self.name)
sheerka = context.sheerka
try:
parser_input.reset()
words = self.get_words(parser_input)
except LexerError as e:
context.log(f"Error found in tokenizer {e}", self.name)
return sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.ERROR, body=e))
if len(words) > (self.max_word_size or self.MAX_WORDS_SIZE):
context.log(f"Max words reached. Stopping.", self.name)
too_long = sheerka.new(BuiltinConcepts.CONCEPT_TOO_LONG, body=parser_input.as_text())
body = sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input.as_text(), reason=too_long)
return sheerka.ret(self.name, False, body)
already_recognized = [] # keep track of the concepts founds
for combination in self.combinations(words):
concept_key = " ".join(combination)
result = sheerka.new(concept_key) # use new(), not get() because we need a new instance
if sheerka.isinstance(result, BuiltinConcepts.UNKNOWN_CONCEPT):
continue
concepts = result if isinstance(result, list) else [result]
for concept in concepts:
if concept in already_recognized:
context.log(f"Recognized concept {concept} again. Skipping.", self.name)
# example
# if the input is foo a and a concept is defined as foo a
# The will be two matches. One for 'foo a' and 'foo _var_0'
# but it's the same concept foo a
continue
context.log(f"Recognized concept {concept}.", self.name)
# update the properties if needed
for i, token in enumerate(combination):
if token.startswith(VARIABLE_PREFIX):
index = int(token[len(VARIABLE_PREFIX):])
value = words[i]
concept.def_var_by_index(index, str_concept(value) if isinstance(value, tuple) else value)
concept.metadata.need_validation = True
if self.verbose_log.isEnabledFor(logging.DEBUG):
prop_name = concept.metadata.variables[index][0]
context.log(
f"Added variable {index}: {prop_name}='{words[i]}'.",
self.name)
already_recognized.append(concept)
by_name = sheerka.resolve(parser_input.as_text())
core.builtin_helpers.set_is_evaluated(by_name)
recognized = self.merge_concepts(already_recognized, by_name)
if len(recognized) == 0:
ret = sheerka.ret(self.name, False, sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT,
body=parser_input.as_text()))
self.log_result(context, parser_input, ret)
return ret
else:
res = [self.as_return_value(context, parser_input, c) for c in recognized]
if len(res) == 1:
self.log_result(context, parser_input, res[0])
else:
self.log_multiple_results(context, parser_input, res)
return res
@staticmethod
def get_words(parser_input):
res = []
for t in parser_input.as_tokens():
if t.type == TokenKind.EOF:
break
if t.type == TokenKind.NEWLINE or t.type == TokenKind.WHITESPACE:
continue
res.append(t.value.value if isinstance(t.value, Keywords) else t.value)
return res
def combinations(self, iterable):
# combinations('foo', 'bar', 'baz') -->
# ('foo', 'bar', 'baz'),
# ('__var__0', 'bar', 'baz'),
# ('foo', '__var__0', 'baz'),
# ('foo', 'bar', '__var__0'),
# ('__var__0', '__var__1', 'baz'),
# ('__var__0', 'bar', '__var__1'),
# ('foo', '__var__0', '__var__1'),
# ('__var__0', '__var__1', '__var__2')]
pool = tuple(iterable)
n = len(pool)
res = set()
for r in range(0, n + 1):
indices = list(range(r))
res.add(self.get_tuple(pool, indices))
while True:
for i in reversed(range(r)):
if indices[i] != i + n - r:
break
else:
break
indices[i] += 1
for j in range(i + 1, r):
indices[j] = indices[j - 1] + 1
res.add(self.get_tuple(pool, indices))
# remove all result that contains a token concepts
# They are not valid entries, since a token concept MUST be replaced by a variable
filtered = set()
for combination in res:
for entry in combination:
if isinstance(entry, tuple):
break
else:
filtered.add(combination)
return filtered
@staticmethod
def get_tuple(pool, indices):
res = []
vars = {}
k = 0
# init vars
for i in indices:
value = pool[i]
if value not in vars:
vars[pool[i]] = f"{VARIABLE_PREFIX}{k}"
k += 1
# create tuple
for i in range(len(pool)):
value = pool[i]
res.append(vars[value] if value in vars else value)
return tuple(res)
def as_return_value(self, context, parser_input, concept):
return ReturnValueConcept(
self.name,
True,
context.sheerka.new(
BuiltinConcepts.PARSER_RESULT,
parser=self,
source=parser_input.as_text(),
body=concept,
try_parsed=concept))