Sheerka-Old/src/parsers/BaseNodeParser.py

from collections import namedtuple
from dataclasses import dataclass
from enum import Enum

import core.utils
from core.concept import Concept, ConceptParts
from core.rule import Rule
from core.tokenizer import TokenKind, Token
from parsers.BaseParser import Node, BaseParser, ParsingError

# When True, ConceptNode.__repr__ also prints every entry returned by concept.get_compiled()
DEBUG_COMPILED = True


@dataclass()
class LexerNode(Node):
    """
    Base node describing a span of the tokens list

    A missing source is rebuilt from the tokens right after initialisation.
    """
    start: int  # index, in the tokens list, where the node starts
    end: int  # index, in the tokens list, where the node ends
    tokens: list = None  # tokens covered by the node
    source: str = None  # text of what was parsed

    def __post_init__(self):
        # only compute the text when the caller did not provide it
        if self.source is not None:
            return
        self.source = core.utils.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        if not isinstance(other, LexerNode):
            return False

        same_span = self.start == other.start and self.end == other.end
        return same_span and self.source == other.source and self.tokens == other.tokens

    def fix_source(self, force=True):
        """
        Recomputes the source from the tokens
        :param force: when False, an already defined source is kept
        :return: self
        """
        needs_refresh = force or self.source is None
        if needs_refresh:
            self.source = core.utils.get_text_from_tokens(self.tokens)
        return self

    def clone(self):
        # no default implementation: returns None unless overridden
        pass

    def to_short_str(self):
        """
        Compact representation, must be provided by the subclasses
        """
        raise NotImplementedError

    def get_source_to_parse(self):
        """
        :return: the text of the node
        """
        return self.source


class UnrecognizedTokensNode(LexerNode):
    """
    LexerNode that holds a run of unrecognized tokens

    Tokens are appended one by one with add_token(); the node tracks the covered
    positions and the balance between left and right parenthesis.
    """

    def __init__(self, start, end, tokens):
        super().__init__(start, end, tokens)
        self.is_frozen = False  # TODO: Remove as it seems to now be useless
        self.parenthesis_count = 0

    def freeze(self):
        """
        Forbids further add_token() / pop()
        """
        # TODO: Remove as it seems to now be useless
        self.is_frozen = True

    def reset(self):
        """
        Puts the node back to an empty, unfrozen state (positions set to -1)
        """
        self.tokens.clear()
        self.source = ""
        self.start = -1
        self.end = -1
        self.parenthesis_count = 0
        self.is_frozen = False

    def add_token(self, token, pos):
        """
        Appends a token to the node
        :param token: token to add
        :param pos: position of the token
        :return: self
        :raises Exception: when the node is frozen
        """
        if self.is_frozen:
            raise Exception("The node is frozen")

        # exactly one position was skipped since the last token: add the missing whitespace
        one_position_skipped = self.end != -1 and pos == self.end + 2
        if one_position_skipped:
            previous = self.tokens[-1]
            filler = Token(TokenKind.WHITESPACE, " ", previous.index + 1, previous.line, previous.column + 1)
            self.tokens.append(filler)

        self.tokens.append(token)
        if self.start == -1:
            self.start = pos
        self.end = pos

        if token.type == TokenKind.LPAR:
            self.parenthesis_count += 1
        elif token.type == TokenKind.RPAR:
            self.parenthesis_count -= 1

        return self

    def pop(self, token_kind):
        """
        Removes the last token, only if it is of the requested kind
        :param token_kind: expected kind of the last token
        :raises Exception: when the node is frozen
        """
        if self.is_frozen:
            raise Exception("The node is frozen")

        if len(self.tokens) == 0 or self.tokens[-1].type != token_kind:
            return

        self.tokens.pop()
        if len(self.tokens) == 0:
            self.reset()
            return
        self.end -= 1

    def has_open_paren(self):
        """
        :return: True when more left than right parenthesis were added
        """
        return self.parenthesis_count > 0

    def not_whitespace(self):
        return not self.is_whitespace()

    def is_whitespace(self):
        """
        :return: True when every token is a whitespace or a newline (also True when there is no token)
        """
        blank_kinds = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
        return all(t.type in blank_kinds for t in self.tokens)

    def is_empty(self):
        return len(self.tokens) == 0

    def last_token_type(self):
        """
        :return: type of the last token, None when there is no token
        """
        return self.tokens[-1].type if len(self.tokens) > 0 else None

    def __eq__(self, other):
        # the UTN tester class implements its own comparison
        if isinstance(other, UTN):
            return other == self

        # utnode tuples and other UnrecognizedTokensNode are compared the same way
        if not isinstance(other, (utnode, UnrecognizedTokensNode)):
            return False

        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"UnrecognizedTokensNode(source='{self.source}', start={self.start}, end={self.end})"

    def clone(self):
        """
        :return: a copy of the node, working on its own copy of the tokens list
        """
        duplicate = UnrecognizedTokensNode(self.start, self.end, self.tokens[:])
        duplicate.parenthesis_count = self.parenthesis_count
        duplicate.is_frozen = self.is_frozen
        return duplicate

    def to_short_str(self):
        return f"UTN('{self.source}')"


class RuleNode(LexerNode):
    """
    LexerNode bound to a rule
    """

    def __init__(self, rule, start, end, tokens=None, source=None):
        super().__init__(start, end, tokens, source)
        self.rule = rule
        # keeps the given source, only computes it when it is missing
        self.fix_source(False)

    def __eq__(self, other):
        if self is other:
            return True

        # the RN tester class implements its own comparison
        if isinstance(other, RN):
            return other == self

        if not isinstance(other, RuleNode):
            return False

        same_rule = self.rule == other.rule
        return same_rule and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __hash__(self):
        return hash((self.rule, self.start, self.end, self.source))

    def __repr__(self):
        return f"RuleNode(rule='{self.rule}', source='{self.source}', start={self.start}, end={self.end})"

    def clone(self):
        """
        :return: a new RuleNode sharing the same rule and the same tokens list
        """
        duplicate = RuleNode(self.rule, self.start, self.end, self.tokens, self.source)
        return duplicate

    def to_short_str(self):
        return f'RN({self.rule})'


class ConceptNode(LexerNode):
    """
    Returned by the BnfNodeParser
    It represents a recognized concept
    """

    def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
        super().__init__(start, end, tokens, source)
        self.concept = concept
        self.underlying = underlying
        # keeps the given source, only computes it when it is missing
        self.fix_source(False)

    def __eq__(self, other):
        if self is other:
            return True

        # tester classes implement their own comparison
        if isinstance(other, (CN, CNC)):
            return other == self

        # cnode tuple : concept key + positions + source
        if isinstance(other, cnode):
            same_key = self.concept.key == other.concept_key
            return same_key and \
                   self.start == other.start and \
                   self.end == other.end and \
                   self.source == other.source

        # short_cnode tuple : concept key + source only
        if isinstance(other, short_cnode):
            same_key = self.concept.key == other.concept_key
            return same_key and self.source == other.source

        if not isinstance(other, ConceptNode):
            return False

        return self.concept == other.concept and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source and \
               self.underlying == other.underlying

    def __hash__(self):
        return hash((self.concept, self.start, self.end, self.source, self.underlying))

    def __repr__(self):
        chunks = [f"ConceptNode(concept='{self.concept}', source='{self.source}', start={self.start}, end={self.end}"]
        if DEBUG_COMPILED:
            # also show what was compiled for the concept
            chunks.extend(f", {k}='{v}'" for k, v in self.concept.get_compiled().items())
        chunks.append(")")
        return "".join(chunks)

    def clone(self):
        """
        :return: a new ConceptNode referencing the same concept, tokens and underlying
        """
        # do we need to clone the concept as well ?
        return ConceptNode(self.concept, self.start, self.end, self.tokens, self.source, self.underlying)

    def as_bag(self):
        """
        Creates a dictionary with the useful properties of the ConceptNode
        see Concept.as_bag() for extra informations
        :return: shallow copy of the attributes of the node
        """
        # note : the compiled part of the concept is not added to the bag
        return dict(self.__dict__)

    def to_short_str(self):
        return f'CN({self.concept})'


class SourceCodeNode(LexerNode):
    """
    Returned when some source code (like Python source code is recognized)
    """

    def __init__(self, start, end, tokens=None, source=None, python_node=None, return_value=None):
        """

        :param start: start position (index of the first token)
        :param end: end position (index of the last token)
        :param tokens:
        :param source: tokens as string
        :param python_node: PythonNode found (when the SourceCodeNode is validated)
        :param return_value: ReturnValueConcept returned when the source was validated

        When return_value is provided,
        You should have return_value.body.body == node
        """
        super().__init__(start, end, tokens, source)
        self.return_value = return_value  # original result of the parsing
        self.python_node = python_node  # The PythonNode (or whatever language node) that is found

    def _same_span(self, other):
        # positions and source are the parts shared by all the comparisons
        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __eq__(self, other):
        # the SCN tester class implements its own comparison
        if isinstance(other, SCN):
            return other == self

        # scnode tuple : positions + source only
        if isinstance(other, scnode):
            return self._same_span(other)

        if not isinstance(other, SourceCodeNode):
            return False

        return self.python_node == other.python_node and self._same_span(other)

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"

    def to_short_str(self):
        return f"SCN('{self.source}')"

    def get_python_node(self):
        return self.python_node

    def get_source_to_parse(self):
        """
        :return: the source of the python node (not the source of this node)
        """
        node = self.python_node
        return node.source


class SourceCodeWithConceptNode(LexerNode):
    """
    Kind of temporary version for SourceCodeNode
    I know that there is some code,
    I know that there are some concepts
    I just don't want to make the glue yet

    So I push all the nodes into one big bag
    """

    def __init__(self, first_node, last_node, content_nodes=None, has_unrecognized=False):
        """

        :param first_node: node that opens the bag
        :param last_node: node that closes the bag
        :param content_nodes: nodes found between first_node and last_node
        :param has_unrecognized: stored as is on the node
        """
        # start / end are initialised with out of range values,
        # fix_pos() then narrows them using the positions of the nodes
        super().__init__(9999, -1, None)  # why not sys.maxint ?
        self.first = first_node
        self.last = last_node
        self.nodes = content_nodes or []
        self.has_unrecognized = has_unrecognized
        self._all_nodes = None  # cache used by get_all_nodes()
        self.fix_all_pos()

        self.python_node = None  # if the source code node is validated against a python parse, here is the PythonNode
        self.return_value = None  # return_value that produced the PythonNode

    def add_node(self, node):
        """
        Adds a content node and updates the positions
        :param node:
        :return: self
        """
        self.nodes.append(node)
        self.fix_pos(node)
        self._all_nodes = None  # the cached list is no longer valid

        return self

    def __eq__(self, other):
        if id(self) == id(other):
            return True

        # the SCWC tester class implements its own comparison
        if isinstance(other, SCWC):
            return other == self

        if not isinstance(other, SourceCodeWithConceptNode):
            return False

        if self.start != other.start or self.end != other.end:
            return False

        if self.first != other.first:
            return False

        if self.last != other.last:
            return False

        if len(self.nodes) != len(other.nodes):
            return False

        for self_node, other_node in zip(self.nodes, other.nodes):
            if self_node != other_node:
                return False

        # at last
        return True

    def __hash__(self):
        return hash((self.first, self.last, len(self.nodes)))

    def __repr__(self):
        return f"SourceCodeWithConceptNode(start={self.start}, end={self.end}, source='{self.source}')"

    def fix_all_pos(self):
        """
        Recomputes start and end from first, last and all the content nodes
        """
        if self.first is None:  # to ease some unit test where only the python_node is necessary
            return

        for n in [self.first, self.last] + self.nodes:
            self.fix_pos(n)

    def fix_pos(self, node):
        """
        Widens [start, end] so that it includes the positions of the node
        :param node: nodes without start / end (or with None values) are ignored
        :return: self
        """
        if hasattr(node, "start") and node.start is not None:
            if node.start < self.start:
                self.start = node.start

        if hasattr(node, "end") and node.end is not None:
            if node.end > self.end:
                self.end = node.end
        return self

    def pseudo_fix_source(self):
        """
        pseudo because the code is not that clean !
        :return: self
        """
        self.source = self.first.source
        for n in self.nodes:
            self.source += " "
            if hasattr(n, "source"):
                self.source += n.source
            elif hasattr(n, "concept"):
                self.source += str(n.concept)
            else:
                self.source += " unknown"
        self.source += self.last.source
        return self

    def get_all_nodes(self):
        """
        :return: [first, *nodes, last], the list is cached until add_node() is called
        """
        if self._all_nodes:
            return self._all_nodes

        self._all_nodes = [self.first, *self.nodes, self.last]
        return self._all_nodes

    def clone(self):
        """
        :return: a new node with the same first / last nodes and a copy of the list of content nodes
        """
        clone = SourceCodeWithConceptNode(self.first, self.last, self.nodes.copy(), self.has_unrecognized)
        return clone

    def to_short_str(self):
        """
        :return: compact representation built from the short form of the content nodes
        """
        # to_short_str must be called: joining the bound methods themselves raises a TypeError
        content = ", ".join(n.to_short_str() for n in self.nodes)
        return f"SCWC({self.first}" + content + f"{self.last})"

    def get_python_node(self):
        return self.python_node

    def get_source_to_parse(self):
        """
        :return: the source of the python node (not the source of this node)
        """
        return self.python_node.source


@dataclass()
class GrammarErrorNode(ParsingError):
    """
    ParsingError that only carries a message
    """
    message: str  # text of the error

class SyaAssociativity(Enum):
    """
    Possible associativities : left, right or none
    """
    Left = "left"
    Right = "right"
    No = "No"

    def __repr__(self):
        # the repr is the raw value of the member (ex: 'left')
        return self.value


# Lightweight tuples accepted by the __eq__ of the node classes:
#   cnode, short_cnode -> ConceptNode (short_cnode ignores the positions)
#   utnode             -> UnrecognizedTokensNode
#   scnode             -> SourceCodeNode
# NOTE(review): cnode and short_cnode both use "ConceptNode" as typename,
# so their repr reads like the real ConceptNode class - confirm it is intended
cnode = namedtuple("ConceptNode", "concept_key start end source")
short_cnode = namedtuple("ConceptNode", "concept_key source")
utnode = namedtuple("utnode", "start end source")
scnode = namedtuple("scnode", "start end source")


class HelperWithPos:
    """
    Base class for the tester classes: keeps a start and an end position

    A position given to the constructor is 'fixed' and is never modified by fix_pos().
    """

    def __init__(self, start=None, end=None):
        self.start = start
        self.end = end

        self.start_is_fixed = start is not None
        self.end_is_fixed = end is not None

    @staticmethod
    def _read_bound(node, attribute, index):
        # the attribute wins when it exists, tuples are read by index, anything else gives None
        if hasattr(node, attribute):
            return getattr(node, attribute)
        if isinstance(node, tuple):
            return node[index]
        return None

    def fix_pos(self, node):
        """
        Widens the positions that are not fixed so that they include the ones of the node
        :param node: object with start / end attributes, or (start, end, ...) tuple
        :return: self
        """
        if not self.start_is_fixed:
            candidate = self._read_bound(node, "start", 0)
            if candidate is not None:
                if self.start is None or candidate < self.start:
                    self.start = candidate

        if not self.end_is_fixed:
            candidate = self._read_bound(node, "end", 1)
            if candidate is not None:
                if self.end is None or candidate > self.end:
                    self.end = candidate

        return self


class SCN(HelperWithPos):
    """
    SourceCodeNode tester class
    It matches with SourceCodeNode but with less constraints

    SCN == SourceCodeNode if source, start, end (start and end are not validated when None)
    """

    def __init__(self, source, start=None, end=None):
        """

        :param source: expected source
        :param start: expected start, not validated when None
        :param end: expected end, not validated when None
        """
        super().__init__(start, end)
        self.source = source

    def __eq__(self, other):
        if id(self) == id(other):
            return True

        if isinstance(other, SourceCodeNode):
            if self.source != other.source:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False

            return True

        # other testers are only equal when they are SCN too
        # (the check used to be done against CN, so two identical SCN were never equal)
        if not isinstance(other, SCN):
            return False

        return self.source == other.source and \
               self.start == other.start and \
               self.end == other.end

    def __hash__(self):
        return hash((self.source, self.start, self.end))

    def __repr__(self):
        txt = f"SCN(source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class SCWC(HelperWithPos):
    """
    SourceCodeWithConceptNode tester class
    It matches with a SourceCodeWithConceptNode
    but it's easier to instantiate during the tests

    Only first, last and the content nodes are compared (start and end are not)
    """

    def __init__(self, first, last, *args):
        """

        :param first: expected first node
        :param last: expected last node
        :param args: expected content nodes, in order
        """
        super().__init__(None, None)
        self.first = first
        self.last = last
        self.content = args

    def __eq__(self, other):
        if id(self) == id(other):
            return True

        # explicit False (the method used to fall through and return None)
        if not isinstance(other, SourceCodeWithConceptNode):
            return False

        if self.first != other.first:
            return False

        if self.last != other.last:
            return False

        if len(self.content) != len(other.nodes):
            return False

        for self_node, other_node in zip(self.content, other.nodes):
            if self_node != other_node:
                return False

        # at last
        return True

    def __repr__(self):
        # parts are joined so that no dangling comma is produced when start and / or end are missing
        parts = []
        if self.start is not None:
            parts.append(f"start={self.start}")
        if self.end is not None:
            parts.append(f"end={self.end}")
        parts.append(f"source='{self.source}'")
        return "SCWC(" + ", ".join(parts) + ")"

    @property
    def source(self):
        """
        this code is a copy and paste from SourceCodeWithConceptNode.pseudo_fix_source
        TODO: create a common function or whatever...
        first and last are used as is when they have no source attribute
        :return: text built from first, the content nodes and last
        """
        source = self.first.source if hasattr(self.first, "source") else self.first
        for n in self.content:
            source += " "
            if hasattr(n, "source"):
                source += n.source
            elif hasattr(n, "concept"):
                source += str(n.concept)
            else:
                source += " unknown"
        source += self.last.source if hasattr(self.last, "source") else self.last
        return source


class CN(HelperWithPos):
    """
    ConceptNode tester class
    It matches with ConceptNode but with less constraints

    CN == ConceptNode if concept key, start, end and source are the same
    (start, end and source are not validated when None)
    """

    def __init__(self, concept, start=None, end=None, source=None):
        """

        :param concept: Concept or concept_key (only the key is used anyway)
        :param start:
        :param end:
        :param source:
        """
        super().__init__(start, end)
        got_concept = isinstance(concept, Concept)
        self.concept_key = concept.key if got_concept else concept
        self.source = source
        self.concept = concept if got_concept else None

    def fix_source(self, str_tokens):
        """
        Sets the source by concatenating the given strings
        :param str_tokens: iterable of strings
        :return: self
        """
        self.source = "".join(str_tokens)
        return self

    def __eq__(self, other):
        if self is other:
            return True

        if isinstance(other, ConceptNode):
            if other.concept is None or other.concept.key != self.concept_key:
                return False

            # a value left to None on the tester is not validated
            expectations = (
                (self.start, other.start),
                (self.end, other.end),
                (self.source, other.source),
            )
            for expected, actual in expectations:
                if expected is not None and expected != actual:
                    return False
            return True

        if not isinstance(other, CN):
            return False

        same_key = self.concept_key == other.concept_key
        return same_key and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __hash__(self):
        return hash((self.concept_key, self.start, self.end, self.source))

    def __repr__(self):
        # the concept is displayed when known, its key otherwise
        head = f"CN(concept='{self.concept}'" if self.concept else f"CN(concept_key='{self.concept_key}'"
        pieces = [head, f", source='{self.source}'"]
        if self.start is not None:
            pieces.append(f", start={self.start}")
        if self.end is not None:
            pieces.append(f", end={self.end}")
        pieces.append(")")
        return "".join(pieces)


class CNC(CN):
    """
    ConceptNode for Compiled tester class
    It matches with ConceptNode
    But focuses on the 'compiled' property of the concept

    CNC == ConceptNode if CNC.get_compiled() == ConceptNode.concept.get_compiled()
    """

    def __init__(self, concept_key, start=None, end=None, source=None, exclude_body=False, **kwargs):
        """

        :param concept_key: Concept or concept_key
        :param start:
        :param end:
        :param source:
        :param exclude_body: when True, ConceptParts.BODY is removed from the compiled of the ConceptNode
                             before the comparison
        :param kwargs: expected compiled entries ('body' is stored under ConceptParts.BODY)
        """
        super().__init__(concept_key, start, end, source)
        self.compiled = kwargs
        self.exclude_body = exclude_body
        if "body" in self.compiled:
            self.compiled[ConceptParts.BODY] = self.compiled["body"]
            del self.compiled["body"]

    def __eq__(self, other):
        if id(self) == id(other):
            return True

        if isinstance(other, ConceptNode):
            if other.concept is None:
                return False
            if other.concept.key != self.concept_key:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            if self.source is not None and self.source != other.source:
                return False
            if self.exclude_body:
                to_compare = {k: v for k, v in other.concept.get_compiled().items() if k != ConceptParts.BODY}
            else:
                to_compare = other.concept.get_compiled()
            if self.compiled == to_compare:  # expanded form to ease the debug
                return True
            else:
                return False

        if not isinstance(other, CNC):
            return False

        return self.concept_key == other.concept_key and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source and \
               self.compiled == other.compiled

    # Overriding __eq__ resets __hash__ to None, which made CNC unhashable while CN is hashable.
    # The hash of CN is still valid: two equal CNC have the same key, positions and source.
    __hash__ = CN.__hash__

    def __repr__(self):
        if self.concept:
            txt = f"CNC(concept='{self.concept}'"
        else:
            txt = f"CNC(concept_key='{self.concept_key}'"
        txt += f", source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"

        for k, v in self.compiled.items():
            txt += f", {k}='{v}'"
        return txt + ")"


class UTN(HelperWithPos):
    """
    Tester class for UnrecognizedTokensNode
    compare the source, and start, end  if defined
    """

    def __init__(self, source, start=None, end=None):
        """
        :param source: expected source
        :param start: expected start, not validated when None
        :param end: expected end, not validated when None
        """
        super().__init__(start, end)
        self.source = source

    def __eq__(self, other):
        if id(self) == id(other):
            return True

        if isinstance(other, UnrecognizedTokensNode):
            # start and end are only validated when defined,
            # as documented and as done by the other tester classes
            if self.source != other.source:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            return True

        if not isinstance(other, UTN):
            return False

        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __hash__(self):
        return hash((self.source, self.start, self.end))

    def __repr__(self):
        txt = f"UTN(source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class RN(HelperWithPos):
    """
    Helper class to test RuleNode

    RN == RuleNode if rule id, start, end and source are the same
    (start, end and source are not validated when None)
    """

    def __init__(self, rule, start=None, end=None, source=None):
        """

        :param rule: Rule or rule id (only the id is used for the comparisons)
        :param start:
        :param end:
        :param source: when not provided, computed with core.utils.str_concept() from the rule id
        """
        super().__init__(start, end)
        got_rule = isinstance(rule, Rule)
        self.rule_id = rule.id if got_rule else rule
        self.source = source or core.utils.str_concept((None, self.rule_id), prefix="r:")
        self.rule = rule if got_rule else None

    def __eq__(self, other):
        if self is other:
            return True

        if isinstance(other, RuleNode):
            if other.rule is None or other.rule.id != self.rule_id:
                return False

            # a value left to None on the tester is not validated
            expectations = (
                (self.start, other.start),
                (self.end, other.end),
                (self.source, other.source),
            )
            for expected, actual in expectations:
                if expected is not None and expected != actual:
                    return False
            return True

        if not isinstance(other, RN):
            return False

        same_rule = self.rule_id == other.rule_id
        return same_rule and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source

    def __hash__(self):
        return hash((self.rule_id, self.start, self.end, self.source))

    def __repr__(self):
        # the rule is displayed when known, its id otherwise
        head = f"RN(rule='{self.rule}'" if self.rule else f"RN(rule_id='{self.rule_id}'"
        pieces = [head, f", source='{self.source}'"]
        if self.start is not None:
            pieces.append(f", start={self.start}")
        if self.end is not None:
            pieces.append(f", end={self.end}")
        pieces.append(")")
        return "".join(pieces)


class BaseNodeParser(BaseParser):
    """
    Parser that returns LexerNode
    """

    def __init__(self, name, priority, **kwargs):
        """
        :param name: forwarded to BaseParser
        :param priority: forwarded to BaseParser
        :param kwargs: accepted but not used here
        """
        # the base parser is always created with yield_eof=True
        super().__init__(name, priority, yield_eof=True)

    def init_from_concepts(self, context, concepts, **kwargs):
        """
        Initialize the parser with a list of concepts
        For unit tests convenience
        :param context: context giving access to sheerka (context.sheerka.om is updated)
        :param concepts: concepts to index
        :param kwargs: accepted but not used in this method
        :return:
        """
        # imported locally rather than at the top of the file
        # NOTE(review): presumably to avoid a circular import - confirm
        from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager
        # both calls return an object whose useful result is in .body
        concepts_by_first_keyword = SheerkaConceptManager.compute_concepts_by_first_token(context, concepts).body
        resolved = SheerkaConceptManager.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword).body

        # stores the resolved concepts under the RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY entry
        # NOTE(review): the meaning of the False argument is not visible from here - check om.put()
        context.sheerka.om.put(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY,
                               False,
                               resolved)