Added SyaNodeParser (finally, after one month)

This commit is contained in:
2020-04-09 15:42:36 +02:00
parent c9acfa99a1
commit 6c7c529016
56 changed files with 5322 additions and 404 deletions
+669
View File
@@ -0,0 +1,669 @@
from collections import namedtuple
from dataclasses import dataclass
from enum import Enum
from core.builtin_concepts import BuiltinConcepts
from core.concept import VARIABLE_PREFIX, Concept
from core.sheerka.ExecutionContext import ExecutionContext
from core.tokenizer import TokenKind, LexerError, Token
from parsers.BaseParser import Node, BaseParser, ErrorNode
# When True, ConceptNode.__repr__ also dumps the concept's compiled properties.
DEBUG_COMPILED = True
@dataclass()
class LexerNode(Node):
    """Base node produced by the token-level parsers.

    Records which slice of the token list was consumed and keeps a
    textual rendering of it in ``source``.
    """
    start: int  # starting index in the tokens list
    end: int  # ending index in the tokens list
    tokens: list = None  # the consumed tokens themselves
    source: str = None  # string representation of what was parsed

    def __post_init__(self):
        # Derive the text from the tokens when no explicit source was given.
        if self.source is None:
            self.source = BaseParser.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        if not isinstance(other, LexerNode):
            return False
        mine = (self.start, self.end, self.source, self.tokens)
        theirs = (other.start, other.end, other.source, other.tokens)
        return mine == theirs

    def fix_source(self, force=True):
        """Recompute ``source`` from the tokens (always when ``force``)."""
        if self.source is None or force:
            self.source = BaseParser.get_text_from_tokens(self.tokens)
        return self
class UnrecognizedTokensNode(LexerNode):
    """Accumulates a consecutive run of tokens that no parser recognized."""

    def __init__(self, start, end, tokens):
        super().__init__(start, end, tokens)
        self.is_frozen = False  # once frozen, no more tokens may be added
        self.parenthesis_count = 0  # open-minus-closed parenthesis balance

    def freeze(self):
        self.is_frozen = True

    def reset(self):
        # Back to the pristine "nothing collected yet" state.
        self.start = -1
        self.end = -1
        self.tokens.clear()
        self.is_frozen = False
        self.parenthesis_count = 0

    def has_open_paren(self):
        return self.parenthesis_count > 0

    def add_token(self, token, pos):
        """Append ``token`` (located at position ``pos``) to the collected run."""
        if self.is_frozen:
            raise Exception("The node is frozen")
        # A gap of exactly one position means a whitespace token was skipped
        # by the caller; re-create it so the rendered source stays faithful.
        if self.end != -1 and pos == self.end + 2:
            prev = self.tokens[-1]
            filler = Token(TokenKind.WHITESPACE, " ", prev.index + 1, prev.line, prev.column + 1)
            self.tokens.append(filler)
        self.tokens.append(token)
        self.end = pos
        if self.start == -1:
            self.start = pos
        if token.type == TokenKind.LPAR:
            self.parenthesis_count += 1
        elif token.type == TokenKind.RPAR:
            self.parenthesis_count -= 1
        return self

    def not_whitespace(self):
        return not self.is_whitespace()

    def is_whitespace(self):
        # True when every collected token is whitespace or a newline.
        blanks = (TokenKind.WHITESPACE, TokenKind.NEWLINE)
        return all(t.type in blanks for t in self.tokens)

    def is_empty(self):
        return not self.tokens

    def __eq__(self, other):
        if isinstance(other, utnode):
            return (self.start, self.end, self.source) == (other.start, other.end, other.source)
        if isinstance(other, UTN):
            # Delegate to the tester class, which implements its own comparison.
            return other == self
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return (self.start, self.end, self.source) == (other.start, other.end, other.source)

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"

    def clone(self):
        # Note: tokens list is copied so the clone can grow independently.
        copy = UnrecognizedTokensNode(self.start, self.end, self.tokens[:])
        copy.is_frozen = self.is_frozen
        copy.parenthesis_count = self.parenthesis_count
        return copy
class ConceptNode(LexerNode):
    """
    Returned by the BnfNodeParser.
    It represents a recognized concept.
    """

    def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
        super().__init__(start, end, tokens, source)
        self.concept = concept  # the recognized Concept
        self.underlying = underlying
        self.fix_source(False)  # only derive source when it is still missing

    def __eq__(self, other):
        if self is other:
            return True
        if isinstance(other, (CN, CNC)):
            # Tester classes implement their own (more lenient) comparison.
            return other == self
        if isinstance(other, cnode):
            return (self.concept.key, self.start, self.end, self.source) == \
                   (other.concept_key, other.start, other.end, other.source)
        if isinstance(other, short_cnode):
            return self.concept.key == other.concept_key and self.source == other.source
        if not isinstance(other, ConceptNode):
            return False
        return (self.concept, self.start, self.end, self.source, self.underlying) == \
               (other.concept, other.start, other.end, other.source, other.underlying)

    def __hash__(self):
        return hash((self.concept, self.start, self.end, self.source, self.underlying))

    def __repr__(self):
        text = f"ConceptNode(concept='{self.concept}', source='{self.source}', start={self.start}, end={self.end}"
        if DEBUG_COMPILED:
            for key, value in self.concept.compiled.items():
                text += f", {key}='{value}'"
        return text + ")"

    def clone(self):
        # NOTE(review): the concept itself is shared, not copied — confirm
        # that callers never mutate it through a clone.
        return ConceptNode(self.concept, self.start, self.end, self.tokens, self.source, self.underlying)
class SourceCodeNode(LexerNode):
    """
    Returned when some source code (like Python source code) is recognized.
    """

    def __init__(self, node, start, end, tokens=None, source=None, return_value=None):
        super().__init__(start, end, tokens, source)
        # The PythonNode (or whatever language node) that was found.
        self.node = node
        # Original result of the parsing.
        self.return_value = return_value

    def __eq__(self, other):
        if isinstance(other, scnode):
            return (self.start, self.end, self.source) == (other.start, other.end, other.source)
        if not isinstance(other, SourceCodeNode):
            return False
        return self.node == other.node and \
               (self.start, self.end, self.source) == (other.start, other.end, other.source)

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
class SourceCodeWithConceptNode(LexerNode):
    """
    Kind of temporary version for SourceCodeNode.
    I know that there is some code,
    I know that there are some concepts,
    I just don't want to make the glue yet,
    so I push all the nodes into one big bag.
    """

    def __init__(self, first_node, last_node, content_nodes=None):
        # start=9999 / end=-1 form an "empty" range that fix_pos() widens
        # (why not sys.maxsize instead of 9999?).
        super().__init__(9999, -1, None)
        self.first = first_node
        self.last = last_node
        self.nodes = content_nodes or []
        self.has_unrecognized = False
        self.fix_all_pos()

    def add_node(self, node):
        """Append a content node and widen start/end to cover it."""
        self.nodes.append(node)
        self.fix_pos(node)
        return self

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if not isinstance(other, SourceCodeWithConceptNode):
            return False
        if self.start != other.start or self.end != other.end:
            return False
        if self.first != other.first:
            return False
        if self.last != other.last:
            return False
        if len(self.nodes) != len(other.nodes):
            return False
        for self_node, other_node in zip(self.nodes, other.nodes):
            if self_node != other_node:
                return False
        # at last
        return True

    def __hash__(self):
        return hash((self.first, self.last, len(self.nodes)))

    def __repr__(self):
        return f"SourceCodeWithConceptNode(start={self.start}, end={self.end}, source='{self.source}')"

    def fix_all_pos(self):
        """Recompute start/end from first, last and every content node."""
        for n in [self.first, self.last] + self.nodes:
            self.fix_pos(n)

    def fix_pos(self, node):
        """Widen start/end so that ``node``'s span is included."""
        if hasattr(node, "start") and node.start is not None:
            if node.start < self.start:
                self.start = node.start
        if hasattr(node, "end") and node.end is not None:
            if node.end > self.end:
                self.end = node.end
        return self

    def pseudo_fix_source(self):
        """Best-effort reconstruction of ``source`` from the contained nodes."""
        self.source = self.first.source
        for n in self.nodes:
            self.source += " "
            if hasattr(n, "source"):
                self.source += n.source
            elif hasattr(n, "concept"):
                self.source += str(n.concept)
            else:
                self.source += " unknown"
        self.source += self.last.source
        return self

    def clone(self):
        # BUG FIX: copy the nodes list instead of sharing it with the clone
        # (add_node() on one would otherwise mutate both — compare
        # UnrecognizedTokensNode.clone, which copies tokens[:]), and carry
        # over the has_unrecognized flag, which the constructor resets.
        clone = SourceCodeWithConceptNode(self.first, self.last, self.nodes[:])
        clone.has_unrecognized = self.has_unrecognized
        return clone
@dataclass()
class GrammarErrorNode(ErrorNode):
    """Error node that carries an explanatory message."""
    message: str  # human-readable description of the error
class SyaAssociativity(Enum):
    """Operator associativity used in the Sya (shunting-yard) definitions."""
    Left = "left"
    Right = "right"
    No = "No"

    def __repr__(self):
        """Debug form is simply the member's value."""
        return "%s" % self.value
# Lightweight tester tuples: the node classes' __eq__ methods accept them as
# comparison partners, and they are much cheaper to build in tests.
cnode = namedtuple("ConceptNode", ["concept_key", "start", "end", "source"])
short_cnode = namedtuple("ConceptNode", ["concept_key", "source"])
utnode = namedtuple("utnode", ["start", "end", "source"])
scnode = namedtuple("scnode", ["start", "end", "source"])
@dataclass(init=False)
class SCWC:
    """
    SourceNodeWithConcept tester class
    It matches with a SourceNodeWithConcept
    but it's easier to instantiate during the tests
    """
    first: LexerNode  # expected first node
    last: LexerNode  # expected last node
    content: tuple  # expected content nodes, in order
    # NOTE(review): unlike CN/UTN, this tester defines no custom __eq__, so
    # dataclass equality only matches other SCWC instances; confirm that
    # SourceCodeWithConceptNode comparisons handle SCWC somewhere else.
    def __init__(self, first, last, *args):
        self.first = first
        self.last = last
        # All remaining positional arguments become the expected content tuple.
        self.content = args
class HelperWithPos:
    """
    Mixin for tester classes that optionally pin ``start``/``end``.

    A position passed to the constructor is "fixed" and never updated;
    an unset one is widened by fix_pos() to cover every node seen.
    """

    def __init__(self, start=None, end=None):
        self.start = start
        self.end = end
        self.start_is_fixed = start is not None
        self.end_is_fixed = end is not None

    def fix_pos(self, node):
        """Widen the non-fixed bounds so they include ``node``'s span."""
        if not self.start_is_fixed:
            if hasattr(node, "start"):
                candidate = node.start
            elif isinstance(node, tuple):
                candidate = node[0]
            else:
                candidate = None
            if candidate is not None and (self.start is None or candidate < self.start):
                self.start = candidate
        if not self.end_is_fixed:
            if hasattr(node, "end"):
                candidate = node.end
            elif isinstance(node, tuple):
                candidate = node[1]
            else:
                candidate = None
            if candidate is not None and (self.end is None or candidate > self.end):
                self.end = candidate
        return self
class CN(HelperWithPos):
    """
    ConceptNode tester class.
    It matches a ConceptNode but with fewer constraints:
    CN == ConceptNode when the concept key matches, plus any start/end
    that was pinned on the CN. Against a real ConceptNode the source is
    NOT compared; it only takes part when comparing two CN instances.
    """

    def __init__(self, concept, start=None, end=None, source=None):
        """
        :param concept: Concept or concept_key (only the key is used anyway)
        :param start:
        :param end:
        :param source:
        """
        super().__init__(start, end)
        # Accept either a full Concept or just its key.
        if isinstance(concept, Concept):
            self.concept_key = concept.key
            self.concept = concept
        else:
            self.concept_key = concept
            self.concept = None
        self.source = source

    def fix_source(self, str_tokens):
        """Set the expected source from an iterable of token strings."""
        self.source = "".join(str_tokens)
        return self

    def __eq__(self, other):
        if self is other:
            return True
        if isinstance(other, ConceptNode):
            if other.concept is None:
                return False
            if other.concept.key != self.concept_key:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            return True
        if not isinstance(other, CN):
            return False
        return (self.concept_key, self.start, self.end, self.source) == \
               (other.concept_key, other.start, other.end, other.source)

    def __hash__(self):
        return hash((self.concept_key, self.start, self.end, self.source))

    def __repr__(self):
        head = f"CN(concept='{self.concept}'" if self.concept else f"CN(concept_key='{self.concept_key}'"
        head += f", source='{self.source}'"
        if self.start is not None:
            head += f", start={self.start}"
        if self.end is not None:
            head += f", end={self.end}"
        return head + ")"
class CNC(CN):
    """
    ConceptNode for Compiled tester class.
    It matches with ConceptNode
    but focuses on the 'compiled' property of the concept:
    CNC == ConceptNode if CNC.compiled == ConceptNode.concept.compiled
    (plus the key/start/end checks shared with CN).
    """

    def __init__(self, concept_key, start=None, end=None, source=None, **kwargs):
        super().__init__(concept_key, start, end, source)
        # Expected compiled properties, given as keyword arguments.
        self.compiled = kwargs

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, ConceptNode):
            if other.concept is None:
                return False
            if other.concept.key != self.concept_key:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            return self.compiled == other.concept.compiled  # assert instead of return to help debugging tests
        if not isinstance(other, CNC):
            return False
        return self.concept_key == other.concept_key and \
               self.start == other.start and \
               self.end == other.end and \
               self.source == other.source and \
               self.compiled == other.compiled

    # BUG FIX: defining __eq__ implicitly sets __hash__ to None, which made
    # CNC instances unhashable (its parent CN is hashable). Reuse CN's hash;
    # equal CNCs share the key/start/end/source fields it hashes, so the
    # hash/eq contract still holds.
    __hash__ = CN.__hash__

    def __repr__(self):
        if self.concept:
            txt = f"CNC(concept='{self.concept}'"
        else:
            txt = f"CNC(concept_key='{self.concept_key}'"
        txt += f", source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        for k, v in self.compiled.items():
            txt += f", {k}='{v}'"
        return txt + ")"
class BaseNodeParser(BaseParser):
    """
    Base class for node parsers.

    Keeps the tokenized input, a cursor over it, and a lookup table mapping
    the first keyword of a concept to the ids of the concepts starting with it.
    """

    def __init__(self, name, priority, **kwargs):
        super().__init__(name, priority)
        # BUG FIX: always define both lookup attributes. Previously, when a
        # 'sheerka' kwarg was passed but its sya_definitions were empty,
        # self.sya_definitions was never created and initialize() later
        # crashed with AttributeError when it read the attribute.
        self.concepts_by_first_keyword = None
        self.sya_definitions = None
        if 'sheerka' in kwargs:
            self.init_from_sheerka(kwargs.get("sheerka"))
        self.token = None  # current token (see next_token)
        self.pos = -1  # index of the current token in self.tokens
        self.tokens = None  # full token list for the current text
        self.context: ExecutionContext = None
        self.text = None  # raw input text being parsed
        self.sheerka = None

    def init_from_sheerka(self, sheerka):
        """
        Use the definitions from Sheerka to initialize
        :param sheerka:
        :return:
        """
        self.concepts_by_first_keyword = sheerka.concepts_by_first_keyword
        if sheerka.sya_definitions:
            self.sya_definitions = {}
            for k, v in sheerka.sya_definitions.items():
                # v is (precedence, associativity); normalize the
                # associativity into the SyaAssociativity enum.
                self.sya_definitions[k] = (v[0], SyaAssociativity(v[1]))

    def reset_parser(self, context, text):
        """Prepare the parser for a new input text; returns False on lexer error."""
        self.context = context
        self.sheerka = context.sheerka
        self.text = text
        try:
            self.tokens = list(self.get_input_as_tokens(text))
        except LexerError as e:
            # Report the lexer failure as an ERROR concept; do not advance.
            self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
            return False
        self.token = None
        self.pos = -1
        return True

    def add_error(self, error, next_token=True):
        """Record an error and (by default) skip to the next token."""
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def get_token(self) -> Token:
        """Return the current token (None before the first next_token() call)."""
        return self.token

    def next_token(self, skip_whitespace=True):
        """Advance the cursor; returns False once EOF is the current token."""
        if self.token and self.token.type == TokenKind.EOF:
            return False
        self.pos += 1
        self.token = self.tokens[self.pos]
        if skip_whitespace:
            while self.token.type == TokenKind.WHITESPACE or self.token.type == TokenKind.NEWLINE:
                self.pos += 1
                self.token = self.tokens[self.pos]
        return self.token.type != TokenKind.EOF

    def initialize(self, context, concepts, sya_definitions=None, use_sheerka=False):
        """
        To quickly find a concept, we store them in an hash where the key is the first token of the concept
        example :
        Concept("foo a").def_prop("a"), "foo" is a token, "a" is a variable
        So the key to use will be "foo"
        Concept("a foo").def_prop("a") -> first token is "foo"
        Concept("Hello my dear a").def_prop("a") -> first token is "Hello"
        Note that under the same key, there will be multiple entries
        a B-Tree may be a better implementation in the future
        We also store sya_definition which is a tuple (concept_precedence:int, concept_associativity:SyaAssociativity)
        :param context:
        :param concepts: list[Concept]
        :param sya_definitions: hash[concept_id, tuple(precedence:int, associativity:SyaAssociativity)]
        :param use_sheerka: first init with the definitions from Sheerka
        :return:
        """
        self.context = context
        self.sheerka = context.sheerka
        if use_sheerka:
            self.init_from_sheerka(self.sheerka)
        if sya_definitions:
            if self.sya_definitions:
                self.sya_definitions.update(sya_definitions)
            else:
                self.sya_definitions = sya_definitions
        if self.concepts_by_first_keyword is None:
            self.concepts_by_first_keyword = {}
        for concept in concepts:
            keywords = concept.key.split()
            for keyword in keywords:
                # Skip variables: the index key is the first literal keyword.
                if keyword.startswith(VARIABLE_PREFIX):
                    continue
                self.concepts_by_first_keyword.setdefault(keyword, []).append(concept.id)
                break
        return self.sheerka.ret(self.name, True, self.concepts_by_first_keyword)

    def get_concepts(self, token, to_keep, to_map=None):
        """
        Tries to find if there are concepts that match the value of the token
        :param token:
        :param to_keep: predicate to tell if the concept is eligible
        :param to_map: optional transformation applied to each kept concept
        :return: list of matching concepts (possibly empty) when the keyword
                 is known, None when the keyword is not indexed at all
        """
        # Consistency: reuse the shared token-value extraction instead of
        # duplicating the STRING/KEYWORD unwrapping logic.
        name = self.get_token_value(token)
        if name in self.concepts_by_first_keyword:
            result = []
            for concept_id in self.concepts_by_first_keyword[name]:
                concept = self.sheerka.get_by_id(concept_id)
                if not to_keep(concept):
                    continue
                result.append(to_map(concept) if to_map else concept)
            return result
        return None

    @staticmethod
    def get_token_value(token):
        """Extract the comparable value of a token (strings lose their quotes)."""
        if token.type == TokenKind.STRING:
            return token.value[1:-1]
        elif token.type == TokenKind.KEYWORD:
            return token.value.value
        else:
            return token.value
class UTN(HelperWithPos):
    """
    Tester class for UnrecognizedTokenNode
    compare the source, and start, end if defined
    """
    def __init__(self, source, start=None, end=None):
        """
        :param source: expected textual form of the unrecognized token run
        :param start: expected start position, or None
        :param end: expected end position, or None

        NOTE(review): unlike CN.__eq__, the comparison below checks
        start/end unconditionally even when they were left as None —
        confirm that leniency was not intended here too.
        """
        super().__init__(start, end)
        self.source = source
    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, UnrecognizedTokensNode):
            return self.start == other.start and \
                   self.end == other.end and \
                   self.source == other.source
        if not isinstance(other, UTN):
            return False
        return self.start == other.start and \
               self.end == other.end and \
               self.source == other.source
    def __hash__(self):
        return hash((self.source, self.start, self.end))
    def __repr__(self):
        txt = f"UTN( source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"