from collections import namedtuple
from dataclasses import dataclass
from enum import Enum
from typing import Set

import core.utils
from core.builtin_concepts import BuiltinConcepts
from core.concept import VARIABLE_PREFIX, Concept, DEFINITION_TYPE_BNF, ConceptParts
from core.rule import Rule
from core.tokenizer import TokenKind, Token
from parsers.BaseParser import Node, BaseParser, ParsingError

# When True, ConceptNode.__repr__ also dumps the concept's compiled parts.
DEBUG_COMPILED = True


@dataclass
class ChickenAndEggError(Exception):
    """Raised/returned when resolving first-tokens runs into a circular reference."""
    concepts: Set[str]


@dataclass
class NoFirstTokenError(ParsingError):
    """Returned when no first token can be determined for a concept."""
    concept: Concept
    key: str


@dataclass()
class LexerNode(Node):
    """Base class for every node produced by the node parsers."""
    start: int  # starting index in the tokens list
    end: int  # ending index in the tokens list
    tokens: list = None  # tokens
    source: str = None  # string representation of what was parsed

    def __post_init__(self):
        # NOTE(review): subclasses built with tokens=None and source=None reach
        # get_text_from_tokens(None) here — presumably it tolerates None; confirm.
        if self.source is None:
            self.source = core.utils.get_text_from_tokens(self.tokens)

    def __eq__(self, other):
        if not isinstance(other, LexerNode):
            return False
        return self.start == other.start and \
            self.end == other.end and \
            self.source == other.source and \
            self.tokens == other.tokens

    def fix_source(self, force=True):
        """Recompute `source` from `tokens` (always when force, else only if missing)."""
        if force or self.source is None:
            self.source = core.utils.get_text_from_tokens(self.tokens)
        return self

    def clone(self):
        pass

    def to_short_str(self):
        raise NotImplementedError


class UnrecognizedTokensNode(LexerNode):
    """Accumulates raw tokens that no parser recognized (mutable bucket)."""

    def __init__(self, start, end, tokens):
        super().__init__(start, end, tokens)
        self.is_frozen = False  # TODO: Remove as it seems to now be useless
        self.parenthesis_count = 0  # open-minus-closed parenthesis balance

    def freeze(self):
        # TODO: Remove as it seems to now be useless
        self.is_frozen = True

    def reset(self):
        """Empty the node back to its pristine state."""
        self.start = self.end = -1
        self.tokens.clear()
        self.is_frozen = False
        self.parenthesis_count = 0
        self.source = ""

    def add_token(self, token, pos):
        """Append `token` located at index `pos`, filling a one-token gap with whitespace."""
        if self.is_frozen:
            raise Exception("The node is frozen")
        if self.end != -1 and pos == self.end + 2:
            # add the missing whitespace
            p = self.tokens[-1]  # previous token
            self.tokens.append(Token(TokenKind.WHITESPACE, " ", p.index + 1,
                                     p.line, p.column + 1))
        self.tokens.append(token)
        self.end = pos
        if self.start == -1:
            self.start = pos
        if token.type == TokenKind.LPAR:
            self.parenthesis_count += 1
        if token.type == TokenKind.RPAR:
            self.parenthesis_count -= 1
        return self

    def pop(self, token_kind):
        """Remove the last token if it has kind `token_kind`; reset when emptied."""
        if self.is_frozen:
            raise Exception("The node is frozen")
        if len(self.tokens) > 0 and self.tokens[-1].type == token_kind:
            self.tokens.pop()
            if len(self.tokens) == 0:
                self.reset()
            else:
                self.end -= 1

    def has_open_paren(self):
        return self.parenthesis_count > 0

    def not_whitespace(self):
        return not self.is_whitespace()

    def is_whitespace(self):
        """True when every held token is whitespace or a newline."""
        for t in self.tokens:
            if t.type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
                return False
        return True

    def is_empty(self):
        return len(self.tokens) == 0

    def last_token_type(self):
        if len(self.tokens) == 0:
            return None
        return self.tokens[-1].type

    def __eq__(self, other):
        if isinstance(other, utnode):
            return self.start == other.start and \
                self.end == other.end and \
                self.source == other.source
        if isinstance(other, UTN):
            return other == self  # delegate to the tester class
        if not isinstance(other, UnrecognizedTokensNode):
            return False
        return self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"UnrecognizedTokensNode(source='{self.source}', start={self.start}, end={self.end})"

    def clone(self):
        clone = UnrecognizedTokensNode(self.start, self.end, self.tokens[:])
        clone.is_frozen = self.is_frozen
        clone.parenthesis_count = self.parenthesis_count
        return clone

    def to_short_str(self):
        return f"UTN('{self.source}')"


class RuleNode(LexerNode):
    """A node that matched a grammar Rule."""

    def __init__(self, rule, start, end, tokens=None, source=None):
        super().__init__(start, end, tokens, source)
        self.rule = rule
        self.fix_source(False)

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, RN):
            return other == self  # delegate to the tester class
        if not isinstance(other, RuleNode):
            return False
        return self.rule == other.rule and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.rule, self.start, self.end, self.source))

    def __repr__(self):
        return f"RuleNode(rule='{self.rule}', source='{self.source}', start={self.start}, end={self.end})"

    def clone(self):
        return RuleNode(self.rule, self.start, self.end, self.tokens, self.source)

    def to_short_str(self):
        return f'RN({self.rule})'


class ConceptNode(LexerNode):
    """
    Returned by the BnfNodeParser
    It represents a recognized concept
    """

    def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
        super().__init__(start, end, tokens, source)
        self.concept = concept
        self.underlying = underlying
        self.fix_source(False)

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, (CN, CNC)):
            return other == self  # delegate to the tester classes
        if isinstance(other, cnode):
            return self.concept.key == other.concept_key and \
                self.start == other.start and \
                self.end == other.end and \
                self.source == other.source
        if isinstance(other, short_cnode):
            return self.concept.key == other.concept_key and self.source == other.source
        if not isinstance(other, ConceptNode):
            return False
        return self.concept == other.concept and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source and \
            self.underlying == other.underlying

    def __hash__(self):
        return hash((self.concept, self.start, self.end, self.source, self.underlying))

    def __repr__(self):
        text = f"ConceptNode(concept='{self.concept}', source='{self.source}', start={self.start}, end={self.end}"
        if DEBUG_COMPILED:
            for k, v in self.concept.get_compiled().items():
                text += f", {k}='{v}'"
        return text + ")"

    def clone(self):
        # do we need to clone the concept as well ?
        clone = ConceptNode(self.concept, self.start, self.end, self.tokens, self.source, self.underlying)
        return clone

    def as_bag(self):
        """
        Creates a dictionary with the useful properties of the ConceptNode
        see Concept.as_bag() for extra informations
        """
        bag = {}
        for k, v in self.__dict__.items():
            bag[k] = v
        # if isinstance(self.concept, Concept):
        #     bag["compiled"] = self.concept.get_compiled()
        return bag

    def to_short_str(self):
        return f'CN({self.concept})'


class SourceCodeNode(LexerNode):
    """
    Returned when some source code (like Python source code is recognized)
    """

    def __init__(self, start, end, tokens=None, source=None, python_node=None, return_value=None):
        """
        :param start: start position (index of the first token)
        :param end: end position (index of the last token)
        :param tokens:
        :param source: tokens as string
        :param python_node: PythonNode found (when the SourceCodeNode is validated)
        :param return_value: ReturnValueConcept returned when the source was validated
            When return_value is provided, You should have return_value.body.body == node
        """
        super().__init__(start, end, tokens, source)
        self.python_node = python_node  # The PythonNode (or whatever language node) that is found
        self.return_value = return_value  # original result of the parsing

    def __eq__(self, other):
        if isinstance(other, scnode):
            return self.start == other.start and \
                self.end == other.end and \
                self.source == other.source
        if isinstance(other, SCN):
            return other == self  # delegate to the tester class
        if not isinstance(other, SourceCodeNode):
            return False
        return self.python_node == other.python_node and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.start, self.end, self.source))

    def __repr__(self):
        return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"

    def to_short_str(self):
        return f"SCN('{self.source}')"


class SourceCodeWithConceptNode(LexerNode):
    """
    Kind of temporary version for SourceCodeNode
    I know that there is some code, I know that there are some concepts
    I just don't want to make the glue yet
    So I push all the nodes into one big bag
    """

    def __init__(self, first_node, last_node, content_nodes=None, has_unrecognized=False):
        super().__init__(9999, -1, None)  # why not sys.maxint ?
        self.first = first_node
        self.last = last_node
        self.nodes = content_nodes or []
        self.has_unrecognized = has_unrecognized
        self._all_nodes = None  # lazy cache for get_all_nodes()
        self.fix_all_pos()
        self.python_node = None  # if the source code node is validated against a python parse, here is the PythonNode
        self.return_value = None  # return_value that produced the PythonNode

    def add_node(self, node):
        """Append a content node, widen start/end, and invalidate the node cache."""
        self.nodes.append(node)
        self.fix_pos(node)
        self._all_nodes = None
        return self

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, SCWC):
            return other == self  # delegate to the tester class
        if not isinstance(other, SourceCodeWithConceptNode):
            return False
        if self.start != other.start or self.end != other.end:
            return False
        if self.first != other.first:
            return False
        if self.last != other.last:
            return False
        if len(self.nodes) != len(other.nodes):
            return False
        for self_node, other_node in zip(self.nodes, other.nodes):
            if self_node != other_node:
                return False
        # at last
        return True

    def __hash__(self):
        return hash((self.first, self.last, len(self.nodes)))

    def __repr__(self):
        return f"SourceCodeWithConceptNode(start={self.start}, end={self.end}, source='{self.source}')"

    def fix_all_pos(self):
        """Widen start/end to cover first, last and every content node."""
        if self.first is None:
            # to ease some unit test where only the python_node is necessary
            return
        for n in [self.first, self.last] + self.nodes:
            self.fix_pos(n)

    def fix_pos(self, node):
        """Widen start/end so that `node` (if it carries positions) is covered."""
        if hasattr(node, "start") and node.start is not None:
            if node.start < self.start:
                self.start = node.start
        if hasattr(node, "end") and node.end is not None:
            if node.end > self.end:
                self.end = node.end
        return self

    def pseudo_fix_source(self):
        """
        pseudo because the code is not that clean !
        :return:
        """
        self.source = self.first.source
        for n in self.nodes:
            self.source += " "
            if hasattr(n, "source"):
                self.source += n.source
            elif hasattr(n, "concept"):
                self.source += str(n.concept)
            else:
                self.source += " unknown"
        self.source += self.last.source
        return self

    def get_all_nodes(self):
        """Return [first, *nodes, last], cached until the next add_node()."""
        if self._all_nodes:
            return self._all_nodes
        self._all_nodes = [self.first, *self.nodes, self.last]
        return self._all_nodes

    def clone(self):
        clone = SourceCodeWithConceptNode(self.first, self.last, self.nodes.copy(), self.has_unrecognized)
        return clone

    def to_short_str(self):
        # BUG FIX: to_short_str was referenced without parentheses, so join()
        # received bound methods instead of strings and raised TypeError.
        return f"SCWC({self.first}" + ", ".join(n.to_short_str() for n in self.nodes) + f"{self.last})"


@dataclass()
class GrammarErrorNode(ParsingError):
    message: str


class SyaAssociativity(Enum):
    """Operator associativity used by the shunting-yard resolution."""
    Left = "left"
    Right = "right"
    No = "No"

    def __repr__(self):
        return self.value


# Lightweight tuple stand-ins accepted by the nodes' __eq__ (used in tests).
cnode = namedtuple("ConceptNode", "concept_key start end source")
short_cnode = namedtuple("ConceptNode", "concept_key source")
utnode = namedtuple("utnode", "start end source")
scnode = namedtuple("scnode", "start end source")


class HelperWithPos:
    """Base for the tester classes: optional start/end that can be pinned or inferred."""

    def __init__(self, start=None, end=None):
        self.start = start
        self.end = end
        # When a position was given explicitly it is "fixed" and fix_pos won't move it.
        self.start_is_fixed = start is not None
        self.end_is_fixed = end is not None

    def fix_pos(self, node):
        """Widen non-fixed start/end from a node (attributes) or a (start, end, ...) tuple."""
        if not self.start_is_fixed:
            start = node.start if hasattr(node, "start") else \
                node[0] if isinstance(node, tuple) else None
            if start is not None and (self.start is None or start < self.start):
                self.start = start
        if not self.end_is_fixed:
            end = node.end if hasattr(node, "end") else \
                node[1] if isinstance(node, tuple) else None
            if end is not None and (self.end is None or end > self.end):
                self.end = end
        return self


class SCN(HelperWithPos):
    """
    SourceCodeNode tester class
    It matches with SourceCodeNode but with less constraints
    SCN == SourceCodeNode if source, start, end (start and end are not validated when None)
    """

    def __init__(self, source, start=None, end=None):
        super().__init__(start, end)
        self.source = source

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, SourceCodeNode):
            if self.source != other.source:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            return True
        # BUG FIX: this fallback tested `isinstance(other, CN)` (a different tester
        # class SCN does not inherit from), so SCN == SCN always returned False.
        if not isinstance(other, SCN):
            return False
        return self.source == other.source and \
            self.start == other.start and \
            self.end == other.end

    def __hash__(self):
        return hash((self.source, self.start, self.end))

    def __repr__(self):
        txt = f"SCN(source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class SCWC(HelperWithPos):
    """
    SourceNodeWithConcept tester class
    It matches with a SourceNodeWithConcept but it's easier to instantiate during the tests
    """

    def __init__(self, first, last, *args):
        super().__init__(None, None)
        self.first = first
        self.last = last
        self.content = args

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, SourceCodeWithConceptNode):
            if self.first != other.first:
                return False
            if self.last != other.last:
                return False
            if len(self.content) != len(other.nodes):
                return False
            for self_node, other_node in zip(self.content, other.nodes):
                if self_node != other_node:
                    return False
            # at last
            return True
        # Previously fell through returning None (falsy); make the falsy result explicit.
        return False

    def __repr__(self):
        txt = "SCWC("
        if self.start is not None:
            txt += f"start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        txt += f", source='{self.source}'"
        return txt + ")"

    @property
    def source(self):
        """
        this code is a copy and paste from SourceCodeWithConceptNode.pseudo_fix_source
        TODO: create a common function or whatever...
        :return:
        """
        source = self.first.source if hasattr(self.first, "source") else self.first
        for n in self.content:
            source += " "
            if hasattr(n, "source"):
                source += n.source
            elif hasattr(n, "concept"):
                source += str(n.concept)
            else:
                source += " unknown"
        source += self.last.source if hasattr(self.last, "source") else self.last
        return source


class CN(HelperWithPos):
    """
    ConceptNode tester class
    It matches with ConceptNode but with less constraints
    CN == ConceptNode if concept key, start, end and source are the same
    """

    def __init__(self, concept, start=None, end=None, source=None):
        """
        :param concept: Concept or concept_key (only the key is used anyway)
        :param start:
        :param end:
        :param source:
        """
        super().__init__(start, end)
        self.concept_key = concept.key if isinstance(concept, Concept) else concept
        self.source = source
        self.concept = concept if isinstance(concept, Concept) else None

    def fix_source(self, str_tokens):
        self.source = "".join(str_tokens)
        return self

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, ConceptNode):
            if other.concept is None:
                return False
            if other.concept.key != self.concept_key:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            if self.source is not None and self.source != other.source:
                return False
            return True
        if not isinstance(other, CN):
            return False
        return self.concept_key == other.concept_key and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.concept_key, self.start, self.end, self.source))

    def __repr__(self):
        if self.concept:
            txt = f"CN(concept='{self.concept}'"
        else:
            txt = f"CN(concept_key='{self.concept_key}'"
        txt += f", source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class CNC(CN):
    """
    ConceptNode for Compiled tester class
    It matches with ConceptNode
    But focuses on the 'compiled' property of the concept
    CNC == ConceptNode if CNC.get_compiled() == ConceptNode.concept.get_compiled()
    """

    def __init__(self, concept_key, start=None, end=None, source=None, exclude_body=False, **kwargs):
        super().__init__(concept_key, start, end, source)
        self.compiled = kwargs
        self.exclude_body = exclude_body
        # normalize the friendly "body" kwarg to the canonical ConceptParts key
        if "body" in self.compiled:
            self.compiled[ConceptParts.BODY] = self.compiled["body"]
            del self.compiled["body"]

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, ConceptNode):
            if other.concept is None:
                return False
            if other.concept.key != self.concept_key:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            if self.source is not None and self.source != other.source:
                return False
            if self.exclude_body:
                to_compare = {k: v for k, v in other.concept.get_compiled().items() if k != ConceptParts.BODY}
            else:
                to_compare = other.concept.get_compiled()
            if self.compiled == to_compare:
                # expanded form to ease the debug
                return True
            else:
                return False
        if not isinstance(other, CNC):
            return False
        return self.concept_key == other.concept_key and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source and \
            self.compiled == other.compiled

    # BUG FIX: defining __eq__ without __hash__ implicitly sets __hash__ = None,
    # making CNC unhashable although its parent CN is hashable; restore it.
    __hash__ = CN.__hash__

    def __repr__(self):
        if self.concept:
            txt = f"CNC(concept='{self.concept}'"
        else:
            txt = f"CNC(concept_key='{self.concept_key}'"
        txt += f", source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        for k, v in self.compiled.items():
            txt += f", {k}='{v}'"
        return txt + ")"


class UTN(HelperWithPos):
    """
    Tester class for UnrecognizedTokenNode
    compare the source, and start, end if defined
    """

    def __init__(self, source, start=None, end=None):
        """
        :param source:
        :param start:
        :param end:
        """
        super().__init__(start, end)
        self.source = source

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, UnrecognizedTokensNode):
            # NOTE(review): unlike SCN/CN, start/end are compared even when None,
            # which contradicts the docstring ("if defined") — confirm intent.
            return self.start == other.start and \
                self.end == other.end and \
                self.source == other.source
        if not isinstance(other, UTN):
            return False
        return self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.source, self.start, self.end))

    def __repr__(self):
        txt = f"UTN(source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class RN(HelperWithPos):
    """
    Helper class to test RuleNode
    """

    def __init__(self, rule, start=None, end=None, source=None):
        """
        :param rule: Rule or rule id (only the id is used anyway)
        :param start:
        :param end:
        :param source:
        """
        super().__init__(start, end)
        self.rule_id = rule.id if isinstance(rule, Rule) else rule
        self.source = source or core.utils.str_concept((None, self.rule_id), prefix="r:")
        self.rule = rule if isinstance(rule, Rule) else None

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        if isinstance(other, RuleNode):
            if other.rule is None:
                return False
            if other.rule.id != self.rule_id:
                return False
            if self.start is not None and self.start != other.start:
                return False
            if self.end is not None and self.end != other.end:
                return False
            if self.source is not None and self.source != other.source:
                return False
            return True
        if not isinstance(other, RN):
            return False
        return self.rule_id == other.rule_id and \
            self.start == other.start and \
            self.end == other.end and \
            self.source == other.source

    def __hash__(self):
        return hash((self.rule_id, self.start, self.end, self.source))

    def __repr__(self):
        if self.rule:
            txt = f"RN(rule='{self.rule}'"
        else:
            txt = f"RN(rule_id='{self.rule_id}'"
        txt += f", source='{self.source}'"
        if self.start is not None:
            txt += f", start={self.start}"
        if self.end is not None:
            txt += f", end={self.end}"
        return txt + ")"


class BaseNodeParser(BaseParser):
    """
    Parser that return LexerNode
    """

    def __init__(self, name, priority, **kwargs):
        super().__init__(name, priority, yield_eof=True)
        if 'sheerka' in kwargs:
            sheerka = kwargs.get("sheerka")
            self.concepts_by_first_keyword = sheerka.resolved_concepts_by_first_keyword
        else:
            self.concepts_by_first_keyword = None

    def init_from_concepts(self, context, concepts, **kwargs):
        """
        Initialize the parser with a list of concepts
        For unit tests convenience
        :param context:
        :param concepts:
        :return:
        """
        concepts_by_first_keyword = self.get_concepts_by_first_token(context, concepts).body
        self.concepts_by_first_keyword = self.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword).body

    def get_concepts(self, token, to_keep, custom=None, to_map=None, strip_quotes=False):
        """
        Tries to find if there are concepts that match the value of the token
        Caution: Returns the actual cache, not a copy
        :param token:
        :param to_keep: predicate to tell if the concept is eligible
        :param custom: lambda name -> List[Concepts] that gives extra concepts, according to the name
        :param to_map:
        :param strip_quotes: Remove quotes from strings
        :return:
        """
        if token.type == TokenKind.WHITESPACE:
            return None
        if token.type == TokenKind.STRING:
            name = token.value[1:-1] if strip_quotes else token.value
        else:
            name = token.value
        custom_concepts = custom(name) if custom else []  # to get extra concepts using an alternative method
        result = []
        if name in self.concepts_by_first_keyword:
            for concept_id in self.concepts_by_first_keyword.get(name):
                concept = self.sheerka.get_by_id(concept_id)
                if not to_keep(concept):
                    continue
                concept = to_map(concept, self, self.sheerka) if to_map else concept
                result.append(concept)
            return core.utils.make_unique(result + custom_concepts,
                                          lambda c: c.concept.id if hasattr(c, "concept") else c.id)
        return custom_concepts if custom else None

    @staticmethod
    def get_concepts_by_first_token(context, concepts, use_sheerka=False, previous_entries=None):
        """
        Create the map describing the first token expected by a concept
        :param context:
        :param concepts: lists of concepts to parse
        :param use_sheerka: if True, update concepts_by_first_keyword from sheerka
        :param previous_entries:
        :return:
        """
        sheerka = context.sheerka
        res = sheerka.cache_manager.copy(sheerka.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) if use_sheerka \
            else (previous_entries or {})
        for concept in concepts:
            keywords = BaseNodeParser.get_first_tokens(sheerka, concept)
            if keywords is None:
                # no first token found for a concept ?
                return sheerka.ret(sheerka.name, False, NoFirstTokenError(concept, concept.key))
            for keyword in keywords:
                res.setdefault(keyword, []).append(concept.id)
        # 'uniquify' the lists
        for k, v in res.items():
            res[k] = core.utils.make_unique(v)
        return sheerka.ret("BaseNodeParser", True, res)

    @staticmethod
    def resolve_concepts_by_first_keyword(context, concepts_by_first_keyword, modified_concepts=None):
        """
        Replace every 'c:|...' pseudo first-keyword by the concrete first tokens of
        the referenced concept(s); circular references are logged and recorded in
        sheerka.chicken_and_eggs.
        """
        sheerka = context.sheerka
        res = {}

        def get_by_id(c_id):
            # prefer the caller-provided overrides when available
            if modified_concepts and c_id in modified_concepts:
                return modified_concepts[c_id]
            return sheerka.get_by_id(c_id)

        def resolve_concepts(concept_str):
            # `already_seen` is the per-top-level-key recursion guard (closure)
            c_key, c_id = core.utils.unstr_concept(concept_str)
            if c_id in already_seen:
                return ChickenAndEggError(already_seen)
            already_seen.add(c_id)
            resolved = set()
            to_resolve = set()
            chicken_and_egg = set()
            concept = get_by_id(c_id)
            if sheerka.isaset(context, concept):
                concepts = sheerka.get_set_elements(context, concept)
            else:
                concepts = [concept]
            for concept in concepts:
                BaseNodeParser.ensure_bnf(context, concept)  # need to make sure that it cannot fail
                keywords = BaseNodeParser.get_first_tokens(sheerka, concept)
                for keyword in keywords:
                    (to_resolve if keyword.startswith("c:|") else resolved).add(keyword)
            for concept_to_resolve_str in to_resolve:
                sub = resolve_concepts(concept_to_resolve_str)
                if isinstance(sub, ChickenAndEggError):
                    chicken_and_egg |= sub.concepts
                else:
                    resolved |= sub
            to_resolve.clear()
            if len(resolved) == 0 and len(chicken_and_egg) > 0:
                raise ChickenAndEggError(chicken_and_egg)
            else:
                return resolved

        for k, v in concepts_by_first_keyword.items():
            if k.startswith("c:|"):
                try:
                    already_seen = set()
                    resolved_keywords = resolve_concepts(k)
                    for resolved in resolved_keywords:
                        res.setdefault(resolved, []).extend(v)
                except ChickenAndEggError as ex:
                    context.log(f"Chicken and egg detected for {k}, concepts={ex.concepts}")
                    concepts_in_recursion = ex.concepts
                    # make sure to have all the parents
                    for parent in v:
                        concepts_in_recursion.add(parent)
                    for concept_id in concepts_in_recursion:
                        # make sure we keep the longest chain
                        old = sheerka.chicken_and_eggs.get(concept_id)
                        if old is None or len(old) < len(ex.concepts):
                            sheerka.chicken_and_eggs.put(concept_id, concepts_in_recursion)
            else:
                res.setdefault(k, []).extend(v)
        # 'uniquify' the lists
        for k, v in res.items():
            res[k] = core.utils.make_unique(v)
        return sheerka.ret("BaseNodeParser", True, res)

    @staticmethod
    def get_referenced_concepts(context, concept_id, already_seen):
        """
        Gets all the tokens that may allow to recognize concept concept_id
        Basically, it returns all the starting tokens for concept concept_id
        CHICKEN_AND_EGG is returned when a circular references are found
        :param context:
        :param concept_id:
        :param already_seen:
        :return:
        """
        if concept_id in already_seen:
            return ChickenAndEggError(already_seen)
        already_seen.add(concept_id)
        resolved = set()
        to_resolve = set()
        chicken_and_egg = set()
        sheerka = context.sheerka
        concept = sheerka.get_by_id(concept_id)
        if sheerka.isaset(context, concept):
            concepts = sheerka.get_set_elements(context, concept)
        else:
            concepts = [concept]
        for concept in concepts:
            BaseNodeParser.ensure_bnf(context, concept)  # need to make sure that it cannot fail
            keywords = BaseNodeParser.get_first_tokens(sheerka, concept)
            for keyword in keywords:
                (to_resolve if keyword.startswith("c:|") else resolved).add(keyword)
        for concept_to_resolve_str in to_resolve:
            c_key, c_id = core.utils.unstr_concept(concept_to_resolve_str)
            res = BaseNodeParser.get_referenced_concepts(context, c_id, already_seen)
            if isinstance(res, ChickenAndEggError):
                chicken_and_egg |= res.concepts
            else:
                resolved |= res
        to_resolve.clear()
        if len(resolved) == 0 and len(chicken_and_egg) > 0:
            raise ChickenAndEggError(chicken_and_egg)
        else:
            return resolved

    @staticmethod
    def resolve_sya_associativity_and_precedence(context, sya):
        pass

    @staticmethod
    def get_first_tokens(sheerka, concept):
        """
        Return the list of possible first tokens for `concept`:
        from its BNF when it has one, otherwise the first non-variable
        word of its key; None when no first token can be determined.
        :param sheerka:
        :param concept:
        :return:
        """
        if concept.get_bnf():
            # local import avoids a circular dependency with BnfNodeParser
            from parsers.BnfNodeParser import BnfNodeFirstTokenVisitor
            bnf_visitor = BnfNodeFirstTokenVisitor(sheerka)
            bnf_visitor.visit(concept.get_bnf())
            return bnf_visitor.first_tokens
        else:
            keywords = concept.key.split()
            for keyword in keywords:
                if keyword.startswith(VARIABLE_PREFIX):
                    continue
                return [keyword]
            return None

    @staticmethod
    def ensure_bnf(context, concept, parser_name="BaseNodeParser"):
        """Parse and attach the concept's BNF definition when it is declared but not yet built."""
        if concept.get_metadata().definition_type == DEFINITION_TYPE_BNF and not concept.get_bnf():
            # local import avoids a circular dependency with BnfDefinitionParser
            from parsers.BnfDefinitionParser import BnfDefinitionParser
            regex_parser = BnfDefinitionParser()
            desc = f"Resolving BNF '{concept.get_metadata().definition}'"
            with context.push(BuiltinConcepts.INIT_BNF, concept, who=parser_name, obj=concept,
                              desc=desc) as sub_context:
                sub_context.add_inputs(parser_input=concept.get_metadata().definition)
                bnf_parsing_ret_val = regex_parser.parse(sub_context, concept.get_metadata().definition)
                sub_context.add_values(return_values=bnf_parsing_ret_val)
                if not bnf_parsing_ret_val.status:
                    raise Exception(bnf_parsing_ret_val.value)
                concept.set_bnf(bnf_parsing_ret_val.body.body)
                if concept.id:
                    context.sheerka.get_by_id(concept.id).set_bnf(concept.get_bnf())  # update bnf in cache