Files
Sheerka-Old/src/parsers/BaseNodeParser.py
T

838 lines
25 KiB
Python

from collections import namedtuple
from dataclasses import dataclass
from enum import Enum
import core.utils
from core.concept import Concept, ConceptParts
from core.rule import Rule
from core.tokenizer import TokenKind, Token
from parsers.BaseParser import Node, BaseParser, ParsingError
DEBUG_COMPILED = True
@dataclass()
class LexerNode(Node):
start: int # starting index in the tokens list
end: int # ending index in the tokens list
tokens: list = None # tokens
source: str = None # string representation of what was parsed
def __post_init__(self):
if self.source is None:
self.source = core.utils.get_text_from_tokens(self.tokens)
def __eq__(self, other):
if not isinstance(other, LexerNode):
return False
return self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.tokens == other.tokens
def fix_source(self, force=True):
if force or self.source is None:
self.source = core.utils.get_text_from_tokens(self.tokens)
return self
def clone(self):
pass
def to_short_str(self):
raise NotImplementedError
def get_source_to_parse(self):
return self.source
class UnrecognizedTokensNode(LexerNode):
def __init__(self, start, end, tokens):
super().__init__(start, end, tokens)
self.is_frozen = False # TODO: Remove as it seems to now be useless
self.parenthesis_count = 0
def freeze(self):
# TODO: Remove as it seems to now be useless
self.is_frozen = True
def reset(self):
self.start = self.end = -1
self.tokens.clear()
self.is_frozen = False
self.parenthesis_count = 0
self.source = ""
def add_token(self, token, pos):
if self.is_frozen:
raise Exception("The node is frozen")
if self.end != -1 and pos == self.end + 2:
# add the missing whitespace
p = self.tokens[-1] # previous token
self.tokens.append(Token(TokenKind.WHITESPACE, " ", p.index + 1, p.line, p.column + 1))
self.tokens.append(token)
self.end = pos
if self.start == -1:
self.start = pos
if token.type == TokenKind.LPAR:
self.parenthesis_count += 1
if token.type == TokenKind.RPAR:
self.parenthesis_count -= 1
return self
def pop(self, token_kind):
if self.is_frozen:
raise Exception("The node is frozen")
if len(self.tokens) > 0 and self.tokens[-1].type == token_kind:
self.tokens.pop()
if len(self.tokens) == 0:
self.reset()
else:
self.end -= 1
def has_open_paren(self):
return self.parenthesis_count > 0
def not_whitespace(self):
return not self.is_whitespace()
def is_whitespace(self):
for t in self.tokens:
if t.type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
return False
return True
def is_empty(self):
return len(self.tokens) == 0
def last_token_type(self):
if len(self.tokens) == 0:
return None
return self.tokens[-1].type
def __eq__(self, other):
if isinstance(other, utnode):
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
if isinstance(other, UTN):
return other == self
if not isinstance(other, UnrecognizedTokensNode):
return False
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.start, self.end, self.source))
def __repr__(self):
return f"UnrecognizedTokensNode(source='{self.source}', start={self.start}, end={self.end})"
def clone(self):
clone = UnrecognizedTokensNode(self.start, self.end, self.tokens[:])
clone.is_frozen = self.is_frozen
clone.parenthesis_count = self.parenthesis_count
return clone
def to_short_str(self):
return f"UTN('{self.source}')"
class RuleNode(LexerNode):
def __init__(self, rule, start, end, tokens=None, source=None):
super().__init__(start, end, tokens, source)
self.rule = rule
self.fix_source(False)
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, RN):
return other == self
if not isinstance(other, RuleNode):
return False
return self.rule == other.rule and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.rule, self.start, self.end, self.source))
def __repr__(self):
return f"RuleNode(rule='{self.rule}', source='{self.source}', start={self.start}, end={self.end})"
def clone(self):
return RuleNode(self.rule, self.start, self.end, self.tokens, self.source)
def to_short_str(self):
return f'RN({self.rule})'
class ConceptNode(LexerNode):
"""
Returned by the BnfNodeParser
It represents a recognized concept
"""
def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
super().__init__(start, end, tokens, source)
self.concept = concept
self.underlying = underlying
self.fix_source(False)
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, (CN, CNC)):
return other == self
if isinstance(other, cnode):
return self.concept.key == other.concept_key and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
if isinstance(other, short_cnode):
return self.concept.key == other.concept_key and self.source == other.source
if not isinstance(other, ConceptNode):
return False
return self.concept == other.concept and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.underlying == other.underlying
def __hash__(self):
return hash((self.concept, self.start, self.end, self.source, self.underlying))
def __repr__(self):
text = f"ConceptNode(concept='{self.concept}', source='{self.source}', start={self.start}, end={self.end}"
if DEBUG_COMPILED:
for k, v in self.concept.get_compiled().items():
text += f", {k}='{v}'"
return text + ")"
def clone(self):
# do we need to clone the concept as well ?
clone = ConceptNode(self.concept, self.start, self.end, self.tokens, self.source, self.underlying)
return clone
def as_bag(self):
"""
Creates a dictionary with the useful properties of the ConceptNode
see Concept.as_bag() for extra informations
"""
bag = {}
for k, v in self.__dict__.items():
bag[k] = v
# if isinstance(self.concept, Concept):
# bag["compiled"] = self.concept.get_compiled()
return bag
def to_short_str(self):
return f'CN({self.concept})'
class SourceCodeNode(LexerNode):
"""
Returned when some source code (like Python source code is recognized)
"""
def __init__(self, start, end, tokens=None, source=None, python_node=None, return_value=None):
"""
:param start: start position (index of the first token)
:param end: end position (index of the last token)
:param tokens:
:param source: tokens as string
:param python_node: PythonNode found (when the SourceCodeNode is validated)
:param return_value: ReturnValueConcept returned when the source was validated
When return_value is provided,
You should have return_value.body.body == node
"""
super().__init__(start, end, tokens, source)
self.python_node = python_node # The PythonNode (or whatever language node) that is found
self.return_value = return_value # original result of the parsing
def __eq__(self, other):
if isinstance(other, scnode):
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
if isinstance(other, SCN):
return other == self
if not isinstance(other, SourceCodeNode):
return False
return self.python_node == other.python_node and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.start, self.end, self.source))
def __repr__(self):
return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
def to_short_str(self):
return f"SCN('{self.source}')"
def get_python_node(self):
return self.python_node
def get_source_to_parse(self):
return self.python_node.source
class SourceCodeWithConceptNode(LexerNode):
"""
Kind of temporary version for SourceCodeNode
I know that there is some code,
I know that there are some concepts
I just don't want to make the glue yet
So I push all the nodes into one big bag
"""
def __init__(self, first_node, last_node, content_nodes=None, has_unrecognized=False):
super().__init__(9999, -1, None) # why not sys.maxint ?
self.first = first_node
self.last = last_node
self.nodes = content_nodes or []
self.has_unrecognized = has_unrecognized
self._all_nodes = None
self.fix_all_pos()
self.python_node = None # if the source code node is validated against a python parse, here is the PythonNode
self.return_value = None # return_value that produced the PythonNode
def add_node(self, node):
self.nodes.append(node)
self.fix_pos(node)
self._all_nodes = None
return self
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, SCWC):
return other == self
if not isinstance(other, SourceCodeWithConceptNode):
return False
if self.start != other.start or self.end != other.end:
return False
if self.first != other.first:
return False
if self.last != other.last:
return False
if len(self.nodes) != len(other.nodes):
return False
for self_node, other_node in zip(self.nodes, other.nodes):
if self_node != other_node:
return False
# at last
return True
def __hash__(self):
return hash((self.first, self.last, len(self.nodes)))
def __repr__(self):
return f"SourceCodeWithConceptNode(start={self.start}, end={self.end}, source='{self.source}')"
def fix_all_pos(self):
if self.first is None: # to ease some unit test where only the python_node is necessary
return
for n in [self.first, self.last] + self.nodes:
self.fix_pos(n)
def fix_pos(self, node):
if hasattr(node, "start") and node.start is not None:
if node.start < self.start:
self.start = node.start
if hasattr(node, "end") and node.end is not None:
if node.end > self.end:
self.end = node.end
return self
def pseudo_fix_source(self):
"""
pseudo because the code is not that clean !
:return:
"""
self.source = self.first.source
for n in self.nodes:
self.source += " "
if hasattr(n, "source"):
self.source += n.source
elif hasattr(n, "concept"):
self.source += str(n.concept)
else:
self.source += " unknown"
self.source += self.last.source
return self
def get_all_nodes(self):
if self._all_nodes:
return self._all_nodes
self._all_nodes = [self.first, *self.nodes, self.last]
return self._all_nodes
def clone(self):
clone = SourceCodeWithConceptNode(self.first, self.last, self.nodes.copy(), self.has_unrecognized)
return clone
def to_short_str(self):
return f"SCWC({self.first}" + ", ".join(n.to_short_str for n in self.nodes) + f"{self.last})"
def get_python_node(self):
return self.python_node
def get_source_to_parse(self):
return self.python_node.source
@dataclass()
class GrammarErrorNode(ParsingError):
message: str
class SyaAssociativity(Enum):
Left = "left"
Right = "right"
No = "No"
def __repr__(self):
return self.value
cnode = namedtuple("ConceptNode", "concept_key start end source")
short_cnode = namedtuple("ConceptNode", "concept_key source")
utnode = namedtuple("utnode", "start end source")
scnode = namedtuple("scnode", "start end source")
class HelperWithPos:
def __init__(self, start=None, end=None):
self.start = start
self.end = end
self.start_is_fixed = start is not None
self.end_is_fixed = end is not None
def fix_pos(self, node):
if not self.start_is_fixed:
start = node.start if hasattr(node, "start") else \
node[0] if isinstance(node, tuple) else None
if start is not None and (self.start is None or start < self.start):
self.start = start
if not self.end_is_fixed:
end = node.end if hasattr(node, "end") else \
node[1] if isinstance(node, tuple) else None
if end is not None and (self.end is None or end > self.end):
self.end = end
return self
class SCN(HelperWithPos):
"""
SourceCodeNode tester class
It matches with SourceCodeNode but with less constraints
SCN == SourceCodeNode if source, start, end (start and end are not validated when None)
"""
def __init__(self, source, start=None, end=None):
super().__init__(start, end)
self.source = source
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, SourceCodeNode):
if self.source != other.source:
return False
if self.start is not None and self.start != other.start:
return False
if self.end is not None and self.end != other.end:
return False
return True
if not isinstance(other, CN):
return False
return self.source == other.source and \
self.start == other.start and \
self.end == other.end
def __hash__(self):
return hash((self.source, self.start, self.end))
def __repr__(self):
txt = f"SCN(source='{self.source}'"
if self.start is not None:
txt += f", start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
return txt + ")"
class SCWC(HelperWithPos):
"""
SourceNodeWithConcept tester class
It matches with a SourceNodeWithConcept
but it's easier to instantiate during the tests
"""
def __init__(self, first, last, *args):
super().__init__(None, None)
self.first = first
self.last = last
self.content = args
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, SourceCodeWithConceptNode):
if self.first != other.first:
return False
if self.last != other.last:
return False
if len(self.content) != len(other.nodes):
return False
for self_node, other_node in zip(self.content, other.nodes):
if self_node != other_node:
return False
# at last
return True
def __repr__(self):
txt = "SCWC("
if self.start is not None:
txt += f"start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
txt += f", source='{self.source}'"
return txt + ")"
@property
def source(self):
"""
this code is a copy and paste from SourceCodeWithConceptNode.pseudo_fix_source
TODO: create a common function or whatever...
:return:
"""
source = self.first.source if hasattr(self.first, "source") else self.first
for n in self.content:
source += " "
if hasattr(n, "source"):
source += n.source
elif hasattr(n, "concept"):
source += str(n.concept)
else:
source += " unknown"
source += self.last.source if hasattr(self.last, "source") else self.last
return source
class CN(HelperWithPos):
"""
ConceptNode tester class
It matches with ConceptNode but with less constraints
CN == ConceptNode if concept key, start, end and source are the same
"""
def __init__(self, concept, start=None, end=None, source=None):
"""
:param concept: Concept or concept_key (only the key is used anyway)
:param start:
:param end:
:param source:
"""
super().__init__(start, end)
self.concept_key = concept.key if isinstance(concept, Concept) else concept
self.source = source
self.concept = concept if isinstance(concept, Concept) else None
def fix_source(self, str_tokens):
self.source = "".join(str_tokens)
return self
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, ConceptNode):
if other.concept is None:
return False
if other.concept.key != self.concept_key:
return False
if self.start is not None and self.start != other.start:
return False
if self.end is not None and self.end != other.end:
return False
if self.source is not None and self.source != other.source:
return False
return True
if not isinstance(other, CN):
return False
return self.concept_key == other.concept_key and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.concept_key, self.start, self.end, self.source))
def __repr__(self):
if self.concept:
txt = f"CN(concept='{self.concept}'"
else:
txt = f"CN(concept_key='{self.concept_key}'"
txt += f", source='{self.source}'"
if self.start is not None:
txt += f", start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
return txt + ")"
class CNC(CN):
"""
ConceptNode for Compiled tester class
It matches with ConceptNode
But focuses on the 'compiled' property of the concept
CNC == ConceptNode if CNC.get_compiled() == ConceptNode.concept.get_compiled()
"""
def __init__(self, concept_key, start=None, end=None, source=None, exclude_body=False, **kwargs):
super().__init__(concept_key, start, end, source)
self.compiled = kwargs
self.exclude_body = exclude_body
if "body" in self.compiled:
self.compiled[ConceptParts.BODY] = self.compiled["body"]
del self.compiled["body"]
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, ConceptNode):
if other.concept is None:
return False
if other.concept.key != self.concept_key:
return False
if self.start is not None and self.start != other.start:
return False
if self.end is not None and self.end != other.end:
return False
if self.source is not None and self.source != other.source:
return False
if self.exclude_body:
to_compare = {k: v for k, v in other.concept.get_compiled().items() if k != ConceptParts.BODY}
else:
to_compare = other.concept.get_compiled()
if self.compiled == to_compare: # expanded form to ease the debug
return True
else:
return False
if not isinstance(other, CNC):
return False
return self.concept_key == other.concept_key and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source and \
self.compiled == other.compiled
def __repr__(self):
if self.concept:
txt = f"CNC(concept='{self.concept}'"
else:
txt = f"CNC(concept_key='{self.concept_key}'"
txt += f", source='{self.source}'"
if self.start is not None:
txt += f", start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
for k, v in self.compiled.items():
txt += f", {k}='{v}'"
return txt + ")"
class UTN(HelperWithPos):
"""
Tester class for UnrecognizedTokenNode
compare the source, and start, end if defined
"""
def __init__(self, source, start=None, end=None):
"""
:param source:
:param start:
:param end:
"""
super().__init__(start, end)
self.source = source
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, UnrecognizedTokensNode):
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
if not isinstance(other, UTN):
return False
return self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.source, self.start, self.end))
def __repr__(self):
txt = f"UTN(source='{self.source}'"
if self.start is not None:
txt += f", start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
return txt + ")"
class RN(HelperWithPos):
"""
Helper class to test RuleNode
"""
def __init__(self, rule, start=None, end=None, source=None):
"""
:param concept: Concept or concept_key (only the key is used anyway)
:param start:
:param end:
:param source:
"""
super().__init__(start, end)
self.rule_id = rule.id if isinstance(rule, Rule) else rule
self.source = source or core.utils.str_concept((None, self.rule_id), prefix="r:")
self.rule = rule if isinstance(rule, Rule) else None
def __eq__(self, other):
if id(self) == id(other):
return True
if isinstance(other, RuleNode):
if other.rule is None:
return False
if other.rule.id != self.rule_id:
return False
if self.start is not None and self.start != other.start:
return False
if self.end is not None and self.end != other.end:
return False
if self.source is not None and self.source != other.source:
return False
return True
if not isinstance(other, RN):
return False
return self.rule_id == other.rule_id and \
self.start == other.start and \
self.end == other.end and \
self.source == other.source
def __hash__(self):
return hash((self.rule_id, self.start, self.end, self.source))
def __repr__(self):
if self.rule:
txt = f"RN(rule='{self.rule}'"
else:
txt = f"RN(rule_id='{self.rule_id}'"
txt += f", source='{self.source}'"
if self.start is not None:
txt += f", start={self.start}"
if self.end is not None:
txt += f", end={self.end}"
return txt + ")"
class BaseNodeParser(BaseParser):
"""
Parser that return LexerNode
"""
def __init__(self, name, priority, **kwargs):
super().__init__(name, priority, yield_eof=True)
def init_from_concepts(self, context, concepts, **kwargs):
"""
Initialize the parser with a list of concepts
For unit tests convenience
:param context
:param concepts
:return:
"""
from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager
concepts_by_first_keyword = SheerkaConceptManager.compute_concepts_by_first_token(context, concepts).body
resolved = SheerkaConceptManager.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword).body
context.sheerka.om.put(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY,
False,
resolved)