Added SyaNodeParser (finally, after one month)
This commit is contained in:
@@ -0,0 +1,669 @@
|
||||
from collections import namedtuple
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from core.builtin_concepts import BuiltinConcepts
|
||||
from core.concept import VARIABLE_PREFIX, Concept
|
||||
from core.sheerka.ExecutionContext import ExecutionContext
|
||||
from core.tokenizer import TokenKind, LexerError, Token
|
||||
from parsers.BaseParser import Node, BaseParser, ErrorNode
|
||||
|
||||
DEBUG_COMPILED = True
|
||||
|
||||
|
||||
@dataclass()
|
||||
class LexerNode(Node):
|
||||
start: int # starting index in the tokens list
|
||||
end: int # ending index in the tokens list
|
||||
tokens: list = None # tokens
|
||||
source: str = None # string representation of what was parsed
|
||||
|
||||
def __post_init__(self):
|
||||
if self.source is None:
|
||||
self.source = BaseParser.get_text_from_tokens(self.tokens)
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, LexerNode):
|
||||
return False
|
||||
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source and \
|
||||
self.tokens == other.tokens
|
||||
|
||||
def fix_source(self, force=True):
|
||||
if force or self.source is None:
|
||||
self.source = BaseParser.get_text_from_tokens(self.tokens)
|
||||
return self
|
||||
|
||||
|
||||
class UnrecognizedTokensNode(LexerNode):
|
||||
def __init__(self, start, end, tokens):
|
||||
super().__init__(start, end, tokens)
|
||||
self.is_frozen = False
|
||||
self.parenthesis_count = 0
|
||||
|
||||
def freeze(self):
|
||||
self.is_frozen = True
|
||||
|
||||
def reset(self):
|
||||
self.start = self.end = -1
|
||||
self.tokens.clear()
|
||||
self.is_frozen = False
|
||||
self.parenthesis_count = 0
|
||||
|
||||
def has_open_paren(self):
|
||||
return self.parenthesis_count > 0
|
||||
|
||||
def add_token(self, token, pos):
|
||||
if self.is_frozen:
|
||||
raise Exception("The node is frozen")
|
||||
|
||||
if self.end != -1 and pos == self.end + 2:
|
||||
# add the missing whitespace
|
||||
p = self.tokens[-1] # previous token
|
||||
self.tokens.append(Token(TokenKind.WHITESPACE, " ", p.index + 1, p.line, p.column + 1))
|
||||
|
||||
self.tokens.append(token)
|
||||
self.end = pos
|
||||
if self.start == -1:
|
||||
self.start = pos
|
||||
|
||||
if token.type == TokenKind.LPAR:
|
||||
self.parenthesis_count += 1
|
||||
|
||||
if token.type == TokenKind.RPAR:
|
||||
self.parenthesis_count -= 1
|
||||
|
||||
return self
|
||||
|
||||
def not_whitespace(self):
|
||||
return not self.is_whitespace()
|
||||
|
||||
def is_whitespace(self):
|
||||
for t in self.tokens:
|
||||
if t.type not in (TokenKind.WHITESPACE, TokenKind.NEWLINE):
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_empty(self):
|
||||
return len(self.tokens) == 0
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, utnode):
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if isinstance(other, UTN):
|
||||
return other == self
|
||||
|
||||
if not isinstance(other, UnrecognizedTokensNode):
|
||||
return False
|
||||
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.start, self.end, self.source))
|
||||
|
||||
def __repr__(self):
|
||||
return f"UnrecognizedTokensNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
def clone(self):
|
||||
clone = UnrecognizedTokensNode(self.start, self.end, self.tokens[:])
|
||||
clone.is_frozen = self.is_frozen
|
||||
clone.parenthesis_count = self.parenthesis_count
|
||||
return clone
|
||||
|
||||
|
||||
class ConceptNode(LexerNode):
|
||||
"""
|
||||
Returned by the BnfNodeParser
|
||||
It represents a recognized concept
|
||||
"""
|
||||
|
||||
def __init__(self, concept, start, end, tokens=None, source=None, underlying=None):
|
||||
super().__init__(start, end, tokens, source)
|
||||
self.concept = concept
|
||||
self.underlying = underlying
|
||||
self.fix_source(False)
|
||||
|
||||
def __eq__(self, other):
|
||||
if id(self) == id(other):
|
||||
return True
|
||||
|
||||
if isinstance(other, (CN, CNC)):
|
||||
return other == self
|
||||
|
||||
if isinstance(other, cnode):
|
||||
return self.concept.key == other.concept_key and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if isinstance(other, short_cnode):
|
||||
return self.concept.key == other.concept_key and self.source == other.source
|
||||
|
||||
if not isinstance(other, ConceptNode):
|
||||
return False
|
||||
|
||||
return self.concept == other.concept and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source and \
|
||||
self.underlying == other.underlying
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.concept, self.start, self.end, self.source, self.underlying))
|
||||
|
||||
def __repr__(self):
|
||||
text = f"ConceptNode(concept='{self.concept}', source='{self.source}', start={self.start}, end={self.end}"
|
||||
if DEBUG_COMPILED:
|
||||
for k, v in self.concept.compiled.items():
|
||||
text += f", {k}='{v}'"
|
||||
return text + ")"
|
||||
|
||||
def clone(self):
|
||||
# do we need to clone the concept as well ?
|
||||
clone = ConceptNode(self.concept, self.start, self.end, self.tokens, self.source, self.underlying)
|
||||
return clone
|
||||
|
||||
|
||||
class SourceCodeNode(LexerNode):
|
||||
"""
|
||||
Returned when some source code (like Python source code is recognized)
|
||||
"""
|
||||
|
||||
def __init__(self, node, start, end, tokens=None, source=None, return_value=None):
|
||||
super().__init__(start, end, tokens, source)
|
||||
self.node = node # The PythonNode (or whatever language node) that is found
|
||||
self.return_value = return_value # original result of the parsing
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, scnode):
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if not isinstance(other, SourceCodeNode):
|
||||
return False
|
||||
|
||||
return self.node == other.node and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.start, self.end, self.source))
|
||||
|
||||
def __repr__(self):
|
||||
return f"SourceCodeNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
|
||||
class SourceCodeWithConceptNode(LexerNode):
|
||||
"""
|
||||
Kind of temporary version for SourceCodeNode
|
||||
I know that there is some code,
|
||||
I know that there are some concepts
|
||||
I just don't want to make the glue yet
|
||||
|
||||
So I push all the nodes into one big bag
|
||||
"""
|
||||
|
||||
def __init__(self, first_node, last_node, content_nodes=None):
|
||||
super().__init__(9999, -1, None) # why not sys.maxint ?
|
||||
self.first = first_node
|
||||
self.last = last_node
|
||||
self.nodes = content_nodes or []
|
||||
self.has_unrecognized = False
|
||||
self.fix_all_pos()
|
||||
|
||||
def add_node(self, node):
|
||||
self.nodes.append(node)
|
||||
self.fix_pos(node)
|
||||
|
||||
return self
|
||||
|
||||
def __eq__(self, other):
|
||||
if id(self) == id(other):
|
||||
return True
|
||||
|
||||
if not isinstance(other, SourceCodeWithConceptNode):
|
||||
return False
|
||||
|
||||
if self.start != other.start or self.end != other.end:
|
||||
return False
|
||||
|
||||
if self.first != other.first:
|
||||
return False
|
||||
|
||||
if self.last != other.last:
|
||||
return False
|
||||
|
||||
if len(self.nodes) != len(other.nodes):
|
||||
return False
|
||||
|
||||
for self_node, other_node in zip(self.nodes, other.nodes):
|
||||
if self_node != other_node:
|
||||
return False
|
||||
|
||||
# at last
|
||||
return True
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.first, self.last, len(self.nodes)))
|
||||
|
||||
def __repr__(self):
|
||||
return f"SourceCodeWithConceptNode(start={self.start}, end={self.end}, source='{self.source}')"
|
||||
|
||||
def fix_all_pos(self):
|
||||
for n in [self.first, self.last] + self.nodes:
|
||||
self.fix_pos(n)
|
||||
|
||||
def fix_pos(self, node):
|
||||
if hasattr(node, "start") and node.start is not None:
|
||||
if node.start < self.start:
|
||||
self.start = node.start
|
||||
|
||||
if hasattr(node, "end") and node.end is not None:
|
||||
if node.end > self.end:
|
||||
self.end = node.end
|
||||
return self
|
||||
|
||||
def pseudo_fix_source(self):
|
||||
self.source = self.first.source
|
||||
for n in self.nodes:
|
||||
self.source += " "
|
||||
if hasattr(n, "source"):
|
||||
self.source += n.source
|
||||
elif hasattr(n, "concept"):
|
||||
self.source += str(n.concept)
|
||||
else:
|
||||
self.source += " unknown"
|
||||
self.source += self.last.source
|
||||
return self
|
||||
|
||||
def clone(self):
|
||||
clone = SourceCodeWithConceptNode(self.first, self.last, self.nodes)
|
||||
return clone
|
||||
|
||||
|
||||
@dataclass()
|
||||
class GrammarErrorNode(ErrorNode):
|
||||
message: str
|
||||
|
||||
|
||||
class SyaAssociativity(Enum):
|
||||
Left = "left"
|
||||
Right = "right"
|
||||
No = "No"
|
||||
|
||||
def __repr__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
cnode = namedtuple("ConceptNode", "concept_key start end source")
|
||||
short_cnode = namedtuple("ConceptNode", "concept_key source")
|
||||
utnode = namedtuple("utnode", "start end source")
|
||||
scnode = namedtuple("scnode", "start end source")
|
||||
|
||||
|
||||
@dataclass(init=False)
|
||||
class SCWC:
|
||||
"""
|
||||
SourceNodeWithConcept tester class
|
||||
It matches with a SourceNodeWithConcept
|
||||
but it's easier to instantiate during the tests
|
||||
"""
|
||||
first: LexerNode
|
||||
last: LexerNode
|
||||
content: tuple
|
||||
|
||||
def __init__(self, first, last, *args):
|
||||
self.first = first
|
||||
self.last = last
|
||||
self.content = args
|
||||
|
||||
|
||||
class HelperWithPos:
|
||||
def __init__(self, start=None, end=None):
|
||||
self.start = start
|
||||
self.end = end
|
||||
|
||||
self.start_is_fixed = start is not None
|
||||
self.end_is_fixed = end is not None
|
||||
|
||||
def fix_pos(self, node):
|
||||
if not self.start_is_fixed:
|
||||
start = node.start if hasattr(node, "start") else \
|
||||
node[0] if isinstance(node, tuple) else None
|
||||
|
||||
if start is not None and (self.start is None or start < self.start):
|
||||
self.start = start
|
||||
|
||||
if not self.end_is_fixed:
|
||||
end = node.end if hasattr(node, "end") else \
|
||||
node[1] if isinstance(node, tuple) else None
|
||||
|
||||
if end is not None and (self.end is None or end > self.end):
|
||||
self.end = end
|
||||
return self
|
||||
|
||||
|
||||
class CN(HelperWithPos):
|
||||
"""
|
||||
ConceptNode tester class
|
||||
It matches with ConceptNode but with less constraints
|
||||
|
||||
CNC == ConceptNode if concept key, start, end and source are the same
|
||||
"""
|
||||
|
||||
def __init__(self, concept, start=None, end=None, source=None):
|
||||
"""
|
||||
|
||||
:param concept: Concept or concept_key (only the key is used anyway)
|
||||
:param start:
|
||||
:param end:
|
||||
:param source:
|
||||
"""
|
||||
super().__init__(start, end)
|
||||
self.concept_key = concept.key if isinstance(concept, Concept) else concept
|
||||
self.source = source
|
||||
self.concept = concept if isinstance(concept, Concept) else None
|
||||
|
||||
def fix_source(self, str_tokens):
|
||||
self.source = "".join(str_tokens)
|
||||
return self
|
||||
|
||||
def __eq__(self, other):
|
||||
if id(self) == id(other):
|
||||
return True
|
||||
|
||||
if isinstance(other, ConceptNode):
|
||||
if other.concept is None:
|
||||
return False
|
||||
if other.concept.key != self.concept_key:
|
||||
return False
|
||||
if self.start is not None and self.start != other.start:
|
||||
return False
|
||||
if self.end is not None and self.end != other.end:
|
||||
return False
|
||||
return True
|
||||
|
||||
if not isinstance(other, CN):
|
||||
return False
|
||||
|
||||
return self.concept_key == other.concept_key and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.concept_key, self.start, self.end, self.source))
|
||||
|
||||
def __repr__(self):
|
||||
if self.concept:
|
||||
txt = f"CN(concept='{self.concept}'"
|
||||
else:
|
||||
txt = f"CN(concept_key='{self.concept_key}'"
|
||||
txt += f", source='{self.source}'"
|
||||
if self.start is not None:
|
||||
txt += f", start={self.start}"
|
||||
if self.end is not None:
|
||||
txt += f", end={self.end}"
|
||||
return txt + ")"
|
||||
|
||||
|
||||
class CNC(CN):
|
||||
"""
|
||||
ConceptNode for Compiled tester class
|
||||
It matches with ConceptNode
|
||||
But focuses on the 'compiled' property of the concept
|
||||
|
||||
CNC == ConceptNode if CNC.compiled == ConceptNode.concept.compiled
|
||||
"""
|
||||
|
||||
def __init__(self, concept_key, start=None, end=None, source=None, **kwargs):
|
||||
super().__init__(concept_key, start, end, source)
|
||||
self.compiled = kwargs
|
||||
|
||||
def __eq__(self, other):
|
||||
if id(self) == id(other):
|
||||
return True
|
||||
|
||||
if isinstance(other, ConceptNode):
|
||||
if other.concept is None:
|
||||
return False
|
||||
if other.concept.key != self.concept_key:
|
||||
return False
|
||||
if self.start is not None and self.start != other.start:
|
||||
return False
|
||||
if self.end is not None and self.end != other.end:
|
||||
return False
|
||||
return self.compiled == other.concept.compiled # assert instead of return to help debugging tests
|
||||
|
||||
if not isinstance(other, CNC):
|
||||
return False
|
||||
|
||||
return self.concept_key == other.concept_key and \
|
||||
self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source and \
|
||||
self.compiled == other.compiled
|
||||
|
||||
def __repr__(self):
|
||||
if self.concept:
|
||||
txt = f"CNC(concept='{self.concept}'"
|
||||
else:
|
||||
txt = f"CNC(concept_key='{self.concept_key}'"
|
||||
txt += f", source='{self.source}'"
|
||||
if self.start is not None:
|
||||
txt += f", start={self.start}"
|
||||
if self.end is not None:
|
||||
txt += f", end={self.end}"
|
||||
|
||||
for k, v in self.compiled.items():
|
||||
txt += f", {k}='{v}'"
|
||||
return txt + ")"
|
||||
|
||||
|
||||
class BaseNodeParser(BaseParser):
|
||||
def __init__(self, name, priority, **kwargs):
|
||||
super().__init__(name, priority)
|
||||
if 'sheerka' in kwargs:
|
||||
sheerka = kwargs.get("sheerka")
|
||||
self.init_from_sheerka(sheerka)
|
||||
|
||||
else:
|
||||
self.concepts_by_first_keyword = None
|
||||
self.sya_definitions = None
|
||||
|
||||
self.token = None
|
||||
self.pos = -1
|
||||
self.tokens = None
|
||||
|
||||
self.context: ExecutionContext = None
|
||||
self.text = None
|
||||
self.sheerka = None
|
||||
|
||||
def init_from_sheerka(self, sheerka):
|
||||
"""
|
||||
Use the definitons from Sheerka to initialize
|
||||
:param sheerka:
|
||||
:return:
|
||||
"""
|
||||
self.concepts_by_first_keyword = sheerka.concepts_by_first_keyword
|
||||
if sheerka.sya_definitions:
|
||||
self.sya_definitions = {}
|
||||
for k, v in sheerka.sya_definitions.items():
|
||||
self.sya_definitions[k] = (v[0], SyaAssociativity(v[1]))
|
||||
|
||||
def reset_parser(self, context, text):
|
||||
self.context = context
|
||||
self.sheerka = context.sheerka
|
||||
self.text = text
|
||||
|
||||
try:
|
||||
self.tokens = list(self.get_input_as_tokens(text))
|
||||
except LexerError as e:
|
||||
self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
|
||||
return False
|
||||
|
||||
self.token = None
|
||||
self.pos = -1
|
||||
return True
|
||||
|
||||
def add_error(self, error, next_token=True):
|
||||
self.error_sink.append(error)
|
||||
if next_token:
|
||||
self.next_token()
|
||||
return error
|
||||
|
||||
def get_token(self) -> Token:
|
||||
return self.token
|
||||
|
||||
def next_token(self, skip_whitespace=True):
|
||||
if self.token and self.token.type == TokenKind.EOF:
|
||||
return False
|
||||
|
||||
self.pos += 1
|
||||
self.token = self.tokens[self.pos]
|
||||
|
||||
if skip_whitespace:
|
||||
while self.token.type == TokenKind.WHITESPACE or self.token.type == TokenKind.NEWLINE:
|
||||
self.pos += 1
|
||||
self.token = self.tokens[self.pos]
|
||||
|
||||
return self.token.type != TokenKind.EOF
|
||||
|
||||
def initialize(self, context, concepts, sya_definitions=None, use_sheerka=False):
|
||||
"""
|
||||
To quickly find a concept, we store them in an hash where the key is the first token of the concept
|
||||
example :
|
||||
Concept("foo a").def_prop("a"), "foo" is a token, "a" is a variable
|
||||
So the key to use will be "foo"
|
||||
|
||||
Concept("a foo").def_prop("a") -> first token is "foo"
|
||||
|
||||
Concept("Hello my dear a").def_prop("a") -> first token is "Hello"
|
||||
Note that under the same key, there will be multiple entry
|
||||
a B-Tree may be a better implementation in the future
|
||||
|
||||
We also store sya_definition which a is tuple (concept_precedence:int, concept_associativity:SyaAssociativity)
|
||||
:param context:
|
||||
:param concepts: list[Concept]
|
||||
:param sya_definitions: hash[concept_id, tuple(precedence:int, associativity:SyaAssociativity)]
|
||||
:param use_sheerka: first init with the definitions from Sheerka
|
||||
:return:
|
||||
"""
|
||||
self.context = context
|
||||
self.sheerka = context.sheerka
|
||||
|
||||
if use_sheerka:
|
||||
self.init_from_sheerka(self.sheerka)
|
||||
|
||||
if sya_definitions:
|
||||
if self.sya_definitions:
|
||||
self.sya_definitions.update(sya_definitions)
|
||||
else:
|
||||
self.sya_definitions = sya_definitions
|
||||
|
||||
if self.concepts_by_first_keyword is None:
|
||||
self.concepts_by_first_keyword = {}
|
||||
|
||||
for concept in concepts:
|
||||
keywords = concept.key.split()
|
||||
for keyword in keywords:
|
||||
if keyword.startswith(VARIABLE_PREFIX):
|
||||
continue
|
||||
|
||||
self.concepts_by_first_keyword.setdefault(keyword, []).append(concept.id)
|
||||
break
|
||||
|
||||
return self.sheerka.ret(self.name, True, self.concepts_by_first_keyword)
|
||||
|
||||
def get_concepts(self, token, to_keep, to_map=None):
|
||||
"""
|
||||
Tries to find if there are concepts that match the value of the token
|
||||
:param token:
|
||||
:param to_keep: predicate to tell if the concept is eligible
|
||||
:param to_map:
|
||||
:return:
|
||||
"""
|
||||
|
||||
if token.type == TokenKind.STRING:
|
||||
name = token.value[1:-1]
|
||||
elif token.type == TokenKind.KEYWORD:
|
||||
name = token.value.value
|
||||
else:
|
||||
name = token.value
|
||||
|
||||
result = []
|
||||
if name in self.concepts_by_first_keyword:
|
||||
for concept_id in self.concepts_by_first_keyword[name]:
|
||||
|
||||
concept = self.sheerka.get_by_id(concept_id)
|
||||
|
||||
if not to_keep(concept):
|
||||
continue
|
||||
|
||||
concept = to_map(concept) if to_map else concept
|
||||
result.append(concept)
|
||||
return result
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_token_value(token):
|
||||
if token.type == TokenKind.STRING:
|
||||
return token.value[1:-1]
|
||||
elif token.type == TokenKind.KEYWORD:
|
||||
return token.value.value
|
||||
else:
|
||||
return token.value
|
||||
|
||||
|
||||
class UTN(HelperWithPos):
|
||||
"""
|
||||
Tester class for UnrecognizedTokenNode
|
||||
compare the source, and start, end if defined
|
||||
"""
|
||||
|
||||
def __init__(self, source, start=None, end=None):
|
||||
"""
|
||||
:param concept: Concept or concept_key (only the key is used anyway)
|
||||
:param start:
|
||||
:param end:
|
||||
:param source:
|
||||
"""
|
||||
super().__init__(start, end)
|
||||
self.source = source
|
||||
|
||||
def __eq__(self, other):
|
||||
if id(self) == id(other):
|
||||
return True
|
||||
|
||||
if isinstance(other, UnrecognizedTokensNode):
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
if not isinstance(other, UTN):
|
||||
return False
|
||||
|
||||
return self.start == other.start and \
|
||||
self.end == other.end and \
|
||||
self.source == other.source
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.source, self.start, self.end))
|
||||
|
||||
def __repr__(self):
|
||||
txt = f"UTN( source='{self.source}'"
|
||||
if self.start is not None:
|
||||
txt += f", start={self.start}"
|
||||
if self.end is not None:
|
||||
txt += f", end={self.end}"
|
||||
return txt + ")"
|
||||
Reference in New Issue
Block a user