# File: Sheerka-Old/src/parsers/BnfNodeParser.py (Python, 912 lines, 31 KiB)

#####################################################################################################
# This implementation of the parser is highly inspired by the arpeggio project (https://github.com/textX/Arpeggio)
# I don't directly use the project, but it helped me figure out
# what to do.
# Dejanović I., Milosavljević G., Vaderna R.:
# Arpeggio: A flexible PEG parser for Python,
# Knowledge-Based Systems, 2016, 95, 71 - 74, doi:10.1016/j.knosys.2015.12.004
#####################################################################################################
from collections import namedtuple
from dataclasses import dataclass
from collections import defaultdict
from core.builtin_concepts import BuiltinConcepts, ParserResultConcept
from core.concept import Concept, ConceptParts, DoNotResolve
from core.tokenizer import TokenKind, Tokenizer, Token
from parsers.BaseNodeParser import LexerNode, GrammarErrorNode, ConceptNode, UnrecognizedTokensNode
from parsers.BaseParser import BaseParser, ErrorNode
import core.utils
class NonTerminalNode(LexerNode):
    """
    Node returned by the BnfNodeParser for a composite parsing expression.

    It keeps the parsing expression that produced it, the ``start`` and ``end``
    positions, the tokens handed over by the caller and the child nodes.
    Elsewhere in this module ``end == -1`` is used to mean "nothing matched".
    """

    def __init__(self, parsing_expression, start, end, tokens, children=None):
        """
        :param parsing_expression: expression that produced this node
        :param start: position of the first token
        :param end: position of the last token (-1 when nothing was matched)
        :param tokens: tokens forwarded to LexerNode
        :param children: nodes produced by the sub expressions (may be None)
        """
        super().__init__(start, end, tokens)
        self.parsing_expression = parsing_expression
        self.children = children

    def __repr__(self):
        name = self.parsing_expression.rule_name or self.parsing_expression.__class__.__name__
        # children defaults to None: test truthiness instead of calling len()
        if self.children:
            sub_names = "(" + ",".join(repr(child) for child in self.children) + ")"
        else:
            sub_names = ""
        return name + sub_names

    def __eq__(self, other):
        if not isinstance(other, NonTerminalNode):
            return False
        return self.parsing_expression == other.parsing_expression and \
            self.start == other.start and \
            self.end == other.end and \
            self.children == other.children

    def __hash__(self):
        # A list is not hashable: freeze the children before hashing.
        # Equal children lists give equal tuples, so this stays consistent with __eq__.
        children = None if self.children is None else tuple(self.children)
        return hash((self.parsing_expression, self.start, self.end, children))
class TerminalNode(LexerNode):
    """
    Leaf node returned by the BnfNodeParser.

    Holds the parsing expression that matched, the position of the match
    and the matched value (also forwarded to LexerNode as ``source``).
    """

    def __init__(self, parsing_expression, start, end, value):
        super().__init__(start, end, source=value)
        self.parsing_expression = parsing_expression
        self.value = value

    def __repr__(self):
        # the rule name (when there is one) prefixes the quoted value
        prefix = self.parsing_expression.rule_name or ""
        return prefix + f"'{self.value}'"

    def __eq__(self, other):
        if isinstance(other, TerminalNode):
            return (self.parsing_expression == other.parsing_expression
                    and self.start == other.start
                    and self.end == other.end
                    and self.value == other.value)
        return False

    def __hash__(self):
        identity = (self.parsing_expression, self.start, self.end, self.value)
        return hash(identity)
@dataclass
class UnknownConceptNode(ErrorNode):
    # key of the concept this error refers to
    concept_key: str
@dataclass
class TooManyConceptNode(ErrorNode):
    # key of the concept this error refers to
    concept_key: str
class ParsingExpression:
    """
    Base class of the grammar elements.

    Positional arguments are kept untouched in ``elements``.
    Keyword arguments read here:
      * ``nodes``: stored in ``nodes``; a value without ``__iter__`` is wrapped in a list
      * ``rule_name``: stored in ``rule_name`` (empty string when missing)

    Subclasses are expected to provide ``_parse(parser)``.
    """

    def __init__(self, *args, **kwargs):
        self.elements = args

        given_nodes = kwargs.get('nodes', [])
        self.nodes = given_nodes if hasattr(given_nodes, '__iter__') else [given_nodes]

        self.rule_name = kwargs.get('rule_name', '')

    def __eq__(self, other):
        if isinstance(other, ParsingExpression):
            return self.rule_name == other.rule_name and self.elements == other.elements
        return False

    def __hash__(self):
        return hash((self.rule_name, self.elements))

    def parse(self, parser):
        """
        Delegates to the ``_parse`` implementation of the subclass
        """
        return self._parse(parser)

    def add_rule_name_if_needed(self, text):
        """
        Returns ``text`` followed by ``=<rule_name>`` when a rule name is set,
        ``text`` alone otherwise
        """
        if self.rule_name:
            return text + "=" + self.rule_name
        return text
class ConceptExpression(ParsingExpression):
    """
    Will match a concept.

    ``concept`` is either a Concept or only the name of a concept.
    When it is a name, it is resolved (and memoized) the first time
    ``_parse`` is called.
    """

    def __init__(self, concept, rule_name=""):
        super().__init__(rule_name=rule_name)
        self.concept = concept

    def __repr__(self):
        return self.add_rule_name_if_needed(f"{self.concept}")

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        if not isinstance(other, ConceptExpression):
            return False
        if isinstance(self.concept, Concept):
            # The other side may still hold only the name of the concept (a str has no
            # '.name'): report a difference instead of raising AttributeError.
            if not isinstance(other.concept, Concept):
                return False
            return self.concept.name == other.concept.name
        # when it's only the name of the concept
        return self.concept == other.concept

    def __hash__(self):
        return hash((self.concept, self.rule_name))

    @staticmethod
    def get_parsing_expression_from_name(name):
        """
        Builds the expression that recognizes a concept from its name:
        one StrMatch per token of the name, wrapped in a Sequence when there are several.

        :param name: name of the concept
        :return: StrMatch or Sequence
        """
        tokens = Tokenizer(name)
        # the last token produced by the tokenizer is dropped (presumably the EOF marker - to confirm)
        nodes = [StrMatch(core.utils.strip_quotes(token.value)) for token in list(tokens)[:-1]]
        if len(nodes) == 1:
            return nodes[0]
        # NOTE(review): the list is given as ONE positional element, so 'elements' holds the list itself.
        sequence = Sequence(nodes)
        sequence.nodes = nodes
        return sequence

    def _parse(self, parser):
        """
        Tries to recognize the concept at the current position of the parser.

        :param parser: parser that drives the recognition
        :return: NonTerminalNode wrapping the match, None when the concept is unknown or not matched
        """
        to_match = parser.get_concept(self.concept) if isinstance(self.concept, str) else self.concept
        if parser.sheerka.isinstance(to_match, BuiltinConcepts.UNKNOWN_CONCEPT):
            return None
        self.concept = to_match  # Memoize
        if to_match not in parser.concepts_grammars:
            # Try to match the concept using its name
            expr = self.get_parsing_expression_from_name(to_match.name)
            node = expr.parse(parser)
        else:
            node = parser.concepts_grammars[to_match].parse(parser)
        if node is None:
            return None
        return NonTerminalNode(self, node.start, node.end, parser.tokens[node.start: node.end + 1], [node])
class ConceptGroupExpression(ConceptExpression):
    """
    ConceptExpression whose concept is a group (a set of concepts).

    When the group has no grammar of its own, the members of the set are
    tried in order, each one through a ConceptExpression.
    """

    def _parse(self, parser):
        target = self.concept
        if isinstance(target, str):
            target = parser.get_concept(target)
        if parser.sheerka.isinstance(target, BuiltinConcepts.UNKNOWN_CONCEPT):
            return None

        self.concept = target  # Memoize

        if target in parser.concepts_grammars:
            expr = parser.concepts_grammars[target]
        else:
            members = parser.sheerka.get_set_elements(parser.context, self.concept)
            alternatives = [ConceptExpression(c, rule_name=c.name) for c in members]
            expr = OrderedChoice(alternatives)
            expr.nodes = alternatives

        node = expr.parse(parser)
        if node is None:
            return None

        matched_tokens = parser.tokens[node.start: node.end + 1]
        return NonTerminalNode(self, node.start, node.end, matched_tokens, [node])
class Sequence(ParsingExpression):
    """
    Matches all the sub expressions, one after the other, in the order of definition.
    Fails (returns None) as soon as one of them fails.
    """

    def _parse(self, parser):
        first_pos = parser.pos
        last_pos = parser.pos
        matched = []
        for expression in self.nodes:
            node = expression.parse(parser)
            if node is None:
                return None
            if node.end == -1:
                # the sub expression succeeded without matching anything: nothing to keep
                continue
            matched.append(node)
            last_pos = node.end
        return NonTerminalNode(self, first_pos, last_pos, parser.tokens[first_pos: last_pos + 1], matched)

    def __repr__(self):
        inner = ", ".join(repr(element) for element in self.elements)
        return self.add_rule_name_if_needed(f"({inner})")
class OrderedChoice(ParsingExpression):
    """
    Matches one alternative among several.
    The alternatives are tried in the order of definition and the first
    one that matches wins.
    """

    def _parse(self, parser):
        start_pos = parser.pos
        for alternative in self.nodes:
            node = alternative.parse(parser)
            if not node:
                parser.seek(start_pos)  # backtrack before trying the next alternative
                continue
            return NonTerminalNode(self, start_pos, node.end, parser.tokens[start_pos: node.end + 1], [node])
        return None

    def __repr__(self):
        inner = "| ".join(repr(element) for element in self.elements)
        return self.add_rule_name_if_needed(f"({inner})")
class Optional(ParsingExpression):
    """
    Matches the elements, or nothing.
    Never fails: when nothing matches, the returned node has end == -1.
    When several elements match, the one that ends the furthest is kept.
    If you need order, use Optional(OrderedChoice)
    """

    def _parse(self, parser):
        start_pos = parser.pos
        best = NonTerminalNode(self, parser.pos, -1, [], [])  # end == -1 : nothing found (yet)
        for candidate in self.nodes:
            node = candidate.parse(parser)
            if node and node.end > best.end:
                best = NonTerminalNode(
                    self,
                    node.start,
                    node.end,
                    parser.tokens[node.start: node.end + 1],
                    [node])
            parser.seek(start_pos)  # every candidate starts from the same position

        if best.end != -1:
            # move right after the tokens of the selected match
            parser.seek(best.end)
            parser.next_token()
        return best

    def __repr__(self):
        if len(self.elements) == 1:
            return f"{self.elements[0]}?"
        inner = ", ".join(repr(element) for element in self.elements)
        return self.add_rule_name_if_needed(f"({inner})?")
class Repetition(ParsingExpression):
    """
    Base class for the repetition-like parser expressions.

    Keyword argument read here:
      * ``sep``: kept in ``self.sep`` (None when missing). ZeroOrMore and
        OneOrMore parse it between two repetitions.
    """

    def __init__(self, *elements, **kwargs):
        super().__init__(*elements, **kwargs)
        self.sep = kwargs.get('sep')
class ZeroOrMore(Repetition):
    """
    ZeroOrMore will try to match parser expression specified zero or more
    times. It will never fail: when nothing is matched, the returned node
    has end == -1 and no children.
    """

    def _parse(self, parser):
        """
        :param parser: parser that drives the recognition
        :return: NonTerminalNode holding one child per repetition
        """
        init_pos = parser.pos
        end_pos = -1
        children = []
        while True:
            current_pos = parser.pos
            # maybe eat the separator if needed
            if self.sep and children:
                sep_result = self.sep.parse(parser)
                if sep_result is None:
                    parser.seek(current_pos)
                    break
            # eat the repeated expression
            node = self.nodes[0].parse(parser)
            if node is None or node.end == -1:
                # None : no match.
                # end == -1 : the expression succeeded without matching anything (e.g. an Optional).
                # Looping again would give the very same result forever, so stop here.
                # Seeking back also gives back a separator that is not followed by an element.
                parser.seek(current_pos)
                break
            children.append(node)
            end_pos = node.end
        if not children:
            return NonTerminalNode(self, init_pos, -1, [], [])
        return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children)

    def __repr__(self):
        to_str = ", ".join(repr(n) for n in self.elements)
        return self.add_rule_name_if_needed(f"({to_str})*")
class OneOrMore(Repetition):
    """
    OneOrMore will try to match parser expression specified one or more times.
    Fails (returns None) when there is not at least one match.
    """

    def _parse(self, parser):
        """
        :param parser: parser that drives the recognition
        :return: NonTerminalNode holding one child per repetition, None when nothing is matched
        """
        init_pos = parser.pos
        end_pos = -1
        children = []
        while True:
            current_pos = parser.pos
            # maybe eat the separator if needed
            if self.sep and children:
                sep_result = self.sep.parse(parser)
                if sep_result is None:
                    parser.seek(current_pos)
                    break
            # eat the repeated expression
            node = self.nodes[0].parse(parser)
            if node is None or node.end == -1:
                # None : no match.
                # end == -1 : the expression succeeded without matching anything (e.g. an Optional).
                # Looping again would give the very same result forever, so stop here.
                # Seeking back also gives back a separator that is not followed by an element.
                parser.seek(current_pos)
                break
            children.append(node)
            end_pos = node.end
        if not children:  # if nothing is found, it's an error
            return None
        return NonTerminalNode(self, init_pos, end_pos, parser.tokens[init_pos: end_pos + 1], children)

    def __repr__(self):
        to_str = ", ".join(repr(n) for n in self.elements)
        return self.add_rule_name_if_needed(f"({to_str})+")
class UnorderedGroup(Repetition):
    """
    Meant to match all of the parsing expressions, in any order.
    Not implemented yet: parsing always raises NotImplementedError.
    """

    def _parse(self, parser):
        raise NotImplementedError

    # def __repr__(self):
    #     to_str = ", ".join(repr(n) for n in self.elements)
    #     return f"({to_str})#"
class Match(ParsingExpression):
    """
    Base class of the expressions that try to match something from the input.
    """

    def __init__(self, rule_name, root=False):
        super().__init__(rule_name=rule_name, root=root)

    def parse(self, parser):
        return self._parse(parser)
class StrMatch(Match):
    """
    Matches a literal: the value of the current token is compared with ``to_match``.
    """

    def __init__(self, to_match, rule_name="", ignore_case=True):
        """
        :param to_match: literal to recognize
        :param rule_name: name of the rule (optional)
        :param ignore_case: when truthy, the comparison is done on the lower case values
        """
        # go through Match.__init__ rather than skipping it with super(Match, self)
        super().__init__(rule_name)
        self.to_match = to_match
        self.ignore_case = ignore_case

    def __repr__(self):
        return self.add_rule_name_if_needed(f"'{self.to_match}'")

    def __eq__(self, other):
        if not super().__eq__(other):
            return False
        if not isinstance(other, StrMatch):
            return False
        return self.to_match == other.to_match and self.ignore_case == other.ignore_case

    def __hash__(self):
        # Defining __eq__ without __hash__ makes Python set __hash__ to None,
        # i.e. the instances (and everything that hashes them) become unhashable.
        # Uses the attributes compared by __eq__ ('elements' is always empty here).
        return hash((self.rule_name, self.to_match, self.ignore_case))

    def _parse(self, parser):
        """
        :param parser: parser that drives the recognition
        :return: TerminalNode when the current token matches (the token is consumed), None otherwise
        """
        token = parser.get_token()
        m = str(token.value).lower() == self.to_match.lower() if self.ignore_case \
            else token.value == self.to_match
        if m:
            node = TerminalNode(self, parser.pos, parser.pos, token.value)
            parser.next_token()
            return node
        return None
class BnfNodeParser(BaseParser):
    """
    Parser that recognizes concepts from their grammar (``concepts_grammars``:
    dictionary of concept, ParsingExpression).

    The parser keeps a cursor on the tokens of the input:
    ``pos`` is the index of the current token and ``token`` the current token.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: 'grammars' (the grammars to use) or 'sheerka'
                       (the grammars are then taken from sheerka.concepts_grammars).
                       When none is given, the parser starts with no grammar
        """
        super().__init__("BnfNode", 50)
        if 'grammars' in kwargs:
            self.concepts_grammars = kwargs.get("grammars")
        elif 'sheerka' in kwargs:
            self.concepts_grammars = kwargs.get("sheerka").concepts_grammars
        else:
            self.concepts_grammars = {}
        self.ignore_case = True
        self.token = None
        self.pos = -1
        self.tokens = None
        self.context = None
        self.text = None
        self.sheerka = None

    def add_error(self, error, next_token=True):
        """
        Records an error in the error sink
        :param error: error to record
        :param next_token: when True, also moves to the next token
        :return: the error
        """
        self.error_sink.append(error)
        if next_token:
            self.next_token()
        return error

    def reset_parser(self, context, text):
        """
        Tokenizes the input and puts the cursor on the first token
        :param context: execution context
        :param text: input to parse
        :return: False when the input cannot be tokenized (the error is recorded), True otherwise
        """
        self.context = context
        self.sheerka = context.sheerka
        self.text = text
        try:
            self.tokens = list(self.get_input_as_tokens(text))
        except core.tokenizer.LexerError as e:
            self.add_error(self.sheerka.new(BuiltinConcepts.ERROR, body=e), False)
            return False
        self.token = None
        self.pos = -1
        self.next_token(False)
        return True

    def get_token(self) -> Token:
        """
        :return: the current token
        """
        return self.token

    def next_token(self, skip_whitespace=True):
        """
        Moves to the next token
        :param skip_whitespace: when True, the WHITESPACE and NEWLINE tokens are skipped
        :return: False when the (new) current token is EOF, True otherwise
        """
        if self.token and self.token.type == TokenKind.EOF:
            return False
        self.pos += 1
        self.token = self.tokens[self.pos]
        if skip_whitespace:
            # relies on the tokens ending with EOF to stop the loop
            while self.token.type == TokenKind.WHITESPACE or self.token.type == TokenKind.NEWLINE:
                self.pos += 1
                self.token = self.tokens[self.pos]
        return self.token.type != TokenKind.EOF

    def seek(self, pos):
        """
        Puts the cursor on the token at the given position
        :param pos: index of the token
        :return: always True
        """
        self.pos = pos
        self.token = self.tokens[self.pos]
        return True

    def rewind(self, offset, skip_whitespace=True):
        """
        Moves the cursor by 'offset' (presumably negative, to go backward - to confirm)
        :param offset: number of positions added to the current position
        :param skip_whitespace: when True, goes further back while the token is WHITESPACE or NEWLINE
        :return:
        """
        self.pos += offset
        self.token = self.tokens[self.pos]
        if skip_whitespace:
            while self.pos > 0 and (self.token.type == TokenKind.WHITESPACE or self.token.type == TokenKind.NEWLINE):
                self.pos -= 1
                self.token = self.tokens[self.pos]

    def initialize(self, context, concepts_definitions):
        """
        Adds a bunch of concepts, and how they can be recognized
        :param context: execution context
        :param concepts_definitions: dictionary of concept, concept_definition
        :return: ReturnValue; on success its value is the grammars, on failure the error sink
        """
        self.context = context
        self.sheerka = context.sheerka
        concepts_to_resolve = set()
        for concept, concept_def in concepts_definitions.items():
            # ## Gets the grammars
            context.log(f"Resolving grammar for '{concept}'", context.who)
            concept.init_key()  # make sure that the key is initialized
            grammar = self.get_model(concept_def, concepts_to_resolve)
            self.concepts_grammars[concept] = grammar
            if self.has_error:
                return self.sheerka.ret(self.name, False, self.error_sink)
        # ## Removes concepts with infinite recursions
        concepts_to_remove = self.detect_infinite_recursion(concepts_to_resolve)
        for concept in concepts_to_remove:
            # detect_infinite_recursion returns resolved concepts, while the set may hold
            # the unresolved name (or both forms of the same concept):
            # a strict remove() / del would raise KeyError.
            concepts_to_resolve.discard(concept)
            self.concepts_grammars.pop(concept, None)
        if self.has_error:
            return self.sheerka.ret(self.name, False, self.error_sink)
        else:
            return self.sheerka.ret(self.name, True, self.concepts_grammars)

    def get_concept(self, concept_name):
        """
        Looks for the concept in the context first, then in sheerka
        :param concept_name: name of the concept
        :return: the concept, or whatever sheerka.get returns when it's not in the context
        """
        if concept_name in self.context.concepts:
            return self.context.concepts[concept_name]
        return self.sheerka.get(concept_name)

    def get_model(self, concept_def, concepts_to_resolve):
        """
        Transforms a concept definition into a tree of ParsingExpression that can be parsed
        :param concept_def: definition (Concept, str or ParsingExpression)
        :param concepts_to_resolve: set updated with the concepts (or names of concepts) referenced
        :return: the model; a GrammarErrorNode (also recorded as an error) when an element is not recognized
        """

        # TODO
        # inner_get_model must not modify the initial ParsingExpression
        # A copy must be created
        def inner_get_model(expression):
            if isinstance(expression, Concept):
                if self.sheerka.isaset(self.context, expression):
                    ret = ConceptGroupExpression(expression, rule_name=expression.name)
                else:
                    ret = ConceptExpression(expression, rule_name=expression.name)
                concepts_to_resolve.add(expression)
            elif isinstance(expression, ConceptExpression):  # it includes ConceptGroupExpression
                if expression.rule_name is None or expression.rule_name == "":
                    expression.rule_name = expression.concept.name if isinstance(expression.concept, Concept) \
                        else expression.concept
                if isinstance(expression.concept, str):
                    concept = self.get_concept(expression.concept)
                    if self.sheerka.is_known(concept):
                        expression.concept = concept
                concepts_to_resolve.add(expression.concept)
                ret = expression
            elif isinstance(expression, str):
                ret = StrMatch(expression, ignore_case=self.ignore_case)
            elif isinstance(expression, StrMatch):
                ret = expression
                if ret.ignore_case is None:
                    ret.ignore_case = self.ignore_case
            elif isinstance(expression, Sequence) or \
                    isinstance(expression, OrderedChoice) or \
                    isinstance(expression, ZeroOrMore) or \
                    isinstance(expression, OneOrMore) or \
                    isinstance(expression, Optional):
                ret = expression
                ret.nodes = [inner_get_model(e) for e in ret.elements]
            else:
                ret = self.add_error(GrammarErrorNode(f"Unrecognized grammar element '{expression}'."), False)
            # Translate separator expression.
            if isinstance(expression, Repetition) and expression.sep:
                expression.sep = inner_get_model(expression.sep)
            return ret

        model = inner_get_model(concept_def)
        return model

    def detect_infinite_recursion(self, concepts_to_resolve):
        """
        Looks for the concepts whose grammar references the concept itself
        :param concepts_to_resolve: concepts (or names of concepts) to check
        :return: list of the concepts that are in infinite recursion
        """

        # infinite recursion matcher
        def _is_infinite_recursion(ref_concept, node):
            if isinstance(node, ConceptExpression):
                if node.concept == ref_concept:
                    return True
                if isinstance(node.concept, str):
                    to_match = self.get_concept(node.concept)
                    if self.sheerka.isinstance(to_match, BuiltinConcepts.UNKNOWN_CONCEPT):
                        return False
                else:
                    to_match = node.concept
                if to_match not in self.concepts_grammars:
                    return False
                return _is_infinite_recursion(ref_concept, self.concepts_grammars[to_match])
            if isinstance(node, OrderedChoice):
                # only the first alternative is checked
                return _is_infinite_recursion(ref_concept, node.nodes[0])
            if isinstance(node, Sequence):
                for child in node.nodes:
                    if _is_infinite_recursion(ref_concept, child):
                        return True
                return False
            return False

        removed_concepts = []
        for e in concepts_to_resolve:
            if isinstance(e, str):
                e = self.get_concept(e)
                if self.sheerka.isinstance(e, BuiltinConcepts.UNKNOWN_CONCEPT):
                    continue
            if e not in self.concepts_grammars:
                continue
            to_resolve = self.concepts_grammars[e]
            if _is_infinite_recursion(e, to_resolve):
                removed_concepts.append(e)
        return removed_concepts

    def parse(self, context, parser_input):
        """
        Recognizes the concepts of the input, using the grammars
        :param context: execution context
        :param parser_input: text to parse
        :return: one ReturnValue when there is only one possibility, a list of ReturnValue otherwise.
                 The status is False when some tokens are not recognized
        """
        if parser_input == "":
            return context.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.IS_EMPTY)
            )
        if not self.reset_parser(context, parser_input):
            return self.sheerka.ret(
                self.name,
                False,
                context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink))
        concepts_found = [[]]
        unrecognized_tokens = None
        has_unrecognized = False
        # actually list of list
        # The first dimension is the number of possibilities found
        # The second dimension is the number of concepts found, under one possibility
        #
        # Example 1
        # concept foo : 'one' 'two'
        # concept bar : 'one' 'two'
        # input 'one two' -> will produce two possibilities (foo and bar).
        #
        # Example 2
        # concept foo : 'one'
        # concept bar : 'two'
        # input 'one two' -> will produce one possibility which is (foo, bar) (foo then bar)
        while True:
            init_pos = self.pos
            res = []
            # every grammar is tried from the same position
            for concept, grammar in self.concepts_grammars.items():
                self.seek(init_pos)
                node = grammar.parse(self)  # a node is TerminalNode or NonTerminalNode
                if node is not None and node.end != -1:
                    updated_concept = self.finalize_concept(context.sheerka, concept, node)
                    concept_node = ConceptNode(
                        updated_concept,
                        node.start,
                        node.end,
                        self.tokens[node.start: node.end + 1],
                        None,
                        node)
                    res.append(concept_node)
            if len(res) == 0:  # not recognized
                self.seek(init_pos)
                if unrecognized_tokens:
                    unrecognized_tokens.add_token(self.get_token(), init_pos)
                else:
                    unrecognized_tokens = UnrecognizedTokensNode(init_pos, init_pos, [self.get_token()])
                if not self.next_token(False):
                    break
            else:  # some concepts are recognized
                if unrecognized_tokens and unrecognized_tokens.not_whitespace():
                    unrecognized_tokens.fix_source()
                    concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
                    has_unrecognized = True
                unrecognized_tokens = None
                res = self.get_bests(res)  # only keep the concepts that eat the more tokens
                concepts_found = core.utils.product(concepts_found, res)
                # loop
                self.seek(res[0].end)
                if not self.next_token(False):
                    break
        # Fix the source for unrecognized tokens
        if unrecognized_tokens and unrecognized_tokens.not_whitespace():
            unrecognized_tokens.fix_source()
            concepts_found = core.utils.product(concepts_found, [unrecognized_tokens])
            has_unrecognized = True
        # else
        # returns as many ReturnValue than choices found
        ret = []
        for choice in concepts_found:
            ret.append(
                self.sheerka.ret(
                    self.name,
                    not has_unrecognized,
                    self.sheerka.new(
                        BuiltinConcepts.PARSER_RESULT,
                        parser=self,
                        source=parser_input,
                        body=choice,
                        try_parsed=choice)))
        if len(ret) == 1:
            self.log_result(context, parser_input, ret[0])
            return ret[0]
        else:
            self.log_multiple_results(context, parser_input, ret)
            return ret

    def finalize_concept(self, sheerka, template, underlying, init_empty_body=True):
        """
        Updates the properties of the concept
        Goes in recursion if the property is a concept
        :param sheerka: used to instantiate the concept
        :param template: concept that was recognized (a new instance is created from its key / id)
        :param underlying: node (TerminalNode or NonTerminalNode) that allowed the recognition
        :param init_empty_body: when True, the compiled body is set when the concept has no body
        :return: the new concept
        """
        # this cache is to make sure that we return the same concept for the same ConceptExpression
        _underlying_value_cache = {}

        def _add_prop(_concept, prop_name, value):
            """
            Adds a new entry,
            makes a list if the property already exists
            """
            if prop_name not in _concept.compiled or _concept.compiled[prop_name] is None:
                # new entry
                _concept.compiled[prop_name] = value
            else:
                # make a list if there was a value
                previous_value = _concept.compiled[prop_name]
                if isinstance(previous_value, list):
                    previous_value.append(value)
                else:
                    new_value = [previous_value, value]
                    _concept.compiled[prop_name] = new_value

        def _look_for_concept_match(_underlying):
            """
            At some point, there is either an StrMatch or a ConceptMatch,
            that allowed the recognition.
            Look for the ConceptMatch, with recursion if needed
            """
            if isinstance(_underlying.parsing_expression, ConceptExpression):
                return _underlying
            if not isinstance(_underlying, NonTerminalNode):
                return None
            if len(_underlying.children) != 1:
                return None
            return _look_for_concept_match(_underlying.children[0])

        def _get_underlying_value(_underlying):
            concept_match_node = _look_for_concept_match(_underlying)
            if concept_match_node:
                # the value is a concept
                if id(concept_match_node) in _underlying_value_cache:
                    result = _underlying_value_cache[id(concept_match_node)]
                else:
                    ref_tpl = concept_match_node.parsing_expression.concept
                    result = self.finalize_concept(sheerka, ref_tpl, concept_match_node.children[0], init_empty_body)
                    _underlying_value_cache[id(concept_match_node)] = result
            else:
                # the value is a string
                result = DoNotResolve(_underlying.source)
            return result

        def _process_rule_name(_concept, _underlying):
            if _underlying.parsing_expression.rule_name:
                value = _get_underlying_value(_underlying)
                _add_prop(_concept, _underlying.parsing_expression.rule_name, value)
                _concept.metadata.need_validation = True
            if isinstance(_underlying, NonTerminalNode):
                for child in _underlying.children:
                    _process_rule_name(_concept, child)

        key = (template.key, template.id) if template.id else template.key
        concept = sheerka.new(key)
        if init_empty_body and concept.metadata.body is None:
            value = _get_underlying_value(underlying)
            concept.compiled[ConceptParts.BODY] = value
            if underlying.parsing_expression.rule_name:
                _add_prop(concept, underlying.parsing_expression.rule_name, value)
                # KSI : Why don't we set concept.metadata.need_validation to True ?
        if isinstance(underlying, NonTerminalNode):
            for node in underlying.children:
                _process_rule_name(concept, node)
        return concept

    def encode_grammar(self, grammar):
        """
        Transform the grammar into something that can easily be serialized
        :param grammar: dictionary of concept, ParsingExpression
        :return: dictionary of encoded concept (str), encoded expression (str)
        :raise TypeError: when an expression cannot be encoded
        """

        def _encode(expression):
            if isinstance(expression, StrMatch):
                res = f"'{expression.to_match}'"
            elif isinstance(expression, ConceptExpression):
                res = core.utils.str_concept(expression.concept)
            elif isinstance(expression, Sequence):
                res = "(" + " ".join(_encode(c) for c in expression.nodes) + ")"
            elif isinstance(expression, OrderedChoice):
                res = "(" + "|".join(_encode(c) for c in expression.nodes) + ")"
            elif isinstance(expression, Optional):
                res = _encode(expression.nodes[0]) + "?"
            elif isinstance(expression, ZeroOrMore):
                res = _encode(expression.nodes[0]) + "*"
            elif isinstance(expression, OneOrMore):
                res = _encode(expression.nodes[0]) + "+"
            else:
                # without this branch 'res' is not bound and the failure is an obscure UnboundLocalError
                raise TypeError(f"Cannot encode grammar element '{expression}'.")
            if expression.rule_name:
                res += "=" + expression.rule_name
            return res

        result = {}
        for k, v in grammar.items():
            key = core.utils.str_concept(k)
            value = _encode(v)
            result[key] = value
        return result

    @staticmethod
    def get_bests(results):
        """
        Returns the results that are the longest (the ones with the greatest 'end')
        :param results: nodes to choose from
        :return: list of the results that share the greatest 'end' ([] when there is no result)
        """
        if not results:
            # max() fails on an empty sequence
            return []
        by_end_pos = defaultdict(list)
        for result in results:
            by_end_pos[result.end].append(result)
        return by_end_pos[max(by_end_pos)]
class ParsingExpressionVisitor:
    """
    Visits a ParsingExpression.

    ``visit`` dispatches on the class name of the expression: the method
    ``visit_<ClassName>`` is called when it exists, ``generic_visit`` otherwise.
    """

    def visit(self, parsing_expression):
        method_name = 'visit_' + parsing_expression.__class__.__name__
        handler = getattr(self, method_name, self.generic_visit)
        return handler(parsing_expression)

    def generic_visit(self, parsing_expression):
        """
        Calls ``visit_all`` (when the visitor defines it), then visits every element.
        Concepts and strings are wrapped (ConceptExpression / StrMatch) before the visit.
        """
        if hasattr(self, "visit_all"):
            self.visit_all(parsing_expression)

        for element in parsing_expression.elements:
            if isinstance(element, Concept):
                element = ConceptExpression(element.key or element.name)
            elif isinstance(element, str):
                element = StrMatch(element)
            self.visit(element)