From 646c428edb0e999da9901c21fd32eb1e998576f5 Mon Sep 17 00:00:00 2001 From: Kodjo Sossouvi Date: Wed, 24 Feb 2021 17:23:03 +0100 Subject: [PATCH] Fixed #30 : Add variable support in BNF concept definition Fixed #31 : Add regex support in BNF Concept Fixed #33 : Do not memorize object during restore --- _concepts_default.txt | 11 + _concepts_python.txt | 4 +- src/core/builtin_helpers.py | 60 +- src/core/concept.py | 31 +- src/core/global_symbols.py | 6 + src/core/sheerka/ExecutionContext.py | 10 + src/core/sheerka/Sheerka.py | 9 +- .../sheerka/services/SheerkaConceptManager.py | 273 +++++- .../sheerka/services/SheerkaDebugManager.py | 5 +- .../services/SheerkaEvaluateConcept.py | 4 + src/core/sheerka/services/SheerkaExecute.py | 49 +- src/core/sheerka/services/SheerkaMemory.py | 2 +- src/core/tokenizer.py | 11 +- src/evaluators/DefConceptEvaluator.py | 45 +- src/parsers/BaseNodeParser.py | 66 +- src/parsers/BnfDefinitionParser.py | 24 +- src/parsers/BnfNodeParser.py | 781 ++++++++++++++---- src/parsers/PythonParser.py | 4 + src/parsers/SyaNodeParser.py | 2 + src/sheerkapickle/SheerkaUnpickler.py | 4 +- tests/core/test_SheerkaConceptManager.py | 189 ++++- tests/core/test_sheerka.py | 10 +- tests/core/test_tokenizer.py | 5 + tests/evaluators/EvaluatorTestsUtils.py | 2 +- tests/evaluators/test_DefConceptEvaluator.py | 55 +- tests/non_reg/test_sheerka_non_reg.py | 12 + tests/parsers/parsers_utils.py | 37 +- tests/parsers/test_BnfNodeParser.py | 482 ++++++++++- tests/parsers/test_BnfParser.py | 54 +- tests/parsers/test_DefConceptParser.py | 108 ++- tests/parsers/test_parsers_utils.py | 109 +++ tests/sheerkapickle/test_SheerkaPickler.py | 3 +- 32 files changed, 2107 insertions(+), 360 deletions(-) create mode 100644 tests/parsers/test_parsers_utils.py diff --git a/_concepts_default.txt b/_concepts_default.txt index b722a13..86d0145 100644 --- a/_concepts_default.txt +++ b/_concepts_default.txt @@ -37,3 +37,14 @@ woman is a female def concept human man is a human woman is 
a human + +# days of the week +def concept monday +def concept tuesday +def concept wednesday +def concept thursday +def concept friday +def concept saturday +def concept sunday + + diff --git a/_concepts_python.txt b/_concepts_python.txt index a3ed9e7..33c8b29 100644 --- a/_concepts_python.txt +++ b/_concepts_python.txt @@ -1,4 +1,6 @@ def concept x is a string pre is_question() as isinstance(x, str) def concept x is a int pre is_question() as isinstance(x, int) def concept x is a integer pre is_question() as isinstance(x, int) -def concept x starts with y pre is_question() where x is a string as x.startswith(y) \ No newline at end of file +def concept x starts with y pre is_question() where x is a string as x.startswith(y) +def concept sha256 from bnf r'[a-f0-9]{64}' +def concept sha512 from bnf r'[a-f0-9]{128}' \ No newline at end of file diff --git a/src/core/builtin_helpers.py b/src/core/builtin_helpers.py index 7e555d4..46e0352 100644 --- a/src/core/builtin_helpers.py +++ b/src/core/builtin_helpers.py @@ -342,8 +342,9 @@ def evaluate(context, def get_lexer_nodes(return_values, start, tokens): """ - From a parser result, return the corresponding LexerNode - either ConceptNode, UnrecognizedTokensNode or SourceCodeNode + Transform all elements from return_values into lexer nodes (ConceptNode, UnrecognizedTokensNode, SourceCodeNode...) 
+ On the contrary of the other method (get_lexer_nodes_using_positions), + all created lexer node will use the same offset (start) :param return_values: :param start: :param tokens: @@ -360,13 +361,12 @@ def get_lexer_nodes(return_values, start, tokens): continue end = start + len(tokens) - 1 - lexer_nodes.append( - [SourceCodeNode(start, - end, - tokens, - ret_val.body.source, - python_node=ret_val.body.body, - return_value=ret_val)]) + lexer_nodes.append([SourceCodeNode(start, + end, + tokens, + ret_val.body.source, + python_node=ret_val.body.body, + return_value=ret_val)]) elif ret_val.who == "parsers.ExactConcept": concepts = ret_val.body.body if hasattr(ret_val.body.body, "__iter__") else [ret_val.body.body] @@ -379,6 +379,11 @@ def get_lexer_nodes(return_values, start, tokens): for node in nodes: node.start += start node.end += start + if isinstance(node, ConceptNode): + for k, v in node.concept.get_compiled().items(): + if hasattr(v, "start"): + v.start += start + v.end += start # but append the whole sequence if when it's a sequence lexer_nodes.append(nodes) @@ -397,9 +402,15 @@ def get_lexer_nodes(return_values, start, tokens): def get_lexer_nodes_using_positions(return_values, positions): """ - Transform all elements from return_values into lexer nodes - use positions to remap the exact positions + Transform all elements from return_values into lexer nodes (ConceptNode, UnrecognizedTokensNode, SourceCodeNode...) + Use positions to compute the exact new positions + On the contrary of the other method (get_lexer_nodes), + one return value is mapped with one position. 
it's not a offset, but an absolute position + :param return_values: + :param positions: is a list of triplets (start, end, tokens) + :return: """ + lexer_nodes = [] for ret_val, position in zip(return_values, positions): if ret_val.who in ("parsers.Python", 'parsers.PythonWithConcepts'): @@ -425,6 +436,11 @@ def get_lexer_nodes_using_positions(return_values, positions): for node in nodes: node.start = position.start node.end = position.end + if isinstance(node, ConceptNode): + for k, v in node.concept.get_compiled().items(): + if hasattr(v, "start"): + v.start += position.start + v.end += position.start # but append the whole sequence if when it's a sequence lexer_nodes.extend(nodes) @@ -493,9 +509,10 @@ def get_lexer_nodes_from_unrecognized(context, unrecognized_tokens_node, parsers def update_compiled(context, concept, errors, parsers=None): """ - recursively iterate over concept.get_compiled() to replace LexerNode into concepts or list of ReturnValueConcept - When parsing using a LexerNodeParser (SyaNodeParser, BnfNodeParser...) - the result will be a LexerNode. + TL;DR; + Recursively iterate over concept.get_compiled() to replace LexerNode into concepts or list of ReturnValueConcept + Long version: + When parsing using a LexerNodeParser (SyaNodeParser, BnfNodeParser...) the result will be a LexerNode. In the specific case of a ConceptNode, the compiled variables will also be LexerNode (UnrecognizedTokensNode...) 
This function iterate over the compile to transform these nodes into concept of compiled AST :param context: @@ -518,9 +535,12 @@ def update_compiled(context, concept, errors, parsers=None): _validate_concept(v) elif isinstance(v, SourceCodeWithConceptNode): - from parsers.PythonWithConceptsParser import PythonWithConceptsParser - parser_helper = PythonWithConceptsParser() - res = parser_helper.parse_nodes(context, v.get_all_nodes()) + if v.return_value: + res = v.return_value + else: + from parsers.PythonWithConceptsParser import PythonWithConceptsParser + parser_helper = PythonWithConceptsParser() + res = parser_helper.parse_nodes(context, v.get_all_nodes()) if res.status: c.get_compiled()[k] = [res] else: @@ -556,7 +576,7 @@ def update_compiled(context, concept, errors, parsers=None): # example : Concept("a plus b").def_var("a").def_var("b") # and the user has entered 'a plus b' # Chances are that we are talking about the concept itself, and not an instantiation (like '10 plus 2') - # This means that 'a' and 'b' don't have any real value + # This means that 'a' and 'b' don't have any real values if len(concept.get_metadata().variables) > 0: for name, value in concept.get_metadata().variables: if _get_source(concept.get_compiled(), name) != name: @@ -633,7 +653,7 @@ def ensure_concept_or_rule(*items): raise TypeError(f"'{items}' must be a concept or rule") -def ensure_bnf(context, concept, parser_name="BaseNodeParser"): +def ensure_bnf(context, concept, parser_name="BaseNodeParser", update_bnf_for_cached_concept=True): if concept.get_metadata().definition_type == DEFINITION_TYPE_BNF and not concept.get_bnf(): from parsers.BnfDefinitionParser import BnfDefinitionParser regex_parser = BnfDefinitionParser() @@ -651,7 +671,7 @@ def ensure_bnf(context, concept, parser_name="BaseNodeParser"): raise Exception(bnf_parsing_ret_val.value) concept.set_bnf(bnf_parsing_ret_val.body.body) - if concept.id: + if concept.id and update_bnf_for_cached_concept: 
context.sheerka.get_by_id(concept.id).set_bnf(concept.get_bnf()) # update bnf in cache diff --git a/src/core/concept.py b/src/core/concept.py index 8ba206b..c593994 100644 --- a/src/core/concept.py +++ b/src/core/concept.py @@ -694,6 +694,33 @@ class CC: self.end = end return self + def to_compare(self, other, to_compare_delegate): + """ + Transform other into CNC, to ease the comparison + :param other: + :param to_compare_delegate: + :return: + """ + + if isinstance(other, CC): + return other + + if isinstance(other, Concept): + if self.exclude_body: + compiled = {k: v for k, v in other.get_compiled().items() if k != ConceptParts.BODY} + else: + compiled = other.get_compiled() + + self_compile_to_use = self.compiled or compiled + + compiled = to_compare_delegate(self_compile_to_use, compiled, to_compare_delegate) + return CC(other, + self.source, + self.exclude_body, + **compiled) + + raise NotImplementedError(f"CC, {other=}") + @dataclass() class CB: @@ -825,8 +852,8 @@ class CIO: self.concept_id = concept.id self.concept = concept self.source = source - self.start = -1 - self.end = -1 + self.start = None + self.end = None def set_concept(self, concept): self.concept = concept diff --git a/src/core/global_symbols.py b/src/core/global_symbols.py index 4a70f93..164b05e 100644 --- a/src/core/global_symbols.py +++ b/src/core/global_symbols.py @@ -47,9 +47,15 @@ class RemovedType(CustomType): super(RemovedType, self).__init__("**Removed**") +class NoFirstTokenType(CustomType): + def __init__(self): + super(NoFirstTokenType, self).__init__("**NoFirstToken**") + + NotInit = NotInitType() NotFound = NotFoundType() Removed = RemovedType() +NoFirstToken = NoFirstTokenType() class ErrorObj: diff --git a/src/core/sheerka/ExecutionContext.py b/src/core/sheerka/ExecutionContext.py index cd66813..2a35027 100644 --- a/src/core/sheerka/ExecutionContext.py +++ b/src/core/sheerka/ExecutionContext.py @@ -199,6 +199,16 @@ class ExecutionContext: self._push = None def 
add_preprocess(self, name, **kwargs): + """ + PreProcess item are used during the parsing and the evaluation of the ReturnValueConcept + Using them, you can twitch the behaviour of parser and evaluator (you can disable them for instance) + example : + context.add_preprocess(BaseEvaluator.get_name("priority15"), enabled=False) + context.add_preprocess(BaseEvaluator.get_name("all_priority15"), priority=99) + :param name: + :param kwargs: + :return: + """ preprocess = self.sheerka.new(BuiltinConcepts.EVALUATOR_PRE_PROCESS) preprocess.set_value("preprocess_name", name) for k, v in kwargs.items(): diff --git a/src/core/sheerka/Sheerka.py b/src/core/sheerka/Sheerka.py index 0b85905..7a33f6d 100644 --- a/src/core/sheerka/Sheerka.py +++ b/src/core/sheerka/Sheerka.py @@ -734,7 +734,7 @@ class Sheerka(Concept): if not isinstance(obj, Concept): return True - return obj.key not in (BuiltinConcepts.UNKNOWN_CONCEPT, BuiltinConcepts.UNKNOWN_RULE) + return obj.key not in (None, BuiltinConcepts.UNKNOWN_CONCEPT, BuiltinConcepts.UNKNOWN_RULE) @staticmethod def isinstance(a, b): @@ -879,6 +879,13 @@ class Sheerka(Concept): return concept + @staticmethod + def deepdiff(a, b): + from deepdiff import DeepDiff + ddiff = DeepDiff(a, b, ignore_order=True) + print(ddiff) + return ddiff + def to_profile(): sheerka = Sheerka() diff --git a/src/core/sheerka/services/SheerkaConceptManager.py b/src/core/sheerka/services/SheerkaConceptManager.py index b247330..fa73f2c 100644 --- a/src/core/sheerka/services/SheerkaConceptManager.py +++ b/src/core/sheerka/services/SheerkaConceptManager.py @@ -1,5 +1,6 @@ +import re from dataclasses import dataclass -from typing import Set +from typing import Set, List, Union import core.utils from cache.Cache import Cache @@ -12,9 +13,10 @@ from core.builtin_concepts_ids import BuiltinConcepts, AllBuiltinConcepts, Built from core.builtin_helpers import ensure_concept, ensure_bnf from core.concept import Concept, DEFINITION_TYPE_DEF, DEFINITION_TYPE_BNF, 
freeze_concept_attrs, ConceptMetadata, \ VARIABLE_PREFIX -from core.global_symbols import EVENT_CONCEPT_CREATED, NotInit, NotFound, ErrorObj, EVENT_CONCEPT_DELETED +from core.global_symbols import EVENT_CONCEPT_CREATED, NotInit, NotFound, ErrorObj, EVENT_CONCEPT_DELETED, NoFirstToken from core.sheerka.services.sheerka_service import BaseService from core.tokenizer import Tokenizer, TokenKind +from parsers.BnfNodeParser import RegExDef from sdp.sheerkaDataProvider import SheerkaDataProviderDuplicateKeyError BASE_NODE_PARSER_CLASS = "parsers.BaseNodeParser.BaseNodeParser" @@ -98,6 +100,8 @@ class SheerkaConceptManager(BaseService): CONCEPTS_BY_FIRST_KEYWORD_ENTRY = "ConceptManager:Concepts_By_First_Keyword" RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY = "ConceptManager:Resolved_Concepts_By_First_Keyword" + CONCEPTS_BY_REGEX_ENTRY = "ConceptManager:Concepts_By_Regex" + CONCEPTS_BNF_DEFINITIONS_ENTRY = "ConceptManager:Concepts_BNF_Definitions" def __init__(self, sheerka): @@ -105,6 +109,7 @@ class SheerkaConceptManager(BaseService): self.forbidden_meta = {"is_builtin", "key", "id", "props", "variables"} self.allowed_meta = {attr for attr in vars(ConceptMetadata) if not attr.startswith("_") and attr not in self.forbidden_meta} + self.compiled_concepts_by_regex = [] def initialize(self): self.sheerka.bind_service_method(self.create_new_concept, True) @@ -119,6 +124,7 @@ class SheerkaConceptManager(BaseService): self.sheerka.bind_service_method(self.get_by_id, False, visible=False) self.sheerka.bind_service_method(self.is_not_a_variable, False, visible=False) self.sheerka.bind_service_method(self.get_concepts_by_first_token, False, visible=False) + self.sheerka.bind_service_method(self.get_concepts_by_first_regex, False, visible=False) self.sheerka.bind_service_method(self.get_concepts_bnf_definitions, False, visible=False) self.sheerka.bind_service_method(self.clear_bnf_definition, True, visible=False) @@ -145,6 +151,9 @@ class SheerkaConceptManager(BaseService): cache = 
DictionaryCache().auto_configure(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) self.sheerka.om.register_cache(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, cache, persist=False) + cache = DictionaryCache().auto_configure(self.CONCEPTS_BY_REGEX_ENTRY) + self.sheerka.om.register_cache(self.CONCEPTS_BY_REGEX_ENTRY, cache) + cache = Cache().auto_configure(self.CONCEPTS_BNF_DEFINITIONS_ENTRY) self.sheerka.om.register_cache(self.CONCEPTS_BNF_DEFINITIONS_ENTRY, cache, persist=False) @@ -158,6 +167,14 @@ class SheerkaConceptManager(BaseService): res = self.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword) self.sheerka.om.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, res.body) + # init the regular expression + self.sheerka.om.get(self.CONCEPTS_BY_REGEX_ENTRY, None) + from_db = self.sheerka.om.current_cache_manager().copy(self.CONCEPTS_BY_REGEX_ENTRY) + concepts_by_first_regex = {RegExDef().deserialize(k): v for k, v in from_db.items()} + res = self.compile_concepts_by_first_regex(context, concepts_by_first_regex) + self.compiled_concepts_by_regex.clear() + self.compiled_concepts_by_regex.extend(res.body) + def initialize_builtin_concepts(self): """ Initializes the builtin concepts @@ -201,9 +218,9 @@ class SheerkaConceptManager(BaseService): concept.init_key() init_bnf_ret_value = None - ontology = sheerka.om + om = sheerka.om - if ontology.exists(self.CONCEPTS_BY_HASH_ENTRY, concept.get_definition_hash()): + if om.exists(self.CONCEPTS_BY_HASH_ENTRY, concept.get_definition_hash()): error = SheerkaDataProviderDuplicateKeyError(self.CONCEPTS_BY_KEY_ENTRY + "." 
+ concept.key, concept) return sheerka.ret( self.NAME, @@ -220,33 +237,44 @@ class SheerkaConceptManager(BaseService): except Exception as ex: return sheerka.ret(self.NAME, False, ex.args[0]) - # compute new concepts_by_first_keyword - init_ret_value = self.compute_concepts_by_first_token(context, [concept], True) + # compute first token and/or first regex + init_ret_value = self.compute_concepts_by_first_item(context, [concept], True) if not init_ret_value.status: return sheerka.ret(self.NAME, False, ErrorConcept(init_ret_value.value)) - concepts_by_first_keyword = init_ret_value.body + by_first_keyword, by_first_regex = init_ret_value.body # computes resolved concepts_by_first_keyword - init_ret_value = self.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword) + init_ret_value = self.resolve_concepts_by_first_keyword(context, by_first_keyword) if not init_ret_value.status: return sheerka.ret(self.NAME, False, ErrorConcept(init_ret_value.value)) resolved_concepts_by_first_keyword = init_ret_value.body + # compile regex + compile_ret = self.compile_concepts_by_first_regex(context, by_first_regex) + if not compile_ret.status: + return sheerka.ret(self.NAME, False, ErrorConcept(compile_ret.value)) + compiled_concepts_by_first_regex = compile_ret.body + # if everything is fine freeze_concept_attrs(concept) concept.freeze_definition_hash() - ontology.add_concept(concept) - ontology.put(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, concepts_by_first_keyword) - ontology.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved_concepts_by_first_keyword) + om.add_concept(concept) + om.put(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, by_first_keyword) + om.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved_concepts_by_first_keyword) + om.put(self.CONCEPTS_BY_REGEX_ENTRY, False, {k.serialize(): v for k, v in by_first_regex.items()}) + + # update the compiled regex + self.compiled_concepts_by_regex.clear() + 
self.compiled_concepts_by_regex.extend(compiled_concepts_by_first_regex) if concept.get_metadata().definition_type == DEFINITION_TYPE_DEF and concept.get_metadata().definition != concept.name: # allow search by definition when definition relevant - ontology.put(self.sheerka.CONCEPTS_BY_NAME_ENTRY, concept.get_metadata().definition, concept) + om.put(self.sheerka.CONCEPTS_BY_NAME_ENTRY, concept.get_metadata().definition, concept) # update references for ref in self.compute_references(concept): - ontology.put(self.CONCEPTS_REFERENCES_ENTRY, ref, concept.id) + om.put(self.CONCEPTS_REFERENCES_ENTRY, ref, concept.id) # TODO : this line seems to be useless # The grammar is never reset @@ -286,7 +314,7 @@ class SheerkaConceptManager(BaseService): # } # sheerka = self.sheerka - cache_manager = self.sheerka.om + om = self.sheerka.om if not to_add and not to_remove: return sheerka.ret(self.NAME, False, sheerka.err(NoModificationFound(concept))) @@ -301,23 +329,19 @@ class SheerkaConceptManager(BaseService): if res is not None: return res - # To update concept by first keyword - # first remove the old references - keywords = self.get_first_tokens(sheerka, concept) # keyword of the old concept - concepts_by_first_keyword = cache_manager.copy(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) - for keyword in keywords: - try: - concepts_by_first_keyword[keyword].remove(concept.id) - if len(concepts_by_first_keyword[keyword]) == 0: - del concepts_by_first_keyword[keyword] - except KeyError: # only occurs in unit tests when concepts are created without create_new() - pass + # To update concept by first keyword and first regex + # first remove old first token and first regex entries + concepts_by_first_keyword, concepts_by_regex = self._remove_concept_first_token_and_first_regex(concept) # and then update - init_ret_value = self.compute_concepts_by_first_token(context, [new_concept], False, concepts_by_first_keyword) + init_ret_value = self.compute_concepts_by_first_item(context, + 
[new_concept], + False, + concepts_by_first_keyword, + concepts_by_regex) if not init_ret_value.status: return sheerka.ret(self.NAME, False, ErrorConcept(init_ret_value.value)) - concepts_by_first_keyword = init_ret_value.body + concepts_by_first_keyword, concepts_by_regex = init_ret_value.body # computes resolved concepts_by_first_keyword init_ret_value = self.resolve_concepts_by_first_keyword(context, @@ -327,18 +351,30 @@ class SheerkaConceptManager(BaseService): return sheerka.ret(self.NAME, False, ErrorConcept(init_ret_value.value)) resolved_concepts_by_first_keyword = init_ret_value.body + # compile new regex + compile_ret = self.compile_concepts_by_first_regex(context, concepts_by_regex) + if not compile_ret.status: + return sheerka.ret(self.NAME, False, ErrorConcept(compile_ret.value)) + compiled_concepts_by_first_regex = compile_ret.body + # update concept that referenced the old concept and clear old references self.update_references(context, concept, new_concept, to_add) for ref in self.compute_references(concept): - cache_manager.delete(self.CONCEPTS_REFERENCES_ENTRY, ref, concept.id) + om.delete(self.CONCEPTS_REFERENCES_ENTRY, ref, concept.id) # compute new references for ref in self.compute_references(new_concept): - cache_manager.put(self.CONCEPTS_REFERENCES_ENTRY, ref, new_concept.id) + om.put(self.CONCEPTS_REFERENCES_ENTRY, ref, new_concept.id) - cache_manager.update_concept(concept, new_concept) - cache_manager.put(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, concepts_by_first_keyword) - cache_manager.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved_concepts_by_first_keyword) + # everything is ok, update the caches + om.update_concept(concept, new_concept) + om.put(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, concepts_by_first_keyword) + om.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved_concepts_by_first_keyword) + om.put(self.CONCEPTS_BY_REGEX_ENTRY, False, {k.serialize(): v for k, v in 
concepts_by_regex.items()}) + + # update the compiled regex + self.compiled_concepts_by_regex.clear() + self.compiled_concepts_by_regex.extend(compiled_concepts_by_first_regex) # everything seems to be fine. Update the list of attributes # Caution. Must be done AFTER update_concept() @@ -349,6 +385,7 @@ class SheerkaConceptManager(BaseService): if modify_source: self._update_concept(context, concept, to_add, to_remove) + # KSI 2021-02-16 publish the modification of the concept only when someone needs it ret = sheerka.ret(self.NAME, True, sheerka.new(BuiltinConcepts.NEW_CONCEPT, body=new_concept)) return ret @@ -362,17 +399,44 @@ class SheerkaConceptManager(BaseService): # TODO : resolve concept first sheerka = context.sheerka - refs = self.sheerka.om.get(self.CONCEPTS_REFERENCES_ENTRY, concept.id) + + if not sheerka.is_known(concept): + return sheerka.ret(self.NAME, False, sheerka.err(ConceptNotFound(concept))) + + om = sheerka.om + + refs = om.get(self.CONCEPTS_REFERENCES_ENTRY, concept.id) if refs is not NotFound: refs_instances = [sheerka.new_from_template(c, c.key) for c in [self.get_by_id(ref) for ref in refs]] return sheerka.ret(self.NAME, False, sheerka.err(ConceptIsReferenced(refs_instances))) - try: - sheerka.om.remove_concept(concept) - sheerka.publish(context, EVENT_CONCEPT_DELETED, concept) - return sheerka.ret(self.NAME, True, sheerka.new(BuiltinConcepts.SUCCESS)) - except ConceptNotFound as ex: - return sheerka.ret(self.NAME, False, sheerka.err(ex)) + concepts_by_first_keyword, concepts_by_regex = self._remove_concept_first_token_and_first_regex(concept) + + # computes resolved concepts_by_first_keyword + init_ret_value = self.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword) + if not init_ret_value.status: + return sheerka.ret(self.NAME, False, ErrorConcept(init_ret_value.value)) + resolved_concepts_by_first_keyword = init_ret_value.body + + # compile new regex + compile_ret = self.compile_concepts_by_first_regex(context, 
concepts_by_regex) + if not compile_ret.status: + return sheerka.ret(self.NAME, False, ErrorConcept(compile_ret.value)) + compiled_concepts_by_first_regex = compile_ret.body + + # everything seems fine. I can commit the modification and remove + om.remove_concept(concept) + + om.put(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, concepts_by_first_keyword) + om.put(self.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved_concepts_by_first_keyword) + om.put(self.CONCEPTS_BY_REGEX_ENTRY, False, {k.serialize(): v for k, v in concepts_by_regex.items()}) + + # update the compiled regex + self.compiled_concepts_by_regex.clear() + self.compiled_concepts_by_regex.extend(compiled_concepts_by_first_regex) + + sheerka.publish(context, EVENT_CONCEPT_DELETED, concept) + return sheerka.ret(self.NAME, True, sheerka.new(BuiltinConcepts.SUCCESS)) def set_attr(self, concept, attribute, value): """ @@ -497,7 +561,7 @@ class SheerkaConceptManager(BaseService): if c.id == concept_id: return c - metadata = [(index_name, key), ("id", concept_id)] if concept_id else (index_name, key) + metadata = {index_name: key, "id": concept_id} if concept_id else {index_name: key} return self.sheerka.get_unknown(metadata) def update_references(self, context, concept, modified_concept=None, modifications=None): @@ -663,12 +727,39 @@ class SheerkaConceptManager(BaseService): concept.get_metadata().key = None if self._definition_has_changed(to_add) and concept.get_metadata().definition_type == DEFINITION_TYPE_BNF: concept.set_bnf(None) - ensure_bnf(context, concept) + ensure_bnf(context, concept, update_bnf_for_cached_concept=False) concept.init_key() return + def _remove_concept_first_token_and_first_regex(self, concept): + keywords_or_regex = self.get_first_items(self.sheerka, concept) # keyword of the old concept + concepts_by_first_keyword = self.sheerka.om.copy(self.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) + concepts_by_regex = self.sheerka.om.copy(self.CONCEPTS_BY_REGEX_ENTRY) + for item in 
keywords_or_regex: + try: + if isinstance(item, RegExDef): + serialized = item.serialize() + copy = concepts_by_regex[serialized].copy() + copy.remove(concept.id) + if len(copy) == 0: + del concepts_by_regex[serialized] + else: + concepts_by_regex[serialized] = copy + else: + copy = concepts_by_first_keyword[item].copy() + copy.remove(concept.id) + if len(copy) == 0: + del concepts_by_first_keyword[item] + else: + concepts_by_first_keyword[item] = copy + except KeyError: # only occurs in unit tests when concepts are created without create_new() + pass + + # return concepts_by_first_keyword, concepts_by_regex + return concepts_by_first_keyword, {RegExDef().deserialize(k): v for k, v in concepts_by_regex.items()} + @staticmethod def get_first_tokens(sheerka, concept): """ @@ -677,6 +768,30 @@ class SheerkaConceptManager(BaseService): :param concept: :return: """ + if concept.get_bnf(): + from parsers.BnfNodeParser import BnfNodeFirstTokenVisitor + bnf_visitor = BnfNodeFirstTokenVisitor(sheerka) + bnf_visitor.visit(concept.get_bnf()) + return [t for t in bnf_visitor.first_tokens if t is not NoFirstToken] + else: + keywords = concept.key.split() + for keyword in keywords: + if keyword.startswith(VARIABLE_PREFIX): + continue + + return [keyword] + + return None + + @staticmethod + def get_first_items(sheerka, concept) -> List[Union[str, RegExDef]]: + """ + Get all the first item needed by the concept + An item can either be a token, or regular expression + :param sheerka: + :param concept: + :return: List of string (if it's token or RegExDef if it's the definition of a regex) + """ if concept.get_bnf(): from parsers.BnfNodeParser import BnfNodeFirstTokenVisitor bnf_visitor = BnfNodeFirstTokenVisitor(sheerka) @@ -692,6 +807,55 @@ class SheerkaConceptManager(BaseService): return None + @staticmethod + def compute_concepts_by_first_item(context, + concepts, + use_sheerka=False, + previous_first_keywords=None, + previous_first_regex=None): + """ + Create two map, + one for 
describing the first token expected by a concept + one for the first regular expression + eg the dictionaries that go into CONCEPTS_BY_FIRST_KEYWORD_ENTRY and CONCEPTS_BY_REGEX_ENTRY + :param context: + :param concepts: lists of concepts to parse + :param use_sheerka: if True, updates sheerka + :param previous_first_keywords: + :param previous_first_regex: + :return: Returns two dictionaries : on for ALL first item entries, another one for all first regex entries + """ + sheerka = context.sheerka + if use_sheerka: + previous_first_keywords = sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) + previous_first_regex = sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) + previous_first_regex = {RegExDef().deserialize(k): v for k, v in previous_first_regex.items()} + else: + previous_first_keywords = previous_first_keywords or {} + previous_first_regex = previous_first_regex or {} + + for concept in concepts: + items = SheerkaConceptManager.get_first_items(sheerka, concept) + + if items is None: + # no first token found for a concept ? 
+ return sheerka.ret(sheerka.name, False, NoFirstTokenError(concept, concept.key)) + + for item in items: + if isinstance(item, RegExDef): + previous_first_regex.setdefault(item, []).append(concept.id) + else: + previous_first_keywords.setdefault(item, []).append(concept.id) + + # 'uniquify' the lists + for k, v in previous_first_keywords.items(): + previous_first_keywords[k] = core.utils.make_unique(v) + + for k, v in previous_first_regex.items(): + previous_first_regex[k] = core.utils.make_unique(v) + + return sheerka.ret("BaseNodeParser", True, (previous_first_keywords, previous_first_regex)) + @staticmethod def compute_concepts_by_first_token(context, concepts, use_sheerka=False, previous_entries=None): """ @@ -812,6 +976,19 @@ class SheerkaConceptManager(BaseService): return sheerka.ret("BaseNodeParser", True, res) + @staticmethod + def compile_concepts_by_first_regex(context, concepts_by_first_regex): + res = [] + + try: + for k, v in concepts_by_first_regex.items(): + flags = RegExDef.compile_flags(k.ignore_case, k.multiline, k.explicit_flags) + res.append((re.compile(k.to_match, flags), v)) + except Exception as ex: + return context.sheerka.ret("BaseNodeParser", False, ex) + + return context.sheerka.ret("BaseNodeParser", True, res) + def get_concepts_by_first_token(self, token, to_keep, custom=None, to_map=None, strip_quotes=False, parser=None): """ Tries to find if there are concepts that match the value of the token @@ -853,5 +1030,19 @@ class SheerkaConceptManager(BaseService): return core.utils.make_unique(result + custom_concepts, lambda c: c.concept.id if hasattr(c, "concept") else c.id) + def get_concepts_by_first_regex(self, expr, pos): + """ + Go thru all the declared regular expressions and try to see if there is a match + :param expr: + :param pos: + :return: + """ + result = [] + for compiled_regex, concept_ids in self.compiled_concepts_by_regex: + if compiled_regex.match(expr, pos): + result.extend([self.sheerka.get_by_id(concept_id) for 
concept_id in concept_ids]) + + return result + def get_concepts_bnf_definitions(self): return self.sheerka.om.current_cache_manager().caches[self.CONCEPTS_BNF_DEFINITIONS_ENTRY].cache diff --git a/src/core/sheerka/services/SheerkaDebugManager.py b/src/core/sheerka/services/SheerkaDebugManager.py index af0c1d0..cf4f44b 100644 --- a/src/core/sheerka/services/SheerkaDebugManager.py +++ b/src/core/sheerka/services/SheerkaDebugManager.py @@ -239,14 +239,17 @@ class ConsoleDebugLogger(BaseDebugLogger): :param kwargs: :return: """ - raw = kwargs.pop('raw', None) if not self.debug_manager.compute_debug_concept(self.context, self.service_name, self.method_name, concept.id, self.debug_id): return + raw = kwargs.pop('raw', None) + color = kwargs.pop('color', None) str_vars = raw if raw else pp.pformat(kwargs) if kwargs else "" + if color: + str_vars = CCM[color] + str_vars + CCM['reset'] text = " - " + text if text is not None else "" colon = ": " if str_vars else "" str_text = f"{CCM['cyan']}..concept#{concept.id}{text}{colon} {CCM['reset']}" diff --git a/src/core/sheerka/services/SheerkaEvaluateConcept.py b/src/core/sheerka/services/SheerkaEvaluateConcept.py index 698f63a..2c9a684 100644 --- a/src/core/sheerka/services/SheerkaEvaluateConcept.py +++ b/src/core/sheerka/services/SheerkaEvaluateConcept.py @@ -5,6 +5,7 @@ from core.builtin_helpers import expect_one, only_successful, evaluate, ensure_c from core.concept import Concept, DoNotResolve, ConceptParts, InfiniteRecursionResolved, AllConceptParts, \ concept_part_value from core.global_symbols import NotInit +from core.rule import Rule from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager from core.sheerka.services.SheerkaExecute import ParserInput from core.sheerka.services.sheerka_service import BaseService @@ -421,6 +422,9 @@ class SheerkaEvaluateConcept(BaseService): else: return evaluated + elif isinstance(to_resolve, Rule): + raise NotImplementedError() # how to resolve rules ? 
+ # otherwise, execute all return values to find out what is the value else: # update short term memory with current concept variables diff --git a/src/core/sheerka/services/SheerkaExecute.py b/src/core/sheerka/services/SheerkaExecute.py index b417de9..d5d307b 100644 --- a/src/core/sheerka/services/SheerkaExecute.py +++ b/src/core/sheerka/services/SheerkaExecute.py @@ -22,7 +22,7 @@ class ParserInput: Helper class that tokenizes the input once for all """ - def __init__(self, text, tokens=None, start=None, end=None, yield_oef=True): + def __init__(self, text, tokens=None, length=None, start=None, end=None, yield_oef=True): self.text = text self.tokens = tokens or None if self.tokens: @@ -38,13 +38,13 @@ class ParserInput: last_token.line, last_token.column + 1)] - self.length = None # to be computed in reset() + self.length = length # to be computed (again) in reset() self.yield_oef = yield_oef self.start = start or 0 if end: - self.original_end = end + 1 - self.end = self.original_end + self.original_end = end # forced index of the last token + self.end = self.original_end # index of the last token => len(tokens) - 1 if full tokens else: self.original_end = self.end = None @@ -61,30 +61,43 @@ class ParserInput: return f"ParserInput({from_tokens}'{self.text}')" def reset(self, yield_oef=None): + + def _get_end_from_yield_eof(_length, _yield_oef): + return _length - 1 if _yield_oef else _length - 2 + if yield_oef is None: yield_oef = self.yield_oef # make sure tokens is correctly initialized if self.tokens is None: + # the eof if forced, but will not be yield if not set to. 
self.tokens = list(Tokenizer(self.text, yield_eof=True)) + self.length = len(self.tokens) + if self.original_end is None: - self.end = len(self.tokens) if yield_oef else len(self.tokens) - 1 + self.end = _get_end_from_yield_eof(self.length, yield_oef) else: - self.end = self.original_end if self.original_end <= len(self.tokens) else self.tokens + self.end = self.original_end if self.original_end < self.length else \ + _get_end_from_yield_eof(self.length, yield_oef) self.pos = self.start - 1 self.token = None return self def as_text(self, custom_switcher=None, tracker=None): + if not self.tokens or self.end is None: + # as_text is requested before reset(). + # It means that we want the original text + return self.text + if custom_switcher is None: if self.sub_text: return self.sub_text - if self.start == 0 and self.end == self.length: + if self.start == 0 and self.end == self.length - 1: self.sub_text = self.text return self.sub_text - self.sub_text = core.utils.get_text_from_tokens(self.tokens[self.start:self.end]) + self.sub_text = core.utils.get_text_from_tokens(self.tokens[self.start:self.end + 1]) return self.sub_text else: return core.utils.get_text_from_tokens(self.as_tokens(), custom_switcher, tracker) @@ -92,16 +105,16 @@ class ParserInput: def as_tokens(self): if self.sub_tokens: return self.sub_tokens - if self.start == 0 and self.end == self.length: + if self.start == 0 and self.end == self.length - 1: self.sub_tokens = self.tokens return self.sub_tokens - self.sub_tokens = self.tokens[self.start:self.end] + self.sub_tokens = self.tokens[self.start:self.end + 1] return self.sub_tokens def next_token(self, skip_whitespace=True): self.pos += 1 - if self.pos >= self.end: + if self.pos > self.end: return False self.token = self.tokens[self.pos] @@ -111,11 +124,11 @@ class ParserInput: if skip_whitespace: while self.token.type in (TokenKind.WHITESPACE, TokenKind.NEWLINE): self.pos += 1 - if self.pos == self.end: + if self.pos > self.end: return False 
self.token = self.tokens[self.pos] - return self.pos < self.end + return self.pos <= self.end def the_token_after(self, skip_whitespace=True): """ @@ -123,13 +136,13 @@ class ParserInput: Never returns None (returns TokenKind.EOF instead) """ my_pos = self.pos + 1 - if my_pos >= self.end: + if my_pos > self.end: return Token(TokenKind.EOF, "", -1, -1, -1) if skip_whitespace: while self.tokens[my_pos].type in (TokenKind.WHITESPACE, TokenKind.NEWLINE): my_pos += 1 - if my_pos == self.end: + if my_pos > self.end: return Token(TokenKind.EOF, "", -1, -1, -1) return self.tokens[my_pos] @@ -140,7 +153,7 @@ class ParserInput: :param pos: :return: True is pos is a valid position False otherwise """ - if pos < 0 or pos >= self.end: + if pos < 0 or pos > self.end: self.token = None return False @@ -355,10 +368,10 @@ class SheerkaExecute(BaseService): if pi is NotFound: # when CacheManager.cache_only is True pi = ParserInput(text) self.pi_cache.put(text, pi) - return ParserInput(text, pi.tokens) # new instance, but no need to tokenize the text again + return ParserInput(text, tokens=pi.tokens, length=pi.length) # new instance, but no need to tokenize the text again key = text or core.utils.get_text_from_tokens(tokens) - pi = ParserInput(key, tokens) + pi = ParserInput(key, tokens=tokens, length=len(tokens)) self.pi_cache.put(key, pi) return pi diff --git a/src/core/sheerka/services/SheerkaMemory.py b/src/core/sheerka/services/SheerkaMemory.py index de4b168..5d017b0 100644 --- a/src/core/sheerka/services/SheerkaMemory.py +++ b/src/core/sheerka/services/SheerkaMemory.py @@ -144,7 +144,7 @@ class SheerkaMemory(BaseService): :param concept: :return: """ - if self.sheerka.during_initialisation: + if self.sheerka.during_initialisation or self.sheerka.during_restore: return self.registration[key] = concept diff --git a/src/core/tokenizer.py b/src/core/tokenizer.py index f702965..51834bf 100644 --- a/src/core/tokenizer.py +++ b/src/core/tokenizer.py @@ -49,8 +49,8 @@ class 
TokenKind(Enum): DEGREE = "degree" # ° WORD = "word" EQUALSEQUALS = "==" - VAR_DEF = "__var__" - REGEX = "r'xxx' or r\"xxx\" or r|xxx| or r/xxx/" + VAR_DEF = "concept variable" # __var__ + REGEX = "regex" # r'xxx' or r\"xxx\" or r|xxx| or r/xxx/ but not r:xxx: which means rules @dataclass() @@ -73,7 +73,10 @@ class Token: if self._strip_quote: return self._strip_quote - self._strip_quote = self.value[1:-1] if self.type == TokenKind.STRING else self.value + if self.type in (TokenKind.STRING, TokenKind.REGEX): + self._strip_quote = self.value[1:-1] + else: + self._strip_quote = self.value return self._strip_quote @property @@ -120,6 +123,8 @@ class Token: elif self.type == TokenKind.RULE: from core.utils import str_concept return str_concept(self.value, prefix="r:") + elif self.type == TokenKind.REGEX: + return "r" + self.value else: return str(self.value) diff --git a/src/evaluators/DefConceptEvaluator.py b/src/evaluators/DefConceptEvaluator.py index 61d7f3e..f9a15b2 100644 --- a/src/evaluators/DefConceptEvaluator.py +++ b/src/evaluators/DefConceptEvaluator.py @@ -1,3 +1,5 @@ +from dataclasses import dataclass + import core.utils from core.ast_helpers import UnreferencedVariablesVisitor from core.builtin_concepts import ParserResultConcept, ReturnValueConcept, BuiltinConcepts @@ -11,6 +13,29 @@ from parsers.DefConceptParser import DefConceptNode, NameNode from parsers.PythonParser import get_python_node +@dataclass(eq=True, frozen=True) +class MandatoryVariable: + """ + When we are searching for variables, we are searching for potential variable + So if the variable found has no match in the concept definition, it's not a problem + for example: + def concept foo x as isinstance(x, str) + {x, str} will be detected as potential variable, but 'str' will find no match. 
+ + But there are cases where the variable found must exist, otherwise, it's an error + example: + def concept foo from bnf xxx + 'xxx' is detected as a variable (assuming that there is no concept named 'xxx' and a match must be + found in the the name of the variable + + To distinguish between mandatory and not mandatory variable, we use MandatoryVariable + """ + name: str + + def __hash__(self): + return hash(("MandatoryVariable", self.name)) + + class ConceptOrRuleNameVisitor(ParsingExpressionVisitor): """ Gets the concepts referenced by BNF @@ -29,6 +54,9 @@ class ConceptOrRuleNameVisitor(ParsingExpressionVisitor): else: self.names.add(node.concept) + def visit_VariableExpression(self, node): + self.names.add(MandatoryVariable(node.rule_name)) + def visit_all(self, node): if node.rule_name: self.names.add(node.rule_name) @@ -60,11 +88,13 @@ class DefConceptEvaluator(OneReturnValueEvaluator): # validate the node variables_found = set() + mandatory_variables = set() # these variable MUST have a match in the name (if the name is not None) concept = Concept(str(def_concept_node.name)) concept.get_metadata().definition_type = def_concept_node.definition_type name_to_use = self.get_name_to_use(def_concept_node) + # get variables for prop in ("definition", "where", "pre", "post", "body", "ret"): part_ret_val = getattr(def_concept_node, prop) @@ -87,13 +117,26 @@ class DefConceptEvaluator(OneReturnValueEvaluator): # try to find what can be a property for p in self.get_variables(context, part_ret_val, name_to_use): - variables_found.add(p) + if isinstance(p, MandatoryVariable): + variables_found.add(p.name) + mandatory_variables.add(p.name) + else: + variables_found.add(p) # add variables by order of appearance when possible for name_part in name_to_use: if name_part in variables_found: concept.def_var(name_part, None) + # check that all mandatory variables are defined in the name + # KSI: 2021-02-17 + # The mandatory variables come for bnf definition where it was not 
possible to resolve to a concept + # So rather that issuing a 'UnresolvedVariableError' I prefer UNKNOWN_CONCEPT + if (diff := mandatory_variables.difference(set(name_to_use))) != set(): + unknown_concepts = [sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body={"name": c}) for c in sorted(diff)] + error = sheerka.new(BuiltinConcepts.ERROR, body=unknown_concepts) + return sheerka.ret(self.name, False, error, parents=[return_value]) + # add the remaining properties # They mainly come from BNF definition for p in variables_found: diff --git a/src/parsers/BaseNodeParser.py b/src/parsers/BaseNodeParser.py index 23cb03a..ee8ddda 100644 --- a/src/parsers/BaseNodeParser.py +++ b/src/parsers/BaseNodeParser.py @@ -441,6 +441,11 @@ class GrammarErrorNode(ParsingError): message: str +@dataclass() +class NoMatchingTokenError(ParsingError): + pos: int + + class SyaAssociativity(Enum): Left = "left" Right = "right" @@ -720,6 +725,35 @@ class CNC(CN): txt += f", {k}='{v}'" return txt + ")" + def to_compare(self, other, to_compare_delegate): + """ + Transform other into CNC, to ease the comparison + :param other: + :param to_compare_delegate: + :return: + """ + + if isinstance(other, CNC): + return other + + if isinstance(other, ConceptNode): + if self.exclude_body: + compiled = {k: v for k, v in other.concept.get_compiled().items() if k != ConceptParts.BODY} + else: + compiled = other.concept.get_compiled() + + self_compile_to_use = self.compiled or compiled + + compiled = to_compare_delegate(self_compile_to_use, compiled, to_compare_delegate) + return CNC(other.concept, + other.start if self.start is not None else None, + other.end if self.end is not None else None, + other.source if self.source is not None else None, + self.exclude_body, + **compiled) + + raise NotImplementedError("CNC") + class UTN(HelperWithPos): """ @@ -763,6 +797,24 @@ class UTN(HelperWithPos): txt += f", end={self.end}" return txt + ")" + def to_compare(self, other, to_compare_delegate): + """ + Transform 
other into CNC, to ease the comparison + :param other: + :param to_compare_delegate: + :return: + """ + + if isinstance(other, UTN): + return other + + if isinstance(other, UnrecognizedTokensNode): + return UTN(other.source, + other.start, + other.end) + + raise NotImplementedError("UTN") + class RN(HelperWithPos): """ @@ -840,9 +892,19 @@ class BaseNodeParser(BaseParser): :return: """ from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager - concepts_by_first_keyword = SheerkaConceptManager.compute_concepts_by_first_token(context, concepts).body - resolved = SheerkaConceptManager.resolve_concepts_by_first_keyword(context, concepts_by_first_keyword).body + service = context.sheerka.services[SheerkaConceptManager.NAME] + by_token, by_regex = SheerkaConceptManager.compute_concepts_by_first_item(context, concepts).body + context.sheerka.om.put(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY, + False, + {k.serialize(): v for k, v in by_regex.items()}) + compiled = service.compile_concepts_by_first_regex(context, by_regex).body + service.compiled_concepts_by_regex.clear() + service.compiled_concepts_by_regex.extend(compiled) + + resolved = SheerkaConceptManager.resolve_concepts_by_first_keyword(context, by_token).body context.sheerka.om.put(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY, False, resolved) + + return self diff --git a/src/parsers/BnfDefinitionParser.py b/src/parsers/BnfDefinitionParser.py index 82da669..abc35c8 100644 --- a/src/parsers/BnfDefinitionParser.py +++ b/src/parsers/BnfDefinitionParser.py @@ -4,7 +4,7 @@ from core.sheerka.Sheerka import ExecutionContext from core.tokenizer import Tokenizer, Token, TokenKind, LexerError from parsers.BaseParser import BaseParser, UnexpectedTokenParsingError, UnexpectedEofParsingError from parsers.BnfNodeParser import OrderedChoice, Sequence, Optional, ZeroOrMore, OneOrMore, \ - ConceptExpression, StrMatch + ConceptExpression, StrMatch, RegExMatch, VariableExpression class 
BnfDefinitionParser(BaseParser): @@ -231,9 +231,11 @@ class BnfDefinitionParser(BaseParser): if token.type == TokenKind.CONCEPT: self.next_token() concept = self.sheerka.new((token.value[0], token.value[1])) - expr = ConceptExpression(concept) - # expr = ConceptGroupExpression(concept) if self.sheerka.isaset(self.context, concept) \ - # else ConceptExpression(concept) + if not self.sheerka.is_known(concept): + self.add_error(concept) + return None + + expr = ConceptExpression(concept, rule_name=concept.name) return self.eat_rule_name_if_needed(expr) if token.type in (TokenKind.IDENTIFIER, TokenKind.KEYWORD): @@ -245,20 +247,19 @@ class BnfDefinitionParser(BaseParser): # (for example of recursive bnf definition) if self.context.obj and hasattr(self.context.obj, "name"): if concept_name == str(self.context.obj.name): - return self.eat_rule_name_if_needed(ConceptExpression(concept_name)) + return self.eat_rule_name_if_needed(ConceptExpression(concept_name)) # 2021-02-17 no rule name ? concept = self.context.get_concept(concept_name) if not self.sheerka.is_known(concept): - self.add_error(concept) - return None + expr = VariableExpression(concept_name) + return self.eat_rule_name_if_needed(expr) elif hasattr(concept, "__iter__"): self.add_error( self.sheerka.new(BuiltinConcepts.CANNOT_RESOLVE_CONCEPT, body=("key", concept_name))) return None else: - expr = ConceptExpression(concept) - expr.rule_name = concept.name + expr = ConceptExpression(concept, rule_name=concept.name) return self.eat_rule_name_if_needed(expr) if token.type == TokenKind.STRING: @@ -272,6 +273,11 @@ class BnfDefinitionParser(BaseParser): ret = Sequence(*elements) return self.eat_rule_name_if_needed(ret) + if token.type == TokenKind.REGEX: + self.next_token() + ret = RegExMatch(core.utils.strip_quotes(token.strip_quote)) + return self.eat_rule_name_if_needed(ret) + ret = StrMatch(core.utils.strip_quotes(token.value)) self.next_token() return self.eat_rule_name_if_needed(ret) diff --git 
a/src/parsers/BnfNodeParser.py b/src/parsers/BnfNodeParser.py index 57b5d1f..3476c05 100644 --- a/src/parsers/BnfNodeParser.py +++ b/src/parsers/BnfNodeParser.py @@ -6,64 +6,85 @@ # Arpeggio: A flexible PEG parser for Python, # Knowledge-Based Systems, 2016, 95, 71 - 74, doi:10.1016/j.knosys.2015.12.004 ##################################################################################################### +import re from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field from operator import attrgetter +from typing import List import core.builtin_helpers import core.utils from cache.Cache import Cache from core.builtin_concepts import BuiltinConcepts from core.concept import DEFINITION_TYPE_BNF, DoNotResolve, ConceptParts, Concept +from core.global_symbols import NotFound from core.sheerka.services.SheerkaExecute import ParserInput from core.tokenizer import Tokenizer, TokenKind, Token from core.utils import CONSOLE_COLORS_MAP as CCM -from parsers.BaseNodeParser import BaseNodeParser, GrammarErrorNode, UnrecognizedTokensNode, ConceptNode, LexerNode +from parsers.BaseNodeParser import BaseNodeParser, GrammarErrorNode, UnrecognizedTokensNode, ConceptNode, \ + NoMatchingTokenError, RuleNode, SourceCodeNode, SourceCodeWithConceptNode PARSERS = ["Sequence", "Sya", "Python"] +VARIABLE_EXPR_PARSER = ["Sequence", "Sya", "Python", "Bnf"] -@dataclass -class ParsingContext: +@dataclass(eq=True) +class RegExDef: + to_match: str = None + ignore_case: bool = True + multiline: bool = None + explicit_flags: int = re.MULTILINE + + def __hash__(self): + return hash((self.to_match, self.ignore_case, self.multiline, self.explicit_flags)) + + @staticmethod + def compile_flags(ignore_case, multiline, explicit_flags): + flags = explicit_flags + if multiline is True: + flags |= re.DOTALL + if multiline is False and flags & re.DOTALL: + flags -= re.DOTALL + if ignore_case is True: + flags |= re.IGNORECASE + if ignore_case is False and 
flags & re.IGNORECASE: + flags -= re.IGNORECASE + return flags + + def serialize(self): + return f"{self.to_match}__!##ZZSEPZZ##!__{self.ignore_case}|{self.multiline}|{self.explicit_flags}" + + def deserialize(self, txt): + parts = txt.split("__!##ZZSEPZZ##!__") + parts2 = parts[1].split("|") + self.to_match = parts[0] + self.ignore_case = None if parts2[0] == "None" else True if parts2[0] == "True" else False + self.multiline = None if parts2[1] == "None" else True if parts2[1] == "True" else False + self.explicit_flags = int(parts2[2]) + + return self + + +class ParseTreeNode: + def __init__(self, parsing_expression, start: int, end: int, tokens: List[Token] = None, source: str = None): + self.parsing_expression = parsing_expression + self.start = start + self.end = end + self.tokens = tokens + self.source = source + + if self.source is None: + self.source = core.utils.get_text_from_tokens(self.tokens) + + +class NonTerminalNode(ParseTreeNode): """ - Class used to allow backtracking when parsing UnOrderedChoice pexpression - It keeps the LexerNode parsed and the position of the parser right after the parsing - """ - node: LexerNode # node parsed - pos: int # position of the parser after the parsing - - def clone(self): - return ParsingContext(self.node.clone(), self.pos) - - def fix_tokens(self, parser_helper): - """ - When the nodes are fully created, make sure that their sources and tokens are correct - :param parser_helper: - :return: - """ - self.node.tokens = parser_helper.parser.parser_input.tokens[self.node.start: self.node.end + 1] - self.node.source = core.utils.get_text_from_tokens(self.node.tokens) - - def __mul__(self, other): - res = [self] - for i in range(other - 1): - res.append(self.clone()) - return res - - def __repr__(self): - res = f"ParsingContext('{self.node.get_debug()}', pos={self.pos})" - return res - - -class NonTerminalNode(LexerNode): - """ - A LexerNode is the result of the parsing of a parsing expression (pexpression) + A 
ParseTreeNode is the result of the parsing of a parsing expression (pexpression) NonTerminalNode when parsing a pexpression which has children (Sequence, OrderedChoice, Optional, Repetition...) """ def __init__(self, parsing_expression, start, end, tokens, children=None): - super().__init__(start, end, tokens) - self.parsing_expression = parsing_expression + super().__init__(parsing_expression, start, end, tokens) self.children = children def __repr__(self): @@ -95,15 +116,14 @@ class NonTerminalNode(LexerNode): return res + ".".join([c.get_debug() for c in self.children]) -class TerminalNode(LexerNode): +class TerminalNode(ParseTreeNode): """ - A LexerNode is the result of the parsing of a parsing expression (pexpression) + A ParseTreeNode is the result of the parsing of a parsing expression (pexpression) TerminalNode for StrMatch """ - def __init__(self, parsing_expression, start, end, value): - super().__init__(start, end, source=value) - self.parsing_expression = parsing_expression + def __init__(self, parsing_expression, start, end, source, value): + super().__init__(parsing_expression, start, end, source=source) self.value = value def __repr__(self): @@ -117,23 +137,25 @@ class TerminalNode(LexerNode): return self.parsing_expression == other.parsing_expression and \ self.start == other.start and \ self.end == other.end and \ + self.source == other.source and \ self.value == other.value def __hash__(self): - return hash((self.parsing_expression, self.start, self.end, self.value)) + return hash((self.parsing_expression, self.start, self.end, self.source, self.value)) def clone(self): - clone = TerminalNode(self.parsing_expression, self.start, self.end, self.value) + clone = TerminalNode(self.parsing_expression, self.start, self.end, self.source, self.value) return clone def get_debug(self): - return self.value + return str(self.value) class MultiNode: """" - A LexerNode is the result of the parsing of a parsing expression (pexpression) MultiNode is used by the 
UnorderedChoice parsing expression when multiple choices are found + It should inherit from ParseTreeNode, like its siblings TerminalNode and NonTerminal node + but I am too lazy to bother with start and end positions """ def __init__(self, results): @@ -157,6 +179,74 @@ class MultiNode: return self +@dataclass +class ParsingContext: + """ + Class used to allow backtracking when parsing UnOrderedChoice pexpression + It keeps the ParseTreeNode parsed and the position of the parser right after the parsing + """ + node: ParseTreeNode # node or nodes parsed + pos: int # position of the parser after the parsing + next_results: List[ParseTreeNode] = None # other node parsed, when known + variables: dict = field(default_factory=dict) # variables already seen + to_remove: bool = False # an error/inconsistency is detected. Remove this parsing context ASAP + + def clone(self): + next_result_clones = [n.clone() for n in self.next_results] if self.next_results else None + return ParsingContext(self.node.clone(), self.pos, next_result_clones, self.variables.copy(), False) + + def fix_tokens(self, parser_helper): + """ + When the nodes are fully created, make sure that their sources and tokens are correct + :param parser_helper: + :return: + """ + self.node.tokens = parser_helper.parser.parser_input.tokens[self.node.start: self.node.end + 1] + self.node.source = core.utils.get_text_from_tokens(self.node.tokens) + + def update_with_ptree_node(self, ptree_node, pos): + next_results = None + + if isinstance(ptree_node, list): + next_results = ptree_node[1:] + ptree_node = ptree_node[0] + + if ptree_node.end == -1: + # means that the node must not be added, but the parsing context is not in error + return + + if isinstance(ptree_node.parsing_expression, VariableExpression): + # check the variables consistency + var_name = ptree_node.parsing_expression.rule_name + if var_name in self.variables and self.variables[var_name] != ptree_node.source: + self.to_remove = True + return + 
self.variables[var_name] = ptree_node.source + + self.pos = pos + self.node.children.append(ptree_node) + self.node.end = ptree_node.end + if ptree_node.start < self.node.start: + # fix start pos when sequence stars with VariableExpression + self.node.start = ptree_node.start + + if next_results is not None: + self.next_results = next_results + + def __mul__(self, other): + res = [self] + for i in range(other - 1): + res.append(self.clone()) + return res + + def __repr__(self): + if isinstance(self.node, list): + res = f"ParsingContext('{[n.get_debug() for n in self.node]}', pos={self.pos})" + else: + res = f"ParsingContext('{self.node.get_debug()}', pos={self.pos})" + return res + + class ParsingExpression: log_sink = [] @@ -265,6 +355,25 @@ class ParsingExpression: str_rule_name = f":{current_rule_name}" if current_rule_name not in (None, current_concept.name) else "" return f"{self_name}({current_concept}{str_rule_name})" + @staticmethod + def debug_remaining_text(parser_helper): + remaining_text = parser_helper.get_parsing_text()[parser_helper.token.index:] + if len(remaining_text) > 50: + remaining_text = remaining_text[:47] + "..." 
+ return remaining_text + + @staticmethod + def debug_to_raw(variables): + res = "" + first = True + for k, v in variables.items(): + if not first: + res += ", " + res += f"{k}={v}" + first = False + + return res + class ConceptExpression(ParsingExpression): """ @@ -326,6 +435,188 @@ class ConceptExpression(ParsingExpression): return f"{parent_id}#{concept_id}({rule_name})" +class VariableExpression(ParsingExpression): + def __init__(self, rule_name): + super().__init__(rule_name=rule_name) + self.before_first_token_node = False + self.expected_variables = [self] + self.next_node_to_parse = None + + def __repr__(self): + return self.add_rule_name_if_needed(f"Var") + + def __eq__(self, other): + if not super().__eq__(other): + return False + + return isinstance(other, VariableExpression) + + def __hash__(self): + return hash(("VariableExpression", self.rule_name)) + + def init_parsing(self): + """ + Get the instance of the following VariableExpression if they exists, + :return: + """ + + next_node_to_parse = self.nodes[0] if len(self.nodes) > 0 else None + while isinstance(next_node_to_parse, VariableExpression): + self.expected_variables.append(next_node_to_parse) + next_node_to_parse = next_node_to_parse.nodes[0] if len(next_node_to_parse.nodes) > 0 else None + + self.next_node_to_parse = next_node_to_parse + + def get_nodes_sequences_when_variables_are_first(self, parser_helper): + if len(parser_helper.sequence) < len(self.expected_variables): + # variable(s) is/are expected. 
But nothing found + return None + + # only take the requested number of variables + nodes_sequence = parser_helper.sequence[-len(self.expected_variables):] + nodes_sequences = [nodes_sequence] + + return nodes_sequences + + def get_nodes_sequences_when_variables_are_last(self, parser_helper): + tokens = parser_helper.get_remaining_tokens() + start = parser_helper.pos + end = parser_helper.get_last_token_pos() + nodes_sequences = self.get_nodes_sequences_from_tokens(parser_helper, start, end, tokens) + if not nodes_sequences: + return nodes_sequences + + # only take the requested number of variables + sequences_to_keep = [] + for sequence in nodes_sequences: + if len(sequence) < len(self.expected_variables): + continue # not enough parameters to feed the VariableExpression + sequences_to_keep.append(sequence[:len(self.expected_variables)]) + + return sequences_to_keep + + def get_nodes_sequences_when_variables_are_in_between(self, parser_helper): + start = parser_helper.pos + end = parser_helper.get_last_token_pos() + # start by the end, to be the greediest + while end >= start: + parser_helper.seek(end) + node = self.next_node_to_parse.parse(parser_helper) + if node and node.end != -1: + break + end -= 1 + tokens = parser_helper.parser.parser_input.tokens[start:end] + parser_helper.seek(end) # for the next node + return self.get_nodes_sequences_from_tokens(parser_helper, start, end, tokens) + + def _parse(self, parser_helper): + + if parser_helper.debugger.is_enabled(): + debug_prefix = self.debug_prefix("VariableExpression", parser_helper) + debug_vars = {"pos": parser_helper.pos, + "expected variables": self.expected_variables, + "next to match": self.next_node_to_parse} + debug_text = self.debug_to_raw(debug_vars) + parser_helper.debug_concept(debug_prefix, color="cyan", raw=debug_text) + + if self.before_first_token_node: + nodes_sequences = self.get_nodes_sequences_when_variables_are_first(parser_helper) + elif not self.next_node_to_parse: + nodes_sequences = 
self.get_nodes_sequences_when_variables_are_last(parser_helper) + else: + nodes_sequences = self.get_nodes_sequences_when_variables_are_in_between(parser_helper) + + if nodes_sequences is None or self.has_unrecognized(nodes_sequences): + # nothing is recognized or only part is recognized + return None + + all_results = [] + + for nodes_sequence in nodes_sequences: + # this outer loop deals with when there a multiple choices + # ie, the result in either nodes_sequence_1 or nodes_sequence_2, etc.. + + ptree_nodes = [] + for variable_expr, node in zip(self.expected_variables, nodes_sequence): + # this inner loop deals with results with multiples concepts in a row + # ie the result is a sequence of node_1, then node_2, etc... + + resolved = self.get_resolved(node) + if resolved is None: + parser_helper.errors.append(f"Failed to recognize {node.source}") + break + + ptree_nodes.append(TerminalNode(variable_expr, node.start, node.end, node.source, resolved)) + + if len(ptree_nodes) != len(nodes_sequence): + # it means that we did not recognize all the nodes + # So it's a mismatch + continue + + # finally adds the results + if len(ptree_nodes) == 1: + all_results.append(ptree_nodes[0]) + else: + all_results.append(ptree_nodes) + + if len(all_results) == 0: + return None + + # every seems to be fine. We can pop the nodes from parser_helper used as variable + if self.before_first_token_node: + for i in range(len(self.expected_variables)): + parser_helper.sequence.pop() + + if len(all_results) == 1: + return all_results[0] + + # all results are valid, let's return them + parsing_contexts = [ParsingContext(ptree_node, parser_helper.pos) for ptree_node in all_results] + return MultiNode(parsing_contexts) + + @staticmethod + def get_resolved(node): + """ + Turn Lexer node into Concept, Rule or List[ReturnValueConcept], (basically what is + expected by SheerkaEvaluateConcept.resolve()) + May be merged with builtin_helpers.update_compiled() ? 
+ :param node: + :return: + """ + if isinstance(node, UnrecognizedTokensNode): + return None + + if isinstance(node, RuleNode): + return node.rule + + if isinstance(node, ConceptNode): + return node.concept + + if isinstance(node, (SourceCodeNode, SourceCodeWithConceptNode)): + return node.return_value + + raise NotImplementedError() + + @staticmethod + def get_nodes_sequences_from_tokens(parser_helper, start, end, tokens): + if len(tokens) == 0: + return None + + utn = UnrecognizedTokensNode(start, end, tokens) + nodes_sequences = core.builtin_helpers.get_lexer_nodes_from_unrecognized(parser_helper.parser.context, + utn, + VARIABLE_EXPR_PARSER) + return nodes_sequences + + @staticmethod + def has_unrecognized(nodes_sequence: list): + for n in nodes_sequence: + if isinstance(n, UnrecognizedTokensNode): + return True + + return False + + class Sequence(ParsingExpression): """ Will match sequence of parser expressions in exact order they are defined. @@ -337,14 +628,18 @@ class Sequence(ParsingExpression): if parser_helper.debugger.is_enabled(): debug_prefix = self.debug_prefix("Sequence", parser_helper) - parser_helper.debug_concept(debug_prefix, nodes=self.nodes) + debug_vars = {"pos": parser_helper.pos, + "nodes": self.nodes, + "to_match": self.debug_remaining_text(parser_helper)} + debug_text = self.debug_to_raw(debug_vars) + parser_helper.debug_concept(debug_prefix, color="cyan", raw=debug_text) + ntn = NonTerminalNode(self, init_pos, end_pos, None, []) parsing_contexts = [ParsingContext(ntn, parser_helper.pos)] - to_remove = [] to_append = [] for e in self.nodes: @@ -352,37 +647,38 @@ class Sequence(ParsingExpression): if parser_helper.debugger.is_enabled(): parser_helper.debug_concept(debug_prefix, node=e, pcontext=pcontext) - parser_helper.seek(pcontext.pos) - node = e.parse(parser_helper) + if pcontext.next_results: + node = pcontext.next_results + else: + parser_helper.seek(pcontext.pos) + node = e.parse(parser_helper) + if node is None: - 
to_remove.append(pcontext) + pcontext.to_remove = True elif isinstance(node, MultiNode): - clones = pcontext * len(node.results) # clones pcontext (but first item is pcontext) + clones = pcontext * len(node.results) # clones pcontext (and first item is current pcontext) to_append.extend(clones[1:]) for clone, node_pcontext in zip(clones, node.results): - clone.pos = node_pcontext.pos - clone.node.children.append(node_pcontext.node) - clone.node.end = node_pcontext.node.end - else: - if node.end != -1: # because returns -1 when no match - pcontext.pos = parser_helper.pos - pcontext.node.children.append(node) - pcontext.node.end = node.end + clone.update_with_ptree_node(node_pcontext.node, node_pcontext.pos) - for pcontext in to_remove: - parsing_contexts.remove(pcontext) + else: + pcontext.update_with_ptree_node(node, parser_helper.pos) + + # clean up and reorganize list of parsing_contexts parsing_contexts.extend(to_append) + core.utils.remove_from_list(parsing_contexts, lambda pc: pc.to_remove) if len(parsing_contexts) == 0: if parser_helper.debugger.is_enabled(): - parser_helper.debug_concept(debug_prefix + " All pcontexts are failed. Sequence failed") + parser_helper.debug_concept(debug_prefix, + raw="All pcontexts are failed. 
Sequence failed", + color="red") return None to_append.clear() - to_remove.clear() - # reset tokenizer the following pexpression + # reset tokenizer for the following pexpression parser_helper.seek(parsing_contexts[0].pos) # update nodes sources and tokens @@ -442,7 +738,9 @@ class UnOrderedChoice(ParsingExpression): if parser_helper.debugger.is_enabled(): debug_prefix = self.debug_prefix("UnOrderedChoice", parser_helper) - parser_helper.debug_concept(debug_prefix) + debug_text = {"pos": parser_helper.pos, "text": self.debug_remaining_text(parser_helper)} + parser_helper.debug_concept(debug_prefix, raw=f"{CCM['green']}{debug_text}{CCM['reset']}") + debug_text = "" for e in self.nodes: if isinstance(e, ConceptExpression) and e.concept.id in parser_helper.get_concepts_ids(): @@ -686,12 +984,16 @@ class StrMatch(Match): self.ignore_case == other.ignore_case and \ self.skip_white_space == other.skip_white_space + def __hash__(self): + return hash(("StrMatch", self.to_match, self.ignore_case, self.skip_white_space)) + def _parse(self, parser_helper): token = parser_helper.get_token() if parser_helper.debugger.is_enabled(): debug_prefix = self.debug_prefix("StrMatch", parser_helper) debug_text = f"pos={parser_helper.pos}, to_match={self.to_match}, token={token.str_value}" + parser_helper.debug_concept(debug_prefix, raw=f"{CCM['green']}{debug_text}{CCM['reset']}") m = token.str_value.lower() == self.to_match.lower() if self.ignore_case \ else token.strip_quote == self.to_match @@ -699,7 +1001,7 @@ class StrMatch(Match): if m: if parser_helper.debugger.is_enabled(): parser_helper.debug_concept(debug_prefix, raw=f"{CCM['green']}{debug_text}{CCM['reset']}") - node = TerminalNode(self, parser_helper.pos, parser_helper.pos, token.str_value) + node = TerminalNode(self, parser_helper.pos, parser_helper.pos, token.str_value, token.str_value) parser_helper.next_token(self.skip_white_space) return node @@ -708,67 +1010,77 @@ class StrMatch(Match): return None -# class 
RegExMatch(Match): -# ''' -# This Match class will perform input matching based on Regular Expressions. -# -# Args: -# to_match (regex string): A regular expression string to match. -# It will be used to create regular expression using re.compile. -# ignore_case(bool): If case insensitive match is needed. -# Default is None to support propagation from global parser setting. -# multiline(bool): allow regex to works on multiple lines -# (re.DOTALL flag). Default is None to support propagation from -# global parser setting. -# str_repr(str): A string that is used to represent this regex. -# re_flags: flags parameter for re.compile if neither ignore_case -# or multiple are set. -# -# ''' -# def __init__(self, to_match, rule_name='', root=False, ignore_case=None, -# multiline=None, str_repr=None, re_flags=re.MULTILINE): -# super(RegExMatch, self).__init__(rule_name, root) -# self.to_match_regex = to_match -# self.ignore_case = ignore_case -# self.multiline = multiline -# self.explicit_flags = re_flags -# -# self.to_match = str_repr if str_repr is not None else to_match -# -# def compile(self): -# flags = self.explicit_flags -# if self.multiline is True: -# flags |= re.DOTALL -# if self.multiline is False and flags & re.DOTALL: -# flags -= re.DOTALL -# if self.ignore_case is True: -# flags |= re.IGNORECASE -# if self.ignore_case is False and flags & re.IGNORECASE: -# flags -= re.IGNORECASE -# self.regex = re.compile(self.to_match_regex, flags) -# -# def __str__(self): -# return self.to_match -# -# def __unicode__(self): -# return self.__str__() -# -# def _parse(self, parser): -# c_pos = parser.position -# m = self.regex.match(parser.input, c_pos) -# if m: -# matched = m.group() -# if parser.debug: -# parser.dprint( -# "++ Match '%s' at %d => '%s'" % -# (matched, c_pos, parser.context(len(matched)))) -# parser.position += len(matched) -# if matched: -# return Terminal(self, c_pos, matched, extra_info=m) -# else: -# if parser.debug: -# parser.dprint("-- NoMatch at 
{}".format(c_pos)) -# parser._nm_raise(self, c_pos, parser) +class RegExMatch(Match): + """ + Matches regular expression + """ + + def __init__(self, to_match, rule_name="", ignore_case=True, multiline=None): + super(Match, self).__init__(rule_name=rule_name) + self.to_match = to_match + self.ignore_case = ignore_case + self.multiline = multiline + self.explicit_flags = re.MULTILINE + self.regex = None + + def __eq__(self, other): + if not super().__eq__(other): + return False + + if not isinstance(other, RegExMatch): + return False + + return self.to_match == other.to_match and \ + self.ignore_case == other.ignore_case and \ + self.multiline == other.multiline and \ + self.explicit_flags == other.explicit_flags + + def __hash__(self): + return hash(("RegExMatch", self.to_match, self.ignore_case, self.multiline, self.explicit_flags)) + + def __repr__(self): + text = self.to_match + if not self.ignore_case: + text += "#!ic" + if self.multiline: + text += "#ml" + return self.add_rule_name_if_needed(f"r'{text}'") + + def compile(self): + flags = RegExDef.compile_flags(self.ignore_case, self.multiline, self.explicit_flags) + self.regex = re.compile(self.to_match, flags) + + def _parse(self, parser_helper): + text = parser_helper.get_parsing_text() + + # if parser_helper.debugger.is_enabled(): + # debug_prefix = self.debug_prefix("StrMatch", parser_helper) + # text_debug = text[:12] + "..." 
if len(text) > 12 else text + # debug_text = f"pos={parser_helper.pos}, to_match={self.to_match}, text={text_debug}" + # parser_helper.debug_concept(debug_prefix, raw=f"{CCM['green']}{debug_text}{CCM['reset']}") + + m = self.regex.match(text, parser_helper.token.index) + if m: + matched = m.group() + + # TODO: Add debug info here + + if matched: + # the match is only valid if it fits with the actual tokens + next_pos = parser_helper.get_next_matching_pos(m.end()) + if next_pos is NotFound: + parser_helper.errors.append(NoMatchingTokenError(m.end())) + return None + + node = TerminalNode(self, parser_helper.pos, next_pos - 1, matched, matched) + parser_helper.seek(next_pos - 1) + parser_helper.next_token() + return node + + # if parser_helper.debugger.is_enabled(): + # parser_helper.debug_concept(debug_prefix, raw=f"{CCM['red']}{debug_text}{CCM['reset']}") + return None + class ParsingExpressionVisitor: """ @@ -837,6 +1149,12 @@ class BnfNodeFirstTokenVisitor(ParsingExpressionVisitor): self.add_first_token(pe.to_match) return self.STOP + def visit_RegExMatch(self, pe): + if not pe.to_match: + return + self.add_first_token(RegExDef(pe.to_match, pe.ignore_case, pe.multiline, pe.explicit_flags)) + return self.STOP + def visit_OrderedChoice(self, parsing_expression): for node in parsing_expression.elements: self.visit(node) @@ -917,6 +1235,16 @@ class BnfConceptParserHelper: def get_token(self) -> Token: return self.token + def get_parsing_text(self) -> str: + return self.parser.parser_input.sub_text + + def get_remaining_tokens(self): + return self.parser.parser_input.tokens[self.pos:-1] # do not return the trailing EOF + + def get_last_token_pos(self): + last_token = self.parser.parser_input.tokens[self.parser.parser_input.end] + return self.parser.parser_input.end - 1 if last_token.type == TokenKind.EOF else self.parser.parser_input.end + def next_token(self, skip_whitespace=True): if self.token and self.token.type == TokenKind.EOF: return False @@ -931,6 +1259,21 @@ 
class BnfConceptParserHelper: return self.token.type != TokenKind.EOF + def get_next_matching_pos(self, token_index): + """ + Given the token, tries to find a token (within the remaining tokens) that matches the index + :param token_index: + :return: + """ + current = self.pos + while current <= self.parser.parser_input.end: + if self.parser.parser_input.tokens[current].index == token_index: + return current + current += 1 + + # No matching token + return NotFound + def seek(self, pos): self.pos = pos self.token = self.parser.parser_input.tokens[self.pos] @@ -942,6 +1285,20 @@ class BnfConceptParserHelper: return self.parser.parser_input.pos <= self.pos or self.has_error() def eat_concept(self, concept, token): + def _get_longest_valid_node(multi_node): + res = [] + longest = -1 + for node_res in multi_node.results: + if node_res.node is None or node_res.node.end == -1: + continue + if longest == -1 or node_res.pos == longest: + res.append(node_res.node) + longest = node_res.pos + else: + break + + return None if len(res) == 0 else res[0] if len(res) == 1 else res + if self.is_locked(): return @@ -972,12 +1329,27 @@ class BnfConceptParserHelper: node = parsing_expression.parse(self) if isinstance(node, MultiNode): - # when multiple choices are found, use the longest result - node = node.results[0].node - if node is not None and node.end != -1: + node = _get_longest_valid_node(node) + + if isinstance(node, list): + # multiple results are found. 
+ # add the nodes to the forks + instances = [self] + for i in range(len(node) - 1): + clone = self.clone() + instances.append(clone) + self.forked.append(clone) + + for instance, n in zip(instances, node): + instance.sequence.append(instance.create_concept_node(concept, n)) + instance.pos = n.end + instance.bnf_parsed = True + + elif isinstance(node, ParseTreeNode) and node.end != -1: self.sequence.append(self.create_concept_node(concept, node)) self.pos = node.end self.bnf_parsed = True + else: self.debug.append(("Rewind", token)) self.unrecognized_tokens.add_token(token, self.parser.parser_input.pos) @@ -1004,10 +1376,9 @@ class BnfConceptParserHelper: self.unrecognized_tokens.fix_source() # try to recognize concepts - nodes_sequences = core.builtin_helpers.get_lexer_nodes_from_unrecognized( - self.parser.context, - self.unrecognized_tokens, - PARSERS) + nodes_sequences = core.builtin_helpers.get_lexer_nodes_from_unrecognized(self.parser.context, + self.unrecognized_tokens, + PARSERS) if nodes_sequences: instances = [self] @@ -1034,7 +1405,7 @@ class BnfConceptParserHelper: def clone(self): clone = BnfConceptParserHelper(self.parser, self.debugger) clone.debug = self.debug[:] - self.errors = self.errors[:] + clone.errors = self.errors[:] clone.sequence = self.sequence[:] clone.unrecognized_tokens = self.unrecognized_tokens.clone() @@ -1074,7 +1445,7 @@ class BnfConceptParserHelper: # this cache is to make sure that we return the same concept for the same ConceptExpression _underlying_value_cache = {} - def _add_prop(_concept, prop_name, value): + def _add_compiled(_concept, prop_name, value): """ Adds a new entry, makes a list if the property already exists @@ -1109,8 +1480,7 @@ class BnfConceptParserHelper: return _look_for_concept_match(_underlying.children[0]) def _get_underlying_value(_underlying): - concept_match_node = _look_for_concept_match(_underlying) - if concept_match_node: + if (concept_match_node := _look_for_concept_match(_underlying)) is not 
None: # the value is a concept if id(concept_match_node) in _underlying_value_cache: result = _underlying_value_cache[id(concept_match_node)] @@ -1119,29 +1489,35 @@ class BnfConceptParserHelper: new = sheerka.new_from_template(ref_tpl, ref_tpl.key) result = self.finalize_concept(sheerka, new, concept_match_node.children[0], init_empty_body) _underlying_value_cache[id(concept_match_node)] = result - else: - # the value is a string + elif not hasattr(_underlying, "value") or isinstance(_underlying.value, str): result = DoNotResolve(_underlying.source) + else: + result = _underlying.value return result def _process_rule_name(_concept, _underlying): if _underlying.parsing_expression.rule_name: - value = _get_underlying_value(_underlying) - _add_prop(_concept, _underlying.parsing_expression.rule_name, value) - _concept.get_metadata().need_validation = True + # make sure VariableExpression are only added once + if (not isinstance(_underlying.parsing_expression, VariableExpression) or + _underlying.parsing_expression.rule_name not in _concept.get_compiled()): + var_value = _get_underlying_value(_underlying) + _add_compiled(_concept, _underlying.parsing_expression.rule_name, var_value) + _concept.get_metadata().need_validation = True elif isinstance(_underlying, NonTerminalNode): for child in _underlying.children: _process_rule_name(_concept, child) + # first set the body to something if it is required if init_empty_body and concept.get_metadata().body is None: value = _get_underlying_value(underlying) concept.get_compiled()[ConceptParts.BODY] = value if underlying.parsing_expression.rule_name: - _add_prop(concept, underlying.parsing_expression.rule_name, value) + _add_compiled(concept, underlying.parsing_expression.rule_name, value) # KSI : Why don't we set concept.get_metadata().need_validation to True ? 
+ # then recursively browse children to update concept variables if isinstance(underlying, NonTerminalNode) and not isinstance(underlying.parsing_expression, ConceptExpression): for node in underlying.children: _process_rule_name(concept, node) @@ -1150,7 +1526,7 @@ class BnfConceptParserHelper: def get_node_value(self, node): """ - Try to evaluate the value of a given LexerNode (TerminalNode or NonTerminalNode) + Try to evaluate the value of a given ParseTreeNode (TerminalNode or NonTerminalNode) :param node: :return: """ @@ -1205,21 +1581,6 @@ class BnfNodeParser(BaseNodeParser): """ return concept.get_metadata().definition_type == DEFINITION_TYPE_BNF - @staticmethod - def get_valid(parsers_helpers): - - valid_parser_helpers = [] - for parser_helper in parsers_helpers: - if not parser_helper.bnf_parsed or parser_helper.has_error(): - continue - - if parser_helper in valid_parser_helpers: - continue - - valid_parser_helpers.append(parser_helper) - - return valid_parser_helpers - @staticmethod def get_expression_from_concept_name(name): """ @@ -1248,6 +1609,26 @@ class BnfNodeParser(BaseNodeParser): return res[0] if len(res) == 1 else Sequence(*res) + def get_valid(self, parsers_helpers): + bnf_found = False + valid_parser_helpers = [] + for parser_helper in parsers_helpers: + if parser_helper.bnf_parsed: + bnf_found = True + + if parser_helper.has_error(): + self.error_sink.extend(parser_helper.errors) + + if not parser_helper.bnf_parsed or parser_helper.has_error(): + continue + + if parser_helper in valid_parser_helpers: + continue + + valid_parser_helpers.append(parser_helper) + + return valid_parser_helpers if bnf_found else None + def get_concepts_sequences(self, context): """ Main method that parses the tokens and extract the concepts @@ -1273,6 +1654,14 @@ class BnfNodeParser(BaseNodeParser): return by_end_pos[max(by_end_pos)] + def _merge(list1, list2): + if not list1: + return list2 + if not list2: + return list1 + + return list1 + list2 + forked = [] 
debugger = context.get_debugger(self.NAME, "parse") debugger.debug_entering(source=self.parser_input.as_text()) @@ -1285,13 +1674,21 @@ debug_prefix = f"pos={self.parser_input.pos}, {token=}, {len(concept_parser_helpers)} parser(s)" try: + # KSI 2021-02-13. I am not quite sure of the reason why we want to stop the processing + # if all the parsers are locked. + # It means that if we have two concepts 'foo bar baz' and 'bar baz' + # we are going to miss the sequence [UTN('foo'), CN('bar baz')] + # ... + not_locked = [p for p in concept_parser_helpers if not p.is_locked()] if len(not_locked) == 0: if debugger.is_enabled(): debugger.debug_log(debug_prefix + ", all parsers are locked. Nothing to do.") continue - concepts = context.sheerka.get_concepts_by_first_token(token, self._is_eligible, strip_quotes=False) + by_token = context.sheerka.get_concepts_by_first_token(token, self._is_eligible, strip_quotes=False) + by_regex = context.sheerka.get_concepts_by_first_regex(self.parser_input.sub_text, token.index) + + concepts = _merge(by_token, by_regex) if not concepts: if debugger.is_enabled(): @@ -1563,21 +1960,43 @@ if ret.ignore_case is None: ret.ignore_case = self.ignore_case - elif isinstance(expression, Sequence) or \ - isinstance(expression, OrderedChoice) or \ - isinstance(expression, UnOrderedChoice) or \ - isinstance(expression, ZeroOrMore) or \ - isinstance(expression, OneOrMore) or \ - isinstance(expression, Optional): + elif isinstance(expression, RegExMatch): + # Regular expressions are not compiled yet + # to support global settings propagation from + # parser. 
+ ret = expression + if ret.ignore_case is None: + ret.ignore_case = self.ignore_case + ret.compile() + + elif isinstance(expression, (Sequence, OrderedChoice, UnOrderedChoice, ZeroOrMore, OneOrMore, Optional)): ret = expression ret.nodes = [] for e in ret.elements: - pe = self.resolve_parsing_expression(context, e, grammar, to_skip, to_update) - if not isinstance(pe, (ParsingExpression, UnderConstruction)): - return pe # an error is detected, escalate it - if isinstance(pe, UnderConstruction): - to_update.add(ToUpdate(id(expression), ret)) - ret.nodes.append(pe) + if not isinstance(e, VariableExpression): + pe = self.resolve_parsing_expression(context, e, grammar, to_skip, to_update) + if not isinstance(pe, (ParsingExpression, UnderConstruction)): + return pe # an error is detected, escalate it + if isinstance(pe, UnderConstruction): + to_update.add(ToUpdate(id(expression), ret)) + ret.nodes.append(pe) + else: + ret.nodes.append(e) + + # manage VariableExpression + start_node = None # first non VariableExpression node + variable_expr_nodes = [] + for i, e in enumerate(ret.nodes): + if isinstance(e, VariableExpression): + variable_expr_nodes.append(e) + e.before_first_token_node = start_node is None + if i < len(ret.nodes) - 1: + e.nodes.append(ret.nodes[i + 1]) + else: + start_node = e + + for variable_expr in variable_expr_nodes: + variable_expr.init_parsing() else: ret = self.add_error(GrammarErrorNode(f"Unrecognized grammar element '{expression}'."), False) @@ -1630,18 +2049,18 @@ class BnfNodeParser(BaseNodeParser): sequences = self.get_concepts_sequences(context) valid_parser_helpers = self.get_valid(sequences) if valid_parser_helpers is None: + return self.sheerka.ret( + self.name, + False, + context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input.as_text(), reason=self.error_sink)) + + if len(valid_parser_helpers) == 0: # token error return self.sheerka.ret( self.name, False, context.sheerka.new(BuiltinConcepts.ERROR, body=self.error_sink)) - if 
len(valid_parser_helpers) == 0: - return self.sheerka.ret( - self.name, - False, - context.sheerka.new(BuiltinConcepts.NOT_FOR_ME, body=parser_input.as_text())) - ret = [] for parser_helper in valid_parser_helpers: ret.append( diff --git a/src/parsers/PythonParser.py b/src/parsers/PythonParser.py index 6262356..97a9783 100644 --- a/src/parsers/PythonParser.py +++ b/src/parsers/PythonParser.py @@ -30,6 +30,10 @@ class PythonErrorNode(ParsingError): @dataclass() class ConceptDetectedError(ParsingError): + """ + When the Python parser finds an identifier, and that identifier is a concept + So it's not for the PythonParser to respond + """ name: str diff --git a/src/parsers/SyaNodeParser.py b/src/parsers/SyaNodeParser.py index 18172ca..7f7aabe 100644 --- a/src/parsers/SyaNodeParser.py +++ b/src/parsers/SyaNodeParser.py @@ -1142,6 +1142,8 @@ class SyaNodeParser(BaseNodeParser): if sya_definitions: self.test_only_sya_definitions = sya_definitions + return self + @staticmethod def _is_eligible(concept): """ diff --git a/src/sheerkapickle/SheerkaUnpickler.py b/src/sheerkapickle/SheerkaUnpickler.py index cb150f4..ae2d0f7 100644 --- a/src/sheerkapickle/SheerkaUnpickler.py +++ b/src/sheerkapickle/SheerkaUnpickler.py @@ -1,7 +1,7 @@ import json import core.utils -from core.global_symbols import NotInit, NotFound, Removed +from core.global_symbols import NotInit, NotFound, Removed, NoFirstToken from sheerkapickle import tags, utils, handlers @@ -54,6 +54,8 @@ class SheerkaUnpickler: instance = NotFound elif obj[tags.CUSTOM] == Removed.value: instance = Removed + elif obj[tags.CUSTOM] == NoFirstToken.value: + instance = NoFirstToken else: raise KeyError(f"unknown {obj[tags.CUSTOM]}") diff --git a/tests/core/test_SheerkaConceptManager.py b/tests/core/test_SheerkaConceptManager.py index c26dcdf..2e6faa2 100644 --- a/tests/core/test_SheerkaConceptManager.py +++ b/tests/core/test_SheerkaConceptManager.py @@ -8,7 +8,8 @@ from core.concept import PROPERTIES_TO_SERIALIZE, Concept, 
DEFINITION_TYPE_DEF, from core.global_symbols import NotInit, NotFound from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager, NoModificationFound, ForbiddenAttribute, \ UnknownAttribute, CannotRemoveMeta, ValueNotFound, ConceptIsReferenced, NoFirstTokenError -from parsers.BnfNodeParser import Sequence, StrMatch, ConceptExpression, OrderedChoice, Optional, ZeroOrMore, OneOrMore +from parsers.BnfNodeParser import Sequence, StrMatch, ConceptExpression, OrderedChoice, Optional, ZeroOrMore, OneOrMore, \ + RegExDef, RegExMatch from tests.TestUsingFileBasedSheerka import TestUsingFileBasedSheerka from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka @@ -60,6 +61,50 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.om.current_sdp().exists(service.CONCEPTS_BY_HASH_ENTRY, concept.get_definition_hash()) assert sheerka.om.current_sdp().exists(service.CONCEPTS_BY_FIRST_KEYWORD_ENTRY, "+") + def test_i_can_create_a_bnf_concept_that_starts_with_a_regex(self): + sheerka = self.get_sheerka(cache_only=False) + context = self.get_context(sheerka) + service = sheerka.services[SheerkaConceptManager.NAME] + foo = self.bnf_concept("foo", RegExMatch("[a-z]+")) + bar = self.bnf_concept("bar", RegExMatch("[0-9]+")) + + res = sheerka.create_new_concept(context, foo) + + assert res.status + assert sheerka.isinstance(res.value, BuiltinConcepts.NEW_CONCEPT) + + # I can get by the first regex + assert sheerka.om.get(service.CONCEPTS_BY_REGEX_ENTRY, RegExDef("[a-z]+").serialize()) == [foo.id] + assert len(service.compiled_concepts_by_regex) == 1 + + # I can commit + sheerka.om.commit(context) + + # I can load from DB + entry = sheerka.om.current_sdp().get(service.CONCEPTS_BY_REGEX_ENTRY) + assert entry == {RegExDef("[a-z]+").serialize(): [foo.id]} + + # I can create another concept + res = sheerka.create_new_concept(context, bar) + + assert res.status + assert sheerka.isinstance(res.value, BuiltinConcepts.NEW_CONCEPT) + + # 
I can get by the first regex + assert sheerka.om.get(service.CONCEPTS_BY_REGEX_ENTRY, RegExDef("[0-9]+").serialize()) == [bar.id] + assert sheerka.om.get(service.CONCEPTS_BY_REGEX_ENTRY, RegExDef("[a-z]+").serialize()) == [foo.id] + assert len(service.compiled_concepts_by_regex) == 2 + + # I can commit + sheerka.om.commit(context) + + # I can load from DB + entry = sheerka.om.current_sdp().get(service.CONCEPTS_BY_REGEX_ENTRY) + assert entry == { + RegExDef("[a-z]+").serialize(): [foo.id], + RegExDef("[0-9]+").serialize(): [bar.id] + } + def test_i_cannot_create_a_bnf_concept_that_references_a_concept_that_cannot_be_resolved(self): sheerka, context, one_1, one_1_0 = self.init_concepts(Concept("one", body="1"), Concept("one", body="1.0")) twenty_one = Concept("twenty one", definition="'twenty' one", definition_type=DEFINITION_TYPE_BNF) @@ -361,17 +406,16 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): # sdp is updated sheerka.om.commit(context) - from_sdp = sheerka.om.current_sdp().get(service.CONCEPTS_BY_ID_ENTRY, new_concept.id) + sdp = sheerka.om.current_sdp() + from_sdp = sdp.get(service.CONCEPTS_BY_ID_ENTRY, new_concept.id) assert from_sdp.get_metadata().body == "metadata value" assert from_sdp.get_metadata().variables == [("var_name", "default value")] assert from_sdp.get_prop(BuiltinConcepts.ISA) == {bar} - assert sheerka.om.current_sdp().get(service.CONCEPTS_BY_NAME_ENTRY, - new_concept.name).get_metadata().body == "metadata value" - assert sheerka.om.current_sdp().get(service.CONCEPTS_BY_KEY_ENTRY, - new_concept.key).get_metadata().body == "metadata value" - assert sheerka.om.current_sdp().get(service.CONCEPTS_BY_HASH_ENTRY, - new_concept.get_definition_hash()).get_metadata().body == "metadata value" + assert sdp.get(service.CONCEPTS_BY_NAME_ENTRY, new_concept.name).get_metadata().body == "metadata value" + assert sdp.get(service.CONCEPTS_BY_KEY_ENTRY, new_concept.key).get_metadata().body == "metadata value" + assert 
sdp.get(service.CONCEPTS_BY_HASH_ENTRY, + new_concept.get_definition_hash()).get_metadata().body == "metadata value" def test_caches_are_update_when_i_modify_the_name(self): sheerka, context, foo = self.init_concepts("foo", cache_only=False) @@ -496,6 +540,7 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): Concept("baz", definition="foo"), create_new=True).unpack() + # sanity check assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { "foo": ["1001"], "bar": ["1002"], @@ -514,6 +559,71 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { 'bar': ['1002', '1001', '1003']} + def test_i_can_modify_bnf_definition_from_first_token_to_first_regex(self): + sheerka, context, foo, = self.init_test().with_concepts( + Concept("foo", definition="'hello'|'hola'"), create_new=True).unpack() + service = sheerka.services[SheerkaConceptManager.NAME] + + # sanity + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { + "hello": ["1001"], + "hola": ["1001"]} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == {} + assert len(service.compiled_concepts_by_regex) == 0 + + to_add = {"meta": {"definition": "r'[a-z]+'"}} + res = sheerka.modify_concept(context, foo, to_add) + + assert res.status + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == { + RegExDef("[a-z]+").serialize(): ["1001"] + } + assert len(service.compiled_concepts_by_regex) == 1 + + def test_i_can_modify_bnf_definition_from_first_regex_to_first_token(self): + sheerka, context, foo, = self.init_test().with_concepts( + Concept("foo", definition="r'[a-z]+'"), create_new=True).unpack() + service = sheerka.services[SheerkaConceptManager.NAME] + + # sanity + assert 
sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == { + RegExDef("[a-z]+").serialize(): ["1001"] + } + assert len(service.compiled_concepts_by_regex) == 1 + + to_add = {"meta": {"definition": "'hello'|'hola'"}} + res = sheerka.modify_concept(context, foo, to_add) + + assert res.status + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { + "hello": ["1001"], + "hola": ["1001"]} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == {} + assert len(service.compiled_concepts_by_regex) == 0 + + def test_i_can_modify_when_multiple_bnf_definitions_are_already_defined(self): + sheerka, context, foo, bar, baz = self.init_test().with_concepts( + Concept("foo", definition="r'[a-z]+'"), + Concept("bar", definition="r'[0-1]+'"), + Concept("baz", definition="'one'|'twox'"), create_new=True).unpack() + service = sheerka.services[SheerkaConceptManager.NAME] + # it does not matter that baz is a bnf + + to_add = {"meta": {"definition": "'one'|'two'"}} + res = sheerka.modify_concept(context, baz, to_add) + + assert res.status + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { + "one": ["1003"], + "two": ["1003"]} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == { + RegExDef("[a-z]+").serialize(): ["1001"], + RegExDef("[0-1]+").serialize(): ["1002"], + } + assert len(service.compiled_concepts_by_regex) == 2 + def test_references_are_updated_after_concept_modification(self): sheerka, context, one, twenty_one = self.init_test().with_concepts( "onz", @@ -602,7 +712,7 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.isinstance(res.body, BuiltinConcepts.ERROR) assert res.body.body == NoModificationFound(foo, {"name": "foo", "body": "a body"}) - def test_i_cannot_remove_meta_attributes(self): + def 
test_i_cannot_modify_and_remove_meta_attributes(self): sheerka, context, foo = self.init_concepts(Concept("foo")) res = sheerka.modify_concept(context, foo, to_remove={"meta": {"any_value": "foo"}}) @@ -611,7 +721,7 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.isinstance(res.body, BuiltinConcepts.ERROR) assert res.body.body == CannotRemoveMeta({"any_value": "foo"}) - def test_i_cannot_remove_props_that_does_not_exists(self): + def test_i_cannot_modify_and_remove_props_that_does_not_exists(self): sheerka, context, foo = self.init_concepts(Concept("foo")) res = sheerka.modify_concept(context, foo, to_remove={"props": {"any_value": "foo"}}) @@ -620,7 +730,7 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.isinstance(res.body, BuiltinConcepts.ERROR) assert res.body.body == UnknownAttribute("any_value") - def test_i_cannot_remove_props_value_that_does_not_exists(self): + def test_i_cannot_modify_and_remove_props_value_that_does_not_exists(self): # Need to returns an error, otherwise, we will save a concept that is not modified sheerka, context, foo = self.init_concepts(Concept("foo", props={"a": {"value"}})) @@ -630,7 +740,7 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.isinstance(res.body, BuiltinConcepts.ERROR) assert res.body.body == ValueNotFound("a", "dummy") - def test_i_cannot_remove_variable_that_does_not_exists(self): + def test_i_cannot_modify_and_remove_variable_that_does_not_exists(self): sheerka, context, foo = self.init_concepts(Concept("foo").def_var("a")) res = sheerka.modify_concept(context, foo, to_remove={"variables": ["b"]}) @@ -649,6 +759,30 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert not res.status assert sheerka.isinstance(res.body, BuiltinConcepts.UNKNOWN_CONCEPT) + def test_i_cannot_modify_with_an_invalid_regex_expression(self): + sheerka, context, foo, = self.init_test().with_concepts( + Concept("foo", 
definition="'hello'|'hola'"), create_new=True).unpack() + service = sheerka.services[SheerkaConceptManager.NAME] + + # sanity + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { + "hello": ["1001"], + "hola": ["1001"]} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == {} + assert len(service.compiled_concepts_by_regex) == 0 + + to_add = {"meta": {"definition": "r'[a-z+'"}} # invalid regex definition + res = sheerka.modify_concept(context, foo, to_add) + + assert not res.status + assert sheerka.isinstance(res.body, BuiltinConcepts.ERROR) + assert res.body.body.msg == 'unterminated character set' + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == { + "hello": ["1001"], + "hola": ["1001"]} + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == {} + assert len(service.compiled_concepts_by_regex) == 0 + def test_i_can_get_and_set_attribute(self): sheerka, context = self.init_concepts() foo = Concept("foo") @@ -683,6 +817,8 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.get_by_name(one.name) == one assert sheerka.get_by_key(one.key) == one assert sheerka.get_by_hash(one.get_definition_hash()) == one + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) != {} + assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) != {} res = sheerka.remove_concept(context, one) @@ -694,6 +830,35 @@ class TestSheerkaConceptManager(TestUsingMemoryBasedSheerka): assert sheerka.isinstance(sheerka.get_by_key(one.key), BuiltinConcepts.UNKNOWN_CONCEPT) assert sheerka.isinstance(sheerka.get_by_hash(one.get_definition_hash()), BuiltinConcepts.UNKNOWN_CONCEPT) + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} + assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} + + def test_i_can_remove_a_first_regex_concept(self): + 
sheerka, context, one = self.init_test().with_concepts( + Concept("one", definition="r'[a-z]+'"), + create_new=True).unpack() + service = sheerka.services[SheerkaConceptManager.NAME] + + # sanity check + assert sheerka.get_by_id(one.id) == one + assert sheerka.get_by_name(one.name) == one + assert sheerka.get_by_key(one.key) == one + assert sheerka.get_by_hash(one.get_definition_hash()) == one + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) != {} + assert len(service.compiled_concepts_by_regex) != 0 + + res = sheerka.remove_concept(context, one) + + assert res.status + assert sheerka.isinstance(res.body, BuiltinConcepts.SUCCESS) + + assert sheerka.isinstance(sheerka.get_by_id(one.id), BuiltinConcepts.UNKNOWN_CONCEPT) + assert sheerka.isinstance(sheerka.get_by_name(one.name), BuiltinConcepts.UNKNOWN_CONCEPT) + assert sheerka.isinstance(sheerka.get_by_key(one.key), BuiltinConcepts.UNKNOWN_CONCEPT) + assert sheerka.isinstance(sheerka.get_by_hash(one.get_definition_hash()), BuiltinConcepts.UNKNOWN_CONCEPT) + assert sheerka.om.copy(SheerkaConceptManager.CONCEPTS_BY_REGEX_ENTRY) == {} + assert len(service.compiled_concepts_by_regex) == 0 + def test_i_cannot_remove_a_concept_that_does_not_exist(self): sheerka, context = self.init_concepts() one = Concept("one", id="1001") diff --git a/tests/core/test_sheerka.py b/tests/core/test_sheerka.py index d3885d8..81b0298 100644 --- a/tests/core/test_sheerka.py +++ b/tests/core/test_sheerka.py @@ -73,7 +73,7 @@ class TestSheerkaUsingMemoryBasedSheerka(TestUsingMemoryBasedSheerka): assert loaded is not None assert sheerka.isinstance(loaded, BuiltinConcepts.UNKNOWN_CONCEPT) - assert loaded.body == ("key", "key_that_does_not_exist") + assert loaded.body == {"key": "key_that_does_not_exist"} assert loaded.get_metadata().is_evaluated def test_i_cannot_get_when_id_is_not_found(self): @@ -83,7 +83,7 @@ class TestSheerkaUsingMemoryBasedSheerka(TestUsingMemoryBasedSheerka): assert loaded is not None assert 
sheerka.isinstance(loaded, BuiltinConcepts.UNKNOWN_CONCEPT) - assert loaded.body == ("id", "id_that_does_not_exist") + assert loaded.body == {"id": "id_that_does_not_exist"} assert loaded.get_metadata().is_evaluated def test_i_can_instantiate_a_builtin_concept_when_it_has_its_own_class(self): @@ -200,7 +200,7 @@ class TestSheerkaUsingMemoryBasedSheerka(TestUsingMemoryBasedSheerka): new = sheerka.new("fake_concept") assert sheerka.isinstance(new, BuiltinConcepts.UNKNOWN_CONCEPT) - assert new.body == ('key', 'fake_concept') + assert new.body == {'key': 'fake_concept'} def test_i_cannot_instantiate_with_invalid_id(self): sheerka, context, *concepts = self.init_test().with_concepts(Concept("foo", body="foo1"), @@ -210,7 +210,7 @@ class TestSheerkaUsingMemoryBasedSheerka(TestUsingMemoryBasedSheerka): new = sheerka.new(("foo", "invalid_id")) assert sheerka.isinstance(new, BuiltinConcepts.UNKNOWN_CONCEPT) - assert new.body == [('key', 'foo'), ('id', 'invalid_id')] + assert new.body == {'key': 'foo', 'id': 'invalid_id'} def test_i_cannot_instantiate_with_invalid_key(self): sheerka, context, *concepts = self.init_test().with_concepts(Concept("foo", body="foo1"), @@ -220,7 +220,7 @@ class TestSheerkaUsingMemoryBasedSheerka(TestUsingMemoryBasedSheerka): new = sheerka.new(("invalid_key", "1001")) assert sheerka.isinstance(new, BuiltinConcepts.UNKNOWN_CONCEPT) - assert new.body == [('key', 'invalid_key'), ('id', '1001')] + assert new.body == {'key': 'invalid_key', 'id': '1001'} def test_concept_id_is_irrelevant_when_only_one_concept(self): sheerka, context, *concepts = self.init_test().with_concepts(Concept("foo", body="foo1"), diff --git a/tests/core/test_tokenizer.py b/tests/core/test_tokenizer.py index 1bd707c..c98cc91 100644 --- a/tests/core/test_tokenizer.py +++ b/tests/core/test_tokenizer.py @@ -1,4 +1,5 @@ import pytest + from core.tokenizer import Tokenizer, Token, TokenKind, LexerError @@ -172,6 +173,7 @@ def test_i_can_parse_concept_token(text, expected): assert 
tokens[0].type == TokenKind.CONCEPT assert tokens[0].value == expected + @pytest.mark.parametrize("text, expected", [ ("r:key:", ("key", None)), ("r:key|id:", ("key", "id")), @@ -197,3 +199,6 @@ def test_i_can_parse_regex_token(text, expected): assert tokens[0].type == TokenKind.REGEX assert tokens[0].value == expected + assert tokens[0].str_value == "r" + expected + assert tokens[0].repr_value == "r" + expected + assert tokens[0].strip_quote == expected[1:-1] diff --git a/tests/evaluators/EvaluatorTestsUtils.py b/tests/evaluators/EvaluatorTestsUtils.py index abb455b..2230b34 100644 --- a/tests/evaluators/EvaluatorTestsUtils.py +++ b/tests/evaluators/EvaluatorTestsUtils.py @@ -58,7 +58,7 @@ def pr_ret_val(value, parser="parser", source=None): def python_ret_val(source): - python_node = PythonNode(source, ast.parse(source, f"", 'eval')) + python_node = PythonNode(source.strip(), ast.parse(source.strip(), f"", 'eval')) return pr_ret_val(python_node, parser="Python", source=source) diff --git a/tests/evaluators/test_DefConceptEvaluator.py b/tests/evaluators/test_DefConceptEvaluator.py index 78b920d..1df9768 100644 --- a/tests/evaluators/test_DefConceptEvaluator.py +++ b/tests/evaluators/test_DefConceptEvaluator.py @@ -4,12 +4,13 @@ import pytest from core.builtin_concepts import ReturnValueConcept, ParserResultConcept, BuiltinConcepts from core.concept import VARIABLE_PREFIX, Concept, DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF +from core.sheerka.services.SheerkaConceptManager import NoFirstTokenError from core.sheerka.services.SheerkaExecute import ParserInput from core.tokenizer import Tokenizer from evaluators.DefConceptEvaluator import DefConceptEvaluator from parsers.BaseParser import BaseParser from parsers.BnfDefinitionParser import BnfDefinitionParser -from parsers.BnfNodeParser import Sequence, StrMatch, ZeroOrMore, ConceptExpression +from parsers.BnfNodeParser import Sequence, StrMatch, ZeroOrMore, ConceptExpression, VariableExpression from 
parsers.DefConceptParser import DefConceptNode, NameNode, DefConceptParser from parsers.PythonParser import PythonNode, PythonParser from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka @@ -277,3 +278,55 @@ class TestDefConceptEvaluator(TestUsingMemoryBasedSheerka): assert evaluated.status assert evaluated.body.body.key == "foo2 __var__0" assert evaluated.body.body.get_metadata().variables == [("x", None)] + + def test_i_can_eval_when_bnf_concept_with_regex(self): + context = self.get_context() + def_ret_val = DefConceptParser().parse(context, ParserInput("def concept hello a from bnf r'[a-z]+'=a 'hello'")) + evaluated = DefConceptEvaluator().eval(context, def_ret_val) + + assert evaluated.status + assert context.sheerka.isinstance(evaluated.body, BuiltinConcepts.NEW_CONCEPT) + + created_concept = evaluated.body.body + assert created_concept.get_metadata().name == "hello a" + assert created_concept.get_metadata().key == "hello __var__0" + assert created_concept.get_metadata().definition == "r'[a-z]+'=a 'hello'" + assert created_concept.get_metadata().definition_type == "bnf" + + def test_i_can_eval_when_bnf_concept_with_variable(self): + context = self.get_context() + def_ret_val = DefConceptParser().parse(context, ParserInput("def concept hello x from bnf 'hello' x")) + evaluated = DefConceptEvaluator().eval(context, def_ret_val) + + assert evaluated.status + assert context.sheerka.isinstance(evaluated.body, BuiltinConcepts.NEW_CONCEPT) + + created_concept = evaluated.body.body + assert created_concept.get_metadata().name == "hello x" + assert created_concept.get_metadata().key == "hello __var__0" + assert created_concept.get_metadata().definition == "'hello' x" + assert created_concept.get_metadata().definition_type == "bnf" + assert created_concept.get_metadata().variables == [("x", None)] + assert created_concept._bnf == Sequence(StrMatch("hello"), VariableExpression("x")) + + def test_i_cannot_eval_bnf_concept_with_unknown_variable(self): 
+ context = self.get_context() + def_ret_val = DefConceptParser().parse(context, ParserInput("def concept name from bnf unknown foo")) + evaluated = DefConceptEvaluator().eval(context, def_ret_val) + + assert not evaluated.status + assert context.sheerka.isinstance(evaluated.body, BuiltinConcepts.ERROR) + unknown_concepts = [ + context.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body={"name": "foo"}), + context.sheerka.new(BuiltinConcepts.UNKNOWN_CONCEPT, body={"name": "unknown"}), + ] + assert evaluated.body.body == unknown_concepts + + def test_i_cannot_eval_bnf_concept_with_only_variable(self): + sheerka, context = self.init_test().unpack() + def_ret_val = DefConceptParser().parse(context, ParserInput("def concept foo x from bnf x")) + evaluated = DefConceptEvaluator().eval(context, def_ret_val) + + assert not evaluated.status + assert sheerka.isinstance(evaluated.body, BuiltinConcepts.ERROR) + assert isinstance(evaluated.body.body, NoFirstTokenError) diff --git a/tests/non_reg/test_sheerka_non_reg.py b/tests/non_reg/test_sheerka_non_reg.py index b8ba5d7..bced11b 100644 --- a/tests/non_reg/test_sheerka_non_reg.py +++ b/tests/non_reg/test_sheerka_non_reg.py @@ -1237,6 +1237,18 @@ as: assert len(l) > 0 sheerka.save_execution_context = False + def test_i_can_define_a_regex_concept_and_parse_it(self): + sheerka, context = self.init_test().unpack() + + res = sheerka.evaluate_user_input("def concept binary from bnf r'[01]+'") + assert len(res) == 1 + assert res[0].status + + res = sheerka.evaluate_user_input("01001") + assert len(res) == 1 + assert res[0].status + assert sheerka.isinstance(res[0].body, "binary") + class TestSheerkaNonRegFile(TestUsingFileBasedSheerka): def test_i_can_def_several_concepts(self): diff --git a/tests/parsers/parsers_utils.py b/tests/parsers/parsers_utils.py index c97c8d9..5790d0b 100644 --- a/tests/parsers/parsers_utils.py +++ b/tests/parsers/parsers_utils.py @@ -1,6 +1,7 @@ import ast from dataclasses import dataclass +from 
core.builtin_concepts import ReturnValueConcept from core.builtin_helpers import CreateObjectIdentifiers from core.concept import CC, Concept, ConceptParts, DoNotResolve, CIO, CMV from core.tokenizer import Tokenizer, TokenKind, Token @@ -256,13 +257,17 @@ def get_node( if sub_expr == "')'": return ")" + if isinstance(sub_expr, ReturnValueConcept): + return sub_expr + if isinstance(sub_expr, (scnode, utnode, DoNotResolve)): return sub_expr if isinstance(sub_expr, CIO): sub_expr.set_concept(concepts_map[sub_expr.concept_name]) - if sub_expr.source: - node = get_node(concepts_map, expression_as_tokens, sub_expr.source, sya=sya) + source = sub_expr.source or sub_expr.concept_name + if source: + node = get_node(concepts_map, expression_as_tokens, source, sya=sya) sub_expr.start = node.start sub_expr.end = node.end return sub_expr @@ -366,7 +371,7 @@ def get_node( return CN(concept_found, start, start + length - 1, source=sub_expr) else: # else an UnrecognizedTokensNode - return utnode(start, start + length - 1, sub_expr) + return UTN(sub_expr, start, start + length - 1) def init_body(item, concept, value): @@ -482,3 +487,29 @@ def get_rete_conditions(*conditions_as_string): res.append(Condition(identifier, attribute, value)) return AndConditions(res) + + +def get_test_obj(test_obj, real_obj, to_compare_delegate=None): + """ + From a production object (Concept, ConceptNode, ....) + Create a test object (CNC, CC ...) that can be used to validate the unit tests + :param test_obj: + :param real_obj: + :param to_compare_delegate: + :return: + """ + if isinstance(test_obj, list): + if len(test_obj) != len(real_obj): + raise Exception(f"Not the same size ! {test_obj=}, {real_obj=}") + return [get_test_obj(t, r) for t, r in zip(test_obj, real_obj)] + + if isinstance(test_obj, dict): + if len(test_obj) != len(real_obj): + raise Exception(f"Not the same size ! 
{test_obj=}, {real_obj=}") + + return {k: get_test_obj(v, real_obj[k]) for k, v in test_obj.items()} + + if not hasattr(test_obj, "to_compare"): + return real_obj + + return test_obj.to_compare(real_obj, get_test_obj) diff --git a/tests/parsers/test_BnfNodeParser.py b/tests/parsers/test_BnfNodeParser.py index e65cf6e..9338b15 100644 --- a/tests/parsers/test_BnfNodeParser.py +++ b/tests/parsers/test_BnfNodeParser.py @@ -1,3 +1,5 @@ +import re + import pytest import tests.parsers.parsers_utils @@ -6,12 +8,14 @@ from core.concept import Concept, ConceptParts, DoNotResolve, CC, DEFINITION_TYP from core.global_symbols import NotInit from core.sheerka.services.SheerkaConceptManager import SheerkaConceptManager from core.sheerka.services.SheerkaExecute import ParserInput -from parsers.BaseNodeParser import CNC, UTN, CN +from parsers.BaseNodeParser import CNC, UTN, CN, NoMatchingTokenError, SCN from parsers.BnfDefinitionParser import BnfDefinitionParser from parsers.BnfNodeParser import StrMatch, TerminalNode, NonTerminalNode, Sequence, OrderedChoice, \ - Optional, ZeroOrMore, OneOrMore, ConceptExpression, UnOrderedChoice, BnfNodeParser + Optional, ZeroOrMore, OneOrMore, ConceptExpression, UnOrderedChoice, BnfNodeParser, RegExMatch, \ + BnfNodeFirstTokenVisitor, Match, RegExDef, VariableExpression from tests.BaseTest import BaseTest from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka +from tests.evaluators.EvaluatorTestsUtils import python_ret_val cmap = { "one": Concept("one"), @@ -42,9 +46,14 @@ cmap = { "three_four": Concept("three_four", definition="three | four").def_var("three").def_var("four"), "t2": Concept("t2", definition="'twenty' three_four=unit").def_var("unit").def_var("three").def_var("four"), + # bnf with variable + "one thing": Concept("one x", definition="one x").def_var("x"), + "x shoe": Concept("x shoe", definition="x 'shoe'").def_var("x"), + # testing keywords "def_only": Concept("def"), "def number": Concept("def number", 
definition="def (one|two)=number"), + # sequence of keywords using bnf definition # "def_concept_bnf": Concept("def_concept_bnf", definition="'def' 'concept'"), # "def concept_bnf number": Concept("def number", definition="def_concept_bnf (one|two)=number"), @@ -68,8 +77,8 @@ def u(parsing_expression, start, end, children=None): if isinstance(parsing_expression, str): parsing_expression = StrMatch(parsing_expression) - if isinstance(parsing_expression, StrMatch): - return TerminalNode(parsing_expression, start, end, parsing_expression.to_match) + if isinstance(parsing_expression, Match): + return TerminalNode(parsing_expression, start, end, parsing_expression.to_match, parsing_expression.to_match) return NonTerminalNode(parsing_expression, start, end, [], children) @@ -105,7 +114,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): sheerka.set_isa(context, cmap["one hundred"], cmap["number"]) sheerka.set_isa(context, cmap["hundreds"], cmap["number"]) - # Pay attention. 'twenties (t1 and t2) are not set as number + # Pay attention. 
'twenties (t1 and t2) are not set as 'number' thirties = cls.update_bnf(context, Concept("thirties", definition="thirty number", @@ -158,7 +167,10 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): for i, pair in enumerate(my_concepts_map): my_concepts_map[pair] = updated[i] - parser = BnfNodeParser(sheerka=sheerka) if init_from_sheerka else BnfNodeParser() + if init_from_sheerka: + parser = BnfNodeParser(sheerka=sheerka) + else: + parser = BnfNodeParser().init_from_concepts(context, my_concepts_map.values()) return sheerka, context, parser def validate_get_concepts_sequences(self, my_map, text, expected, multiple_result=False, post_init_concepts=None): @@ -198,7 +210,9 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): assert len(bnf_parsers_helpers) == len(expected_array) for parser_helper, expected_sequence in zip(bnf_parsers_helpers, expected_array): - assert parser_helper.sequence == expected_sequence + to_compare = tests.parsers.parsers_utils.get_test_obj(expected_sequence, parser_helper.sequence) + # assert parser_helper.sequence == expected_sequence + assert to_compare == expected_sequence if len(bnf_parsers_helpers) == 1: return bnf_parsers_helpers[0].sequence @@ -221,7 +235,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): (StrMatch("3.14"), "3.14"), (StrMatch("+"), "+"), ]) - def test_i_can_match_simple_bnf(self, expr, text): + def test_i_can_match_str_bnf(self, expr, text): my_map = { text: self.bnf_concept("foo", expr) } @@ -229,6 +243,57 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): sequence = self.validate_get_concepts_sequences(my_map, text, [text]) assert sequence[0].underlying == u(expr, 0, 0) + @pytest.mark.parametrize("expr, text, end", [ + (RegExMatch("bar"), "bar", 0), + (RegExMatch("[a-z]+"), "xyz", 0), + (RegExMatch("[a-z=]+"), "uvt=xyz=abc", 4), + ]) + def test_i_can_match_regex_bnf(self, expr, text, end): + my_map = { + text: self.bnf_concept("foo", expr) + } + + sequence = 
self.validate_get_concepts_sequences(my_map, text, [text]) + assert sequence[0].underlying.start == 0 + assert sequence[0].underlying.end == end + assert sequence[0].underlying.parsing_expression == expr + + @pytest.mark.parametrize("expr, text, end", [ + (Sequence(StrMatch("foo"), RegExMatch("bar")), "foo bar", 2), + (Sequence(StrMatch("foo"), RegExMatch("[a-z]+")), "foo xyz", 2), + (Sequence(StrMatch("foo"), RegExMatch("[a-z=]+")), "foo uvt=xyz=abc", 6), + ]) + def test_i_can_match_sequence_str_regex(self, expr, text, end): + my_map = { + text: self.bnf_concept("foo", expr) + } + + sequence = self.validate_get_concepts_sequences(my_map, text, [text]) + assert sequence[0].underlying == u(expr, 0, end, sequence[0].underlying.children) + + @pytest.mark.parametrize("expr, text, end", [ + (Sequence(RegExMatch("bar"), StrMatch("foo")), "bar foo", 2), + (Sequence(RegExMatch("[a-z]+"), StrMatch("foo")), "xyz foo", 2), + (Sequence(RegExMatch("[a-z=]+"), StrMatch("foo")), "uvt=xyz=abc foo", 6), + ]) + def test_i_can_match_sequence_regex_str(self, expr, text, end): + my_map = { + text: self.bnf_concept("foo", expr) + } + + sequence = self.validate_get_concepts_sequences(my_map, text, [text]) + assert sequence[0].underlying == u(expr, 0, end, sequence[0].underlying.children) + + def test_i_can_match_sequence_str_regex_str(self): + text = "foo uvt=xyz=abc baz" + expr = Sequence(StrMatch("foo"), RegExMatch("[a-z=]+"), StrMatch("baz")) + my_map = { + text: self.bnf_concept("foo", expr) + } + + sequence = self.validate_get_concepts_sequences(my_map, text, [text]) + assert sequence[0].underlying == u(expr, 0, 8, sequence[0].underlying.children) + def test_i_can_match_multiple_concepts_in_one_input(self): my_map = { "one": self.bnf_concept("one"), @@ -356,8 +421,8 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): self.validate_get_concepts_sequences(my_map, text, expected) @pytest.mark.parametrize("text, expected", [ - # ("twenty one", [CNC("foo", source="twenty one")]), - 
# ("twenty three", []), # three does not exist + ("twenty one", [CNC("foo", source="twenty one")]), + ("twenty three", []), # three does not exist ("twenty four", []), # four exists but should not be seen ]) def test_i_can_mix_sequence_and_ordered_2(self, text, expected): @@ -388,7 +453,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("three", []), ]) - def test_i_can_parse_unordered_choice(self, text, expected): + def test_i_can_match_unordered_choice(self, text, expected): my_map = { "foo": self.bnf_concept("foo", UnOrderedChoice( StrMatch("one"), @@ -402,7 +467,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("", []), ("two", []), ]) - def test_i_can_parse_optional(self, text, expected): + def test_i_can_match_optional(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Optional(StrMatch("one"))) } @@ -413,7 +478,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("twenty one", [CNC("foo", source="twenty one")]), ("one", [CNC("foo", source="one")]), ]) - def test_i_can_parse_sequence_starting_with_optional(self, text, expected): + def test_i_can_match_sequence_starting_with_optional(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Sequence( @@ -427,7 +492,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one two three", [CNC("foo", source="one two three")]), ("one two", [CNC("foo", source="one two")]), ]) - def test_i_can_parse_sequence_ending_with_optional(self, text, expected): + def test_i_can_match_sequence_ending_with_optional(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Sequence( @@ -442,7 +507,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one two three", [CNC("foo", source="one two three")]), ("one three", [CNC("foo", source="one three")]), ]) - def test_i_can_parse_sequence_with_optional_in_between(self, text, expected): + def test_i_can_match_sequence_with_optional_in_between(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Sequence( 
@@ -459,7 +524,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one", [CNC("foo", source="one")]), ("one one", [CNC("foo", source="one one")]), ]) - def test_i_can_parse_zero_or_more(self, text, expected): + def test_i_can_match_zero_or_more(self, text, expected): my_map = { "foo": self.bnf_concept("foo", ZeroOrMore(StrMatch("one"))) } @@ -471,7 +536,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one two", [CNC("foo", source="one two")]), ("one one two", [CNC("foo", source="one one two")]), ]) - def test_i_can_parse_sequence_and_zero_or_more(self, text, expected): + def test_i_can_match_sequence_and_zero_or_more(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Sequence( @@ -485,7 +550,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): @pytest.mark.parametrize("text, expected", [ ("one, one , one", [CNC("foo", source="one, one , one")]), ]) - def test_i_can_parse_zero_or_more_with_separator(self, text, expected): + def test_i_can_match_zero_or_more_with_separator(self, text, expected): my_map = { "foo": self.bnf_concept("foo", ZeroOrMore(StrMatch("one"), sep=",")) } @@ -508,7 +573,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one", [CNC("foo", source="one")]), ("one one one", [CNC("foo", source="one one one")]), ]) - def test_i_can_parse_one_or_more(self, text, expected): + def test_i_can_match_one_or_more(self, text, expected): my_map = { "foo": self.bnf_concept("foo", OneOrMore(StrMatch("one"))), } @@ -520,7 +585,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("one two", [CNC("foo", source="one two")]), ("one one two", [CNC("foo", source="one one two")]), ]) - def test_i_can_parse_sequence_one_and_or_more(self, text, expected): + def test_i_can_match_sequence_one_and_or_more(self, text, expected): my_map = { "foo": self.bnf_concept("foo", Sequence( @@ -534,7 +599,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): @pytest.mark.parametrize("text, expected", [ ("one, one , one", 
[CNC("foo", source="one, one , one")]), ]) - def test_i_can_parse_one_or_more_with_separator(self, text, expected): + def test_i_can_match_one_or_more_with_separator(self, text, expected): my_map = { "foo": self.bnf_concept("foo", OneOrMore(StrMatch("one"), sep=",")) } @@ -763,7 +828,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): assert concept_bar.get_compiled()["foo"][1].get_compiled() == {ConceptParts.BODY: DoNotResolve("two")} assert concept_bar.get_compiled()["foo"][2].get_compiled() == {ConceptParts.BODY: DoNotResolve("three")} - def test_i_can_parse_concept_reference_that_is_not_in_grammar(self): + def test_i_can_match_concept_reference_that_is_not_in_grammar(self): my_map = { "one": Concept("one"), "two": Concept("two"), @@ -817,6 +882,234 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): assert concept_foo.get_compiled() == {'number': CC(my_map["number"], body=my_map["one"], one=my_map["one"]), ConceptParts.BODY: DoNotResolve(value='twenty one')} + @pytest.mark.parametrize("expr, expected", [ + ("one 'car'", [CNC("foo", source="one 'car'", x=python_ret_val("'car'"))]), # python + ("one bar", [CNC("foo", source="one bar", x=CC("bar"))]), # simple concept + ("one super car", [CNC("foo", source="one super car", x=CC("super car"))]), # long concept + ("one shoe", [CNC("foo", source="one shoe", x=CC("thing", source="shoe", body=DoNotResolve("shoe")))]), # bnf + ]) + def test_i_can_match_variable_when_ending_with_one_variable(self, expr, expected): + my_map = { + "foo": self.bnf_concept("foo", Sequence(StrMatch("one"), VariableExpression("x"))), + "bar": Concept("bar"), + "baz": Concept("baz"), + "thing": Concept("thing", definition="'shoe'|'skirt'"), + "super car": Concept("super car"), + "plus": Concept("x plus y").def_var("x").def_var("y"), + } + + self.validate_get_concepts_sequences(my_map, expr, expected) + + def test_i_can_match_variable_when_ending_with_one_variable_and_sya(self): + my_map = { + "foo": self.bnf_concept("foo", 
Sequence(StrMatch("one"), VariableExpression("x"))), + "bar": Concept("bar"), + "baz": Concept("baz"), + "plus": Concept("x plus y").def_var("x").def_var("y"), + } + + expr = "one bar plus baz" + expected = [ + [CNC("foo", source="one bar", x=CC("bar")), UTN(" plus "), CN("baz")], + [CNC("foo", source="one bar plus baz", x=CC("plus", source="bar plus baz", x="bar", y="baz"))], + ] + + self.validate_get_concepts_sequences(my_map, expr, expected, multiple_result=True) + + def test_i_can_match_variable_when_ending_with_one_variable_and_multiple_results(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(StrMatch("one"), VariableExpression("x"))), + "pretty big": Concept("pretty big", body="'pretty big'"), + "pbig": Concept("pretty big"), + } + + expr = "one pretty big" + expected = [ + [CNC("foo", source="one pretty big", x=CC("pretty big"))], + [CNC("foo", source="one pretty big", x=CC("pbig", source="pretty big"))] + ] + self.validate_get_concepts_sequences(my_map, expr, expected, multiple_result=True) + + def test_i_can_match_variable_when_ending_with_multiple_variables_and_multiple_results(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(StrMatch("one"), VariableExpression("x"), VariableExpression("y"))), + "pretty": Concept("pretty", body="pretty"), + "pretty2": Concept("pretty"), + "big": Concept("big", body="big"), + } + + expr = "one pretty big" + expected = [ + [CNC("foo", source="one pretty big", x=CC("pretty"), y=CC("big"))], + [CNC("foo", source="one pretty big", x=CC("pretty2", source="pretty"), y=CC("big"))] + ] + self.validate_get_concepts_sequences(my_map, expr, expected, multiple_result=True) + + @pytest.mark.parametrize("expr, expected", [ + ("'my' shoe", [CNC("foo", source="'my' shoe", x=python_ret_val("'my' "))]), # python + ("one shoe", [CNC("foo", source="one shoe", x=CC("one"))]), # concept + ("my little shoe", [CNC("foo", source="my little shoe", x=CC("my little"))]), # long concept + ("black shoe", [CNC("foo", 
source="black shoe", x=CC("color", source="black", body=DoNotResolve('black')))]), + ]) + def test_i_can_match_variable_when_starting_with_one_variable(self, expr, expected): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), StrMatch("shoe"))), + "one": Concept("one"), + "my little": Concept("my little"), + "color": Concept("color", definition="'blue'|'black'"), + "and": Concept("x and y").def_var("x").def_var("y"), + } + + self.validate_get_concepts_sequences(my_map, expr, expected) + + def test_i_can_match_variable_when_starting_with_one_variable_and_sya(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), StrMatch("shoe"))), + "tiny": Concept("tiny"), + "beautiful": Concept("beautiful"), + "but": Concept("x but y").def_var("x").def_var("y"), + } + expr = "tiny but beautiful shoe" + expected_res = [ + CNC("foo", + source="tiny but beautiful shoe", + x=CC("but", source="tiny but beautiful", x="tiny", y="beautiful"))] + unwanted_res = [CN("tiny"), UTN(" but "), CNC("foo", source="beautiful shoe", x=CC("beautiful"))] + self.validate_get_concepts_sequences(my_map, expr, [unwanted_res, expected_res], multiple_result=True) + + def test_i_can_match_variable_when_starting_with_multiple_variables(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), + VariableExpression("y"), + VariableExpression("z"), + StrMatch("shoe"))), + "one": Concept("one"), + "two": Concept("two"), + "plus": Concept("x plus y").def_var("x").def_var("y"), + } + + text = "one 'one' one plus two shoe" + + unwanted_res = [CN("one"), SCN(" 'one' "), ("one", 1), UTN(" plus "), CN("two")] + expected_res = [CNC("foo", + source="one 'one' one plus two shoe", + x=CC("one"), + y=python_ret_val(" 'one' "), + z=CC("plus", source="one plus two", x="one", y="two"))] + expected = [unwanted_res, expected_res] + + self.validate_get_concepts_sequences(my_map, text, expected, multiple_result=True) + + def 
test_i_can_match_variable_when_starting_with_one_variable_and_longer_str(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), + StrMatch("foo"), + StrMatch("bar"), + StrMatch("baz"))), + "one": Concept("one") + } + text = "one foo bar baz" + expected = [CNC("foo", source="one foo bar baz", x=CC("one"))] + self.validate_get_concepts_sequences(my_map, text, expected) + + @pytest.mark.parametrize("expr, expected", [ + ("one 'pretty' shoe", [CNC("foo", source="one 'pretty' shoe", x=python_ret_val("'pretty' "))]), # python + ("one little shoe", [CNC("foo", source="one little shoe", x=CC("little"))]), # concept + ("one very big shoe", [CNC("foo", source="one very big shoe", x=CC("very big"))]), # long concept + ("one black shoe", + [CNC("foo", source="one black shoe", x=CC("color", source="black", body=DoNotResolve('black')))]), + ("one tiny but beautiful shoe", + [CNC("foo", + source="one tiny but beautiful shoe", + x=CC("but", source="tiny but beautiful", x="tiny", y="beautiful "))]), + ]) + def test_i_can_match_variable_in_between(self, expr, expected): + my_map = { + "foo": self.bnf_concept("foo", Sequence(StrMatch("one"), VariableExpression("x"), StrMatch("shoe"))), + "little": Concept("little"), + "very big": Concept("very big"), + "color": Concept("color", definition="'blue'|'black'"), + "but": Concept("x but y").def_var("x").def_var("y"), + } + + self.validate_get_concepts_sequences(my_map, expr, expected) + + def test_i_can_match_variable_when_multiple_results_in_between(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(StrMatch("one"), VariableExpression("x"), StrMatch("shoe"))), + "pretty big": Concept("pretty big", body="'pretty big'"), + "pbig": Concept("pretty big"), + } + + expr = "one pretty big shoe" + expected = [ + [CNC("foo", source="one pretty big shoe", x=CC("pretty big"))], + [CNC("foo", source="one pretty big shoe", x=CC("pbig", source="pretty big"))] + ] + self.validate_get_concepts_sequences(my_map, 
expr, expected, multiple_result=True) + + def test_i_can_match_regex_and_variable(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(RegExMatch("[a-z]+"), + VariableExpression("x"))), + "shoe": Concept("shoe") + } + text = "onyx shoe" + expected = [CNC("foo", source="onyx shoe", x=CC("shoe"))] + self.validate_get_concepts_sequences(my_map, text, expected) + + def test_i_can_match_variable_and_regex(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), + RegExMatch("[a-z]+"))), + "one": Concept("one") + } + text = "one onyx" + expected = [CNC("foo", source="one onyx", x=CC("one"))] + self.validate_get_concepts_sequences(my_map, text, expected) + + def test_i_can_reuse_the_same_variable(self): + # in this test, the variable appears several times, but only once in concept.compiled + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), + StrMatch("equals"), + VariableExpression("x"))), + "one": Concept("one"), + "two": Concept("two"), + } + sheerka, context, *updated = self.init_concepts(*my_map.values()) + parser = BnfNodeParser() + parser.init_from_concepts(context, updated) + + # same variable appears only once in the compiled variables + text = "one equals one" + expected = [CNC("foo", source="one equals one", x=CC("one"))] + expected_sequence = compute_expected_array(my_map, text, expected) + + parser.reset_parser(context, ParserInput(text)) + bnf_parsers_helpers = parser.get_concepts_sequences(context) + to_compare = tests.parsers.parsers_utils.get_test_obj(expected_sequence, bnf_parsers_helpers[0].sequence) + assert to_compare == expected + + def test_i_cannot_match_variable_when_variables_discrepancy(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), + StrMatch("equals"), + VariableExpression("x"))), + "one": Concept("one"), + "one_1": Concept("one", body="1"), + "two": Concept("two"), + "two_2": Concept("two", body="2"), + } + sheerka, context, *updated = 
self.init_concepts(*my_map.values()) + parser = BnfNodeParser() + parser.init_from_concepts(context, updated) + + text = "one equals two" + parser.reset_parser(context, ParserInput(text)) + bnf_parsers_helpers = parser.get_concepts_sequences(context) + assert bnf_parsers_helpers[0].sequence == [] + @pytest.mark.parametrize("bar_expr, expected", [ (ConceptExpression("foo"), {}), (OrderedChoice(ConceptExpression("foo"), StrMatch("one")), {'one': ['1002']}), @@ -833,7 +1126,6 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): parser.sheerka = sheerka # every obvious cyclic recursion are removed from concept_by_first_keyword dict - parser.init_from_concepts(context, my_map.values()) assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == expected # get_parsing_expression() also returns CHICKEN_AND_EGG @@ -858,7 +1150,6 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): parser.sheerka = sheerka # every obvious cyclic recursion are removed from concept_by_first_keyword dict - parser.init_from_concepts(context, my_map.values()) assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} parsing_expression = parser.get_parsing_expression(context, my_map["foo"]) @@ -884,7 +1175,6 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): parser.sheerka = sheerka # every obvious cyclic recursion are removed from concept_by_first_keyword dict - parser.init_from_concepts(context, my_map.values()) assert sheerka.om.copy(SheerkaConceptManager.RESOLVED_CONCEPTS_BY_FIRST_KEYWORD_ENTRY) == {} parsing_expression = parser.get_parsing_expression(context, my_map["foo"]) @@ -908,8 +1198,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): "foo": self.bnf_concept("foo", expr), } - sheerka, context, parser = self.init_parser(my_map, singleton=True) - parser.init_from_concepts(context, my_map.values()) + sheerka, context, parser = self.init_parser(my_map) parser.context = context parser.sheerka = sheerka @@ 
-923,7 +1212,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): "number": Concept("number"), "twenties": self.bnf_concept("twenties", Sequence(ConceptExpression("twenty"), ConceptExpression("number"))) } - sheerka, context, parser = self.init_parser(my_map, singleton=True) + sheerka, context, parser = self.init_parser(my_map) parser.context = context parser.sheerka = sheerka sheerka.set_isa(context, sheerka.new("one"), my_map["number"]) @@ -1025,8 +1314,8 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ConceptExpression(my_map["one"], rule_name="one")) @pytest.mark.parametrize("expr, text, expected", [ - # (ZeroOrMore(StrMatch("one"), sep=","), "one,", [CNC("foo", source="one"), UTN(",")]), - # (StrMatch("one"), "one two", [CNC("foo", source="one"), UTN(" two")]), + (ZeroOrMore(StrMatch("one"), sep=","), "one,", [CNC("foo", source="one"), UTN(",")]), + (StrMatch("one"), "one two", [CNC("foo", source="one"), UTN(" two")]), (StrMatch("one"), "two one", [UTN("two "), CNC("foo", source="one")]), ]) def test_i_can_recognize_unknown_concepts(self, expr, text, expected): @@ -1053,7 +1342,6 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): "three": self.bnf_concept("three") } sheerka, context, parser = self.init_parser(my_map, singleton=True) - parser.init_from_concepts(context, my_map.values()) parser.reset_parser(context, ParserInput("one three")) sequences = parser.get_concepts_sequences(context) @@ -1067,6 +1355,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): ("bar", True, [CNC("foo or bar", source="bar", bar="bar", body="bar")]), ("one plus two", True, [CNC("plus", source="one plus two", one="one", two="two")]), ("twenty one", True, [CNC("t1", source="twenty one", unit="one")]), + ("one 'car'", True, [CNC("one thing", source="one 'car'", x=python_ret_val("'car'"), one="one")]) ]) def test_i_can_parse_simple_expressions(self, parser_input, expected_status, expected): sheerka, context, parser = self.init_parser(init_from_sheerka=True) 
@@ -1359,8 +1648,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): "expr": Concept("expr", definition="term ('+' term)*"), } - sheerka, context, parser = self.init_parser(my_map, singleton=True) - parser.init_from_concepts(context, my_map.values()) + sheerka, context, parser = self.init_parser(my_map) text = "1 + 2 * 3" @@ -1396,8 +1684,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): Sequence(ConceptExpression("term"), StrMatch("+"), ConceptExpression("expr")), ConceptExpression("term"))), } - sheerka, context, parser = self.init_parser(my_map, singleton=True) - parser.init_from_concepts(context, my_map.values()) + sheerka, context, parser = self.init_parser(my_map) text = "1 + 2 * 3" @@ -1437,8 +1724,7 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): OrderedChoice(StrMatch("bar"), ConceptExpression("foo")))), } - sheerka, context, parser = self.init_parser(my_map, singleton=True) - parser.init_from_concepts(context, my_map.values()) + sheerka, context, parser = self.init_parser(my_map) assert parser.parse(context, ParserInput("foo bar")).status assert parser.parse(context, ParserInput("foo foo foo bar")).status @@ -1475,6 +1761,128 @@ class TestBnfNodeParser(TestUsingMemoryBasedSheerka): assert res.status assert res.value.value == compute_expected_array(cmap, text, [CN("thirties", source=text)]) + def test_i_do_not_eat_unwanted_tokens_at_the_beginning_when_concept_with_variable(self): + my_map = { + "foo": self.bnf_concept("foo", Sequence(VariableExpression("x"), StrMatch("shoe"))), + "one": Concept("one"), + "two": Concept("two"), + } + sheerka, context, parser = self.init_parser(my_map) + + text = "two one shoe" + res = parser.parse(context, ParserInput(text)) + assert res.status + assert res.value.value == compute_expected_array(my_map, text, [ + CN("two"), + CNC("foo", source="one shoe", x=CC("one"))]) + + def test_i_do_not_eat_unwanted_tokens_at_the_end_when_concept_with_variable(self): + my_map = { + "foo": self.bnf_concept("foo", 
Sequence(StrMatch("one"), VariableExpression("x"))), + "bar": Concept("bar"), + "baz": Concept("baz"), + } + sheerka, context, parser = self.init_parser(my_map) + + text = "one bar baz" + res = parser.parse(context, ParserInput(text)) + assert res.status + assert res.value.value == compute_expected_array(my_map, text, [ + CNC("foo", source="one bar", x=CC("bar")), + CN("baz")]) + + @pytest.mark.parametrize("parsing_expression, expected", [ + (RegExMatch("a"), [RegExDef("a")]), + (OrderedChoice(StrMatch("first"), RegExMatch("a|b")), ["first", RegExDef("a|b")]), + (OrderedChoice(RegExMatch("a|b"), StrMatch("first")), [RegExDef("a|b"), "first"]), + (Sequence(StrMatch("a"), RegExMatch("a|b")), ["a"]), + (Sequence(RegExMatch("a|b"), StrMatch("a")), [RegExDef("a|b")]), + (OneOrMore(StrMatch("a"), RegExMatch("a|b")), ["a"]), + (OneOrMore(RegExMatch("a|b"), StrMatch("a")), [RegExDef("a|b")]), + (ZeroOrMore(StrMatch("a"), RegExMatch("a|b")), ["a"]), + (ZeroOrMore(RegExMatch("a|b"), StrMatch("a")), [RegExDef("a|b")]), + ]) + def test_i_can_get_first_item(self, parsing_expression, expected): + sheerka = self.get_sheerka() + + visitor = BnfNodeFirstTokenVisitor(sheerka) + + visitor.visit(parsing_expression) + assert visitor.first_tokens == expected + + def test_i_cannot_parse_regex_when_no_next_matching_token_cannot_be_found(self): + sheerka, context, foo = self.init_test().with_concepts(Concept("foo", definition="r'abcd'"), + create_new=True).unpack() + + parser = BnfNodeParser(sheerka=sheerka) + res = parser.parse(context, ParserInput("abcdef")) + + assert not res.status + assert sheerka.isinstance(res.body, BuiltinConcepts.NOT_FOR_ME) + assert res.body.reason == [NoMatchingTokenError(4)] + + @pytest.mark.parametrize("text", [ + "one", + " one", + "one ", + " one " + ]) + def test_i_cannot_parse_empty_variable(self, text): + sheerka, context, parser = self.init_parser(init_from_sheerka=True) + + res = parser.parse(context, ParserInput("one")) + + assert not res.status + 
assert sheerka.isinstance(res.body, BuiltinConcepts.NOT_FOR_ME) + + @pytest.mark.parametrize("bnf, text", [ + (Sequence(VariableExpression("x"), StrMatch("foo")), "one foo"), + (Sequence(StrMatch("foo"), VariableExpression("x")), "foo one"), + (Sequence(StrMatch("foo"), VariableExpression("x"), StrMatch("bar")), "foo one bar"), + ]) + def test_i_cannot_parse_variable_when_unrecognized_nodes(self, bnf, text): + sheerka, context, foo = self.init_test().with_concepts( + self.bnf_concept("foo", Sequence(VariableExpression("x"), StrMatch("shoe"))) + ).unpack() + parser = BnfNodeParser() + parser.init_from_concepts(context, [foo]) + + res = parser.parse(context, ParserInput(text)) + + assert not res.status + assert sheerka.isinstance(res.body, BuiltinConcepts.NOT_FOR_ME) + + @pytest.mark.parametrize("to_match, ignore_case, multiline, explicit_flags", [ + ("xxy", None, None, re.MULTILINE), + ("xxy", True, True, re.MULTILINE), + ("xxy", False, False, re.MULTILINE), + ]) + def test_i_can_serialize_reg_ex_def(self, to_match, ignore_case, multiline, explicit_flags): + r = RegExDef(to_match, ignore_case, multiline, explicit_flags) + serialized = r.serialize() + + r2 = RegExDef().deserialize(serialized) + + assert r == r2 + + def test_i_can_resolve_parsing_expression_for_variable_concept(self): + sheerka, context, parser = self.init_parser(init_from_sheerka=True) + + expression = Sequence(VariableExpression("x"), StrMatch("x")) + resolved = parser.resolve_parsing_expression(context, expression, {}, set(), set()) + + assert isinstance(resolved.nodes[0], VariableExpression) + assert resolved.nodes[0].nodes[0] == resolved.nodes[1] + + def test_i_can_resolve_parsing_expression_when_ending_with_variable_concept(self): + sheerka, context, parser = self.init_parser(init_from_sheerka=True) + + expression = Sequence(StrMatch("x"), VariableExpression("x")) + resolved = parser.resolve_parsing_expression(context, expression, {}, set(), set()) + + assert isinstance(resolved.nodes[1], 
VariableExpression) + assert resolved.nodes[0].nodes == [] + # @pytest.mark.parametrize("parser_input, expected", [ # ("one", [ # (True, [CNC("bnf_one", source="one", one="one", body="one")]), diff --git a/tests/parsers/test_BnfParser.py b/tests/parsers/test_BnfParser.py index 5ac9480..184a21c 100644 --- a/tests/parsers/test_BnfParser.py +++ b/tests/parsers/test_BnfParser.py @@ -1,4 +1,5 @@ import pytest + from core.builtin_concepts import BuiltinConcepts from core.concept import Concept, DEFINITION_TYPE_BNF from core.sheerka.services.SheerkaExecute import ParserInput @@ -6,10 +7,9 @@ from core.tokenizer import Tokenizer, TokenKind, LexerError from parsers.BaseNodeParser import cnode from parsers.BaseParser import UnexpectedTokenParsingError, UnexpectedEofParsingError from parsers.BnfDefinitionParser import BnfDefinitionParser -from parsers.BnfNodeParser import BnfNodeParser +from parsers.BnfNodeParser import BnfNodeParser, RegExMatch, VariableExpression from parsers.BnfNodeParser import StrMatch, Optional, ZeroOrMore, OrderedChoice, Sequence, \ OneOrMore, ConceptExpression - from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka @@ -49,6 +49,7 @@ class TestBnfParser(TestUsingMemoryBasedSheerka): ("1", StrMatch("1")), (" 1", StrMatch("1")), (",", StrMatch(",")), + ("r'str'", RegExMatch("str")), ("'foo'?", Optional(StrMatch("foo"))), ("'foo'*", ZeroOrMore(StrMatch("foo"))), ("'foo'+", OneOrMore(StrMatch("foo"))), @@ -84,6 +85,19 @@ class TestBnfParser(TestUsingMemoryBasedSheerka): ("('foo'=var)*", ZeroOrMore(StrMatch("foo", rule_name="var"))), ("'foo'=var+", OneOrMore(StrMatch("foo", rule_name="var"))), ("('foo'=var)+", OneOrMore(StrMatch("foo", rule_name="var"))), + ("r'str'=var", RegExMatch("str", rule_name="var")), + ("r'foo'?=var", Optional(RegExMatch("foo"), rule_name="var")), + ("(r'foo'?)=var", Optional(RegExMatch("foo"), rule_name="var")), + ("r'foo'*=var", ZeroOrMore(RegExMatch("foo"), rule_name="var")), + ("(r'foo'*)=var", 
ZeroOrMore(RegExMatch("foo"), rule_name="var")), + ("r'foo'+=var", OneOrMore(RegExMatch("foo"), rule_name="var")), + ("(r'foo'+)=var", OneOrMore(RegExMatch("foo"), rule_name="var")), + ("r'foo'=var?", Optional(RegExMatch("foo", rule_name="var"))), + ("(r'foo'=var)?", Optional(RegExMatch("foo", rule_name="var"))), + ("r'foo'=var*", ZeroOrMore(RegExMatch("foo", rule_name="var"))), + ("(r'foo'=var)*", ZeroOrMore(RegExMatch("foo", rule_name="var"))), + ("r'foo'=var+", OneOrMore(RegExMatch("foo", rule_name="var"))), + ("(r'foo'=var)+", OneOrMore(RegExMatch("foo", rule_name="var"))), ("(1 | 2 | 3)=var", OrderedChoice(StrMatch("1"), StrMatch("2"), StrMatch("3"), rule_name="var")), ("(1 2)=var", Sequence(StrMatch("1"), StrMatch("2"), rule_name="var")), ("(1 2)+=var", OneOrMore(Sequence(StrMatch("1"), StrMatch("2")), rule_name="var")), @@ -118,6 +132,8 @@ class TestBnfParser(TestUsingMemoryBasedSheerka): ("foo=f", c("foo", "f")), ("foo=f 'constant'", Sequence(c("foo", "f"), StrMatch("constant"))), ("def 'concept'", Sequence(c("def"), StrMatch("concept"))), + ("c:foo:", c("foo")), + ("c:|1001:", c("foo")), ]) def test_i_can_parse_regex_with_concept(self, expression, expected): sheerka, context, parser, *concepts = self.init_parser("foo", "bar", "var", "def") @@ -131,6 +147,29 @@ class TestBnfParser(TestUsingMemoryBasedSheerka): assert res.value.value == expected assert res.value.source == expression + @pytest.mark.parametrize("expression, expected", [ + ("x", VariableExpression("x")), + ("x bar", Sequence(VariableExpression("x"), c("bar"))), + ("bar x", Sequence(c("bar"), VariableExpression("x"))), + ("x 'and' bar", Sequence(VariableExpression("x"), StrMatch("and"), c("bar"))), + ("x | bar", OrderedChoice(VariableExpression("x"), c("bar"))), + ("x*", ZeroOrMore(VariableExpression("x"))), + ("x+", OneOrMore(VariableExpression("x"))), + ("'str' = x", Sequence(StrMatch("str"), StrMatch("="), VariableExpression("x"))), + ("'str''='x", Sequence(StrMatch("str"), StrMatch("="), 
VariableExpression("x"))), + ("foo=x", VariableExpression("x")), + ]) + def test_i_can_parse_regex_with_variable(self, expression, expected): + # A variable is an identifier that cannot be resolved to a concept + sheerka, context, regex_parser, bar = self.init_parser("bar") + update_concepts_ids(sheerka, expected) + + res = regex_parser.parse(self.get_context(), expression) + + assert res.status + assert res.value.value == expected + assert res.value.source == expression + @pytest.mark.parametrize("expression, expected", [ ("foo", ConceptExpression("foo")), ("foo=f", ConceptExpression("foo", rule_name="f")), @@ -208,13 +247,18 @@ class TestBnfParser(TestUsingMemoryBasedSheerka): assert context.sheerka.isinstance(res.value, BuiltinConcepts.CANNOT_RESOLVE_CONCEPT) assert res.value.body == ('key', 'foo') - def test_i_cannot_parse_when_unknown_concept(self): + @pytest.mark.parametrize("text, expected", [ + ("c:foo:", {'key': 'foo'}), + ("c:|1001:", {'id': '1001'}), + ("c:foo|1001:", {'key': 'foo', 'id': '1001'}), + ]) + def test_i_cannot_parse_when_unknown_concept(self, text, expected): sheerka, context, regex_parser = self.init_parser() - res = regex_parser.parse(self.get_context(), "foo") + res = regex_parser.parse(self.get_context(), text) assert not res.status assert context.sheerka.isinstance(res.value, BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body == ('key', 'foo') + assert res.value.body == expected def test_concept_expression_are_correctly_created_when_isa_concept_is_detected(self): sheerka, context, parser, one, two, number, twenties = self.init_parser( diff --git a/tests/parsers/test_DefConceptParser.py b/tests/parsers/test_DefConceptParser.py index 2266db6..78d2942 100644 --- a/tests/parsers/test_DefConceptParser.py +++ b/tests/parsers/test_DefConceptParser.py @@ -2,6 +2,7 @@ import ast from dataclasses import dataclass import pytest + from core.builtin_concepts import ParserResultConcept, BuiltinConcepts, ReturnValueConcept from core.concept 
import DEFINITION_TYPE_BNF, DEFINITION_TYPE_DEF, Concept, CV from core.global_symbols import NotInit @@ -9,13 +10,13 @@ from core.sheerka.services.SheerkaExecute import ParserInput from core.tokenizer import Keywords, Tokenizer, LexerError from parsers.BaseNodeParser import SCWC from parsers.BaseParser import UnexpectedEofParsingError -from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence from parsers.BnfDefinitionParser import BnfDefinitionParser +from parsers.BnfNodeParser import OrderedChoice, ConceptExpression, StrMatch, Sequence, RegExMatch, OneOrMore, \ + VariableExpression from parsers.DefConceptParser import DefConceptParser, NameNode, SyntaxErrorNode from parsers.DefConceptParser import UnexpectedTokenParsingError, DefConceptNode from parsers.FunctionParser import FunctionParser from parsers.PythonParser import PythonParser, PythonNode - from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka from tests.parsers.parsers_utils import compute_expected_array @@ -332,7 +333,7 @@ def concept add one to a as: "def concept name from bnf ", "def concept name from bnf as True", ]) - def test_i_cannot_parse_empty_bnf_definition(self, text): + def test_i_cannot_parse_empty_bnf_definition_when_no_definition(self, text): sheerka, context, parser, *concepts = self.init_parser() res = parser.parse(context, ParserInput(text)) error = res.body @@ -347,7 +348,8 @@ def concept add one to a as: node = res.value.value definition = OrderedChoice(ConceptExpression(a_concept, rule_name="a_concept"), StrMatch("a_string")) - parser_result = ParserResultConcept(BnfDefinitionParser(), "a_concept | 'a_string'", None, definition, definition) + parser_result = ParserResultConcept(BnfDefinitionParser(), "a_concept | 'a_string'", None, definition, + definition) expected = get_def_concept(name="name", body="__definition[0]", bnf_def=parser_result) assert res.status @@ -356,6 +358,22 @@ def concept add one to a as: assert 
isinstance(res.value, ParserResultConcept) assert node == expected + def test_i_can_parse_def_concept_from_bnf_when_using_concept_token(self): + text = "def concept name from bnf c:a_concept: 'xxx'" + sheerka, context, parser, a_concept = self.init_parser("a_concept") + res = parser.parse(context, ParserInput(text)) + + node = res.value.value + definition = Sequence(ConceptExpression(a_concept, rule_name="a_concept"), StrMatch("xxx")) + parser_result = ParserResultConcept(BnfDefinitionParser(), "c:a_concept: 'xxx'", None, definition, definition) + expected = get_def_concept(name="name", bnf_def=parser_result) + + assert res.status + assert res.who == parser.name + assert res.value.source == text + assert isinstance(res.value, ParserResultConcept) + assert node == expected + def test_i_can_parse_def_concept_where_bnf_references_itself(self): text = "def concept name from bnf 'a' + name?" sheerka, context, parser, a_concept = self.init_parser("a_concept") @@ -495,15 +513,6 @@ from give me the date ! assert res.body.body[0].message == error_msg assert res.body.body[0].text == error_text - def test_i_cannot_parse_bnf_definition_referencing_unknown_concept(self): - text = "def concept name from bnf unknown" - sheerka, context, parser, *concepts = self.init_parser() - res = parser.parse(context, ParserInput(text)) - - assert not res.status - assert context.sheerka.isinstance(res.value, BuiltinConcepts.UNKNOWN_CONCEPT) - assert res.value.body == ("key", "unknown") - def test_i_cannot_parse_bnf_definition_referencing_multiple_concepts_sharing_the_same_name(self): text = "def concept twenty one from bnf 'twenty' one" sheerka, context, parser, *concepts = self.init_parser(Concept("one", body="1"), Concept("one", body="1.0")) @@ -557,5 +566,78 @@ from give me the date ! 
assert isinstance(res.value, ParserResultConcept) assert node == expected + def test_i_can_parse_bnf_concept_with_regex(self): + sheerka, context, parser, number = self.init_parser("number") + text = "def concept sha512 from bnf r'^[a-f0-9]{128}$'" + res = parser.parse(context, ParserInput(text)) + assert res.status + assert res.who == parser.name + assert res.value.source == text + assert isinstance(res.value, ParserResultConcept) + node = res.value.value + parsing_expression = RegExMatch("^[a-f0-9]{128}$") + parser_result = ParserResultConcept(BnfDefinitionParser(), + "r'^[a-f0-9]{128}$'", + None, + parsing_expression, + parsing_expression) + expected = get_def_concept(name="sha512", bnf_def=parser_result) + assert node == expected + + def test_i_can_parse_bnf_concept_with_a_more_complicated_bnf(self): + sheerka, context, parser, number = self.init_parser("number") + text = "def concept foo from bnf number | r'[a-f0-9]+' | (number r'[a-f0-9]+')+" + res = parser.parse(context, ParserInput(text)) + + assert res.status + assert res.who == parser.name + assert res.value.source == text + assert isinstance(res.value, ParserResultConcept) + + node = res.value.value + parsing_expression = OrderedChoice( + ConceptExpression(number, rule_name="number"), + RegExMatch("[a-f0-9]+"), + OneOrMore(Sequence(ConceptExpression(number, rule_name="number"), RegExMatch("[a-f0-9]+"))) + ) + parser_result = ParserResultConcept(BnfDefinitionParser(), + "number | r'[a-f0-9]+' | (number r'[a-f0-9]+')+", + None, + parsing_expression, + parsing_expression) + expected = get_def_concept(name="foo", bnf_def=parser_result) + assert node == expected + + def test_i_can_parse_bnf_concept_definition_with_a_variable(self): + sheerka, context, parser, number = self.init_parser("number") + text = "def concept foo from bnf number x where x" + res = parser.parse(context, ParserInput(text)) + + node = res.value.value + definition = Sequence(ConceptExpression(number, rule_name="number"), 
VariableExpression("x")) + parser_result = ParserResultConcept(BnfDefinitionParser(), "number x", None, definition, definition) + expected = get_def_concept(name="foo", bnf_def=parser_result, where="x") + + assert res.status + assert res.who == parser.name + assert res.value.source == text + assert isinstance(res.value, ParserResultConcept) + assert node == expected + + def test_i_can_parse_bnf_definition_referencing_unknown_concept(self): + text = "def concept name from bnf unknown" + sheerka, context, parser, *concepts = self.init_parser() + res = parser.parse(context, ParserInput(text)) + + node = res.value.value + definition = VariableExpression("unknown") + parser_result = ParserResultConcept(BnfDefinitionParser(), "unknown", None, definition, definition) + expected = get_def_concept(name="name", bnf_def=parser_result) + + assert res.status + assert res.who == parser.name + assert res.value.source == text + assert isinstance(res.value, ParserResultConcept) + assert node == expected diff --git a/tests/parsers/test_parsers_utils.py b/tests/parsers/test_parsers_utils.py new file mode 100644 index 0000000..37d5ea1 --- /dev/null +++ b/tests/parsers/test_parsers_utils.py @@ -0,0 +1,109 @@ +from core.concept import Concept, ConceptParts, CC +from core.sheerka.services.SheerkaExecute import ParserInput +from parsers.BaseNodeParser import CNC +from parsers.BnfNodeParser import BnfNodeParser +from parsers.SyaNodeParser import SyaNodeParser +from tests.TestUsingMemoryBasedSheerka import TestUsingMemoryBasedSheerka +from tests.parsers.parsers_utils import get_test_obj + + +class TestParsersUtils(TestUsingMemoryBasedSheerka): + + def test_i_can_get_test_obj_when_CNC_from_sya(self): + sheerka, context, one, two, plus = self.init_concepts( + "one", + "two", + Concept("a plus b").def_var("a").def_var("b") + ) + + parser = SyaNodeParser().init_from_concepts(context, [one, two, plus]) + cnode = parser.parse(context, ParserInput("one plus two")).body.body[0] + + # compare all 
attributes + cnc_res = get_test_obj(CNC(concept_key="key", start=0, end=1, source="", exclude_body=False), cnode) + assert isinstance(cnc_res, CNC) + assert cnc_res == CNC("__var__0 plus __var__1", 0, 4, "one plus two", False, **cnode.concept.get_compiled()) + + # I can discard start, end and source + cnc_res = get_test_obj(CNC(concept_key="key"), cnode) + assert isinstance(cnc_res, CNC) + assert cnc_res == CNC("__var__0 plus __var__1", None, None, None, False, **cnode.concept.get_compiled()) + + def test_i_can_get_test_obj_when_CNC_from_bnf(self): + sheerka, context, one, two, plus = self.init_concepts( + "one", + "two", + Concept("twenties", definition="'twenty' (one | two)=unit").def_var("unit").def_var("one").def_var("two") + ) + + parser = BnfNodeParser().init_from_concepts(context, [one, two, plus]) + cnode = parser.parse(context, ParserInput("twenty one")).body.body[0] + + # compare all attributes + cnc_res = get_test_obj(CNC(concept_key="key", start=0, end=1, source="", exclude_body=False), cnode) + assert isinstance(cnc_res, CNC) + assert cnc_res == CNC("twenties", 0, 2, "twenty one", False, **cnode.concept.get_compiled()) + + # I can exclude body + cnc_res = get_test_obj(CNC(concept_key="key", exclude_body=True), cnode) + expected_compiled = {k: v for k, v in cnode.concept.get_compiled().items()} + del expected_compiled[ConceptParts.BODY] + assert isinstance(cnc_res, CNC) + assert cnc_res == CNC("twenties", None, None, None, False, **expected_compiled) + + def test_i_can_get_test_obj_when_list(self): + sheerka, context, one, two, plus = self.init_concepts( + "one", + "two", + Concept("a plus b").def_var("a").def_var("b") + ) + + parser = SyaNodeParser().init_from_concepts(context, [one, two, plus]) + cnode = parser.parse(context, ParserInput("one plus two")).body.body[0] + + res = get_test_obj([CNC("key1"), CNC("key", 0, 1, "")], [cnode, cnode]) + + assert len(res) == 2 + assert isinstance(res[0], CNC) + assert res[0] == CNC("__var__0 plus __var__1", 
None, None, None, False, **cnode.concept.get_compiled()) + assert isinstance(res[1], CNC) + assert res[1] == CNC("__var__0 plus __var__1", 0, 4, "one plus two", False, **cnode.concept.get_compiled()) + + def test_i_can_get_test_obj_when_dict(self): + sheerka, context, one, two, plus = self.init_concepts( + "one", + "two", + Concept("a plus b").def_var("a").def_var("b") + ) + + parser = SyaNodeParser().init_from_concepts(context, [one, two, plus]) + cnode = parser.parse(context, ParserInput("one plus two")).body.body[0] + + res = get_test_obj({"key1": CNC("key1"), "key2": CNC("key", 0, 1, "")}, {"key1": cnode, "key2": cnode}) + assert len(res) == 2 + assert isinstance(res["key1"], CNC) + assert res["key1"] == CNC("__var__0 plus __var__1", None, None, None, False, **cnode.concept.get_compiled()) + assert isinstance(res["key2"], CNC) + assert res["key2"] == CNC("__var__0 plus __var__1", 0, 4, "one plus two", False, **cnode.concept.get_compiled()) + + def test_i_can_get_test_obj_when_CC(self): + sheerka, context, one, two, plus = self.init_concepts( + "one", + "two", + Concept("twenties", definition="'twenty' (one | two)=unit").def_var("unit").def_var("one").def_var("two") + ) + + parser = BnfNodeParser().init_from_concepts(context, [one, two, plus]) + cc = parser.parse(context, ParserInput("twenty one")).body.body[0].concept + + # compare all attributes + cc_res = get_test_obj(CC(concept="key", source="", exclude_body=False), cc) + assert isinstance(cc_res, CC) + assert cc_res == CC("twenties", "twenty one", False, **cc.get_compiled()) + + # I can exclude body + cnc_res = get_test_obj(CC(concept="key", exclude_body=True), cc) + expected_compiled = {k: v for k, v in cc.get_compiled().items()} + del expected_compiled[ConceptParts.BODY] + assert isinstance(cnc_res, CC) + assert cnc_res == CC("twenties", "twenty one", True, **expected_compiled) diff --git a/tests/sheerkapickle/test_SheerkaPickler.py b/tests/sheerkapickle/test_SheerkaPickler.py index 775e071..d42edb0 
100644 --- a/tests/sheerkapickle/test_SheerkaPickler.py +++ b/tests/sheerkapickle/test_SheerkaPickler.py @@ -2,7 +2,7 @@ import logging import pytest from core.concept import Concept -from core.global_symbols import NotInit, NotFound, Removed +from core.global_symbols import NotInit, NotFound, Removed, NoFirstToken from core.tokenizer import Keywords from sheerkapickle import tags from sheerkapickle.SheerkaPickler import SheerkaPickler @@ -68,6 +68,7 @@ class TestSheerkaPickler(TestUsingMemoryBasedSheerka): (NotInit, {tags.CUSTOM: NotInit.value}), (NotFound, {tags.CUSTOM: NotFound.value}), (Removed, {tags.CUSTOM: Removed.value}), + (NoFirstToken, {tags.CUSTOM: NoFirstToken.value}), ]) def test_i_can_flatten_and_restore_custom_types(self, obj, expected): sheerka = self.get_sheerka()