diff --git a/core/concept.py b/core/concept.py
index 7f01891..638f66e 100644
--- a/core/concept.py
+++ b/core/concept.py
@@ -1,29 +1,92 @@
+import hashlib
+from enum import Enum
+
+
+class ConceptParts(Enum):
+    WHERE = "where"
+    PRE = "pre"
+    POST = "post"
+    BODY = "body"
+
+
 class Concept:
     """
     Default concept object
    A concept is the base object of our universe
     Everything is a concept
     """
+    props_to_serialize = ("key", "name", "where", "pre", "post", "body", "desc")
 
-    concepts_id = 0
+    key_name = "concepts"
 
-    def __init__(self, name, is_builtin=False):
+    def __init__(self, name=None, is_builtin=False, where=None, pre=None, post=None, body=None, desc=None):
         self.name = name
         self.is_builtin = is_builtin
-        self.pre = None  # list of pre conditions before calling the main function
-        self.post = None  # list of post conditions after calling the main function
-        self.main = None  # main method, can also be the value of the concept
-        self.id = Concept.concepts_id
-        Concept.concepts_id = Concept.concepts_id + 1
+        self.where = where  # condition used to recognize variables in the name
+        self.pre = pre  # list of pre conditions checked before calling the body
+        self.post = post  # list of post conditions checked after calling the body
+        self.body = body  # main method, can also be the value of the concept
+        self.desc = desc
+        self.key = None
+        self.parent = None
 
         self.props = []  # list of Property for this concept
         self.functions = {}  # dict of helper functions
 
-    def __str__(self):
-        return f"({self.id}){self.name}"
+        self.codes = {}
 
     def __repr__(self):
-        return f"({self.id}){self.name}"
+        return f"({self.key}){self.name}"
+
+    def __eq__(self, other):
+        if not isinstance(other, Concept):
+            return False
+        return self.name == other.name and \
+            self.where == other.where and \
+            self.pre == other.pre and \
+            self.post == other.post and \
+            self.body == other.body
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def add_codes(self, codes):
+        """
+        Fills self.codes from a dict mapping part names (ConceptParts values) to ASTs
+        :param codes: dict of <part name, AST>
+        :return:
+        """
+        possible_codes = set(item.value for item in ConceptParts)
+        if codes is None:
+            return
+        for key in codes:
+            if key in possible_codes:
+                self.codes[ConceptParts(key)] = codes[key]
+
+    def get_digest(self):
+        """
+        Returns the digest of the concept
+        :return: hexa form of the sha256
+        """
+        return hashlib.sha256(f"Concept:{self.name}{self.pre}{self.post}{self.body}".encode("utf-8")).hexdigest()
+
+    def to_dict(self):
+        props_as_dict = dict((prop, getattr(self, prop)) for prop in self.props_to_serialize)
+        return props_as_dict
+
+    def from_dict(self, as_dict):
+        for prop in self.props_to_serialize:
+            setattr(self, prop, as_dict[prop])
+        return self
+
+
+class ErrorConcept(Concept):
+    def __init__(self, where=None, pre=None, post=None, body=None, desc=None):
+        Concept.__init__(self, "error", is_builtin=True, where=where, pre=pre, post=post, body=body, desc=desc)
+
+    def __repr__(self):
+        return f"{self.name} : {self.body}"
 
 
 class Property:
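The `to_dict`/`from_dict`/`get_digest` trio added above is what the object store later in this patch builds on. A minimal round-trip sketch (illustrative usage, not code from the patch):

```python
from core.concept import Concept

c = Concept("a plus b", pre="isinstance(a, int)", body="a + b")
print(c.get_digest())  # content-addressed sha256 hex digest

copy = Concept().from_dict(c.to_dict())
assert copy == c       # __eq__ compares name/where/pre/post/body
```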
diff --git a/core/sheerka.py b/core/sheerka.py
index fbd274f..e9fc95e 100644
--- a/core/sheerka.py
+++ b/core/sheerka.py
@@ -1,8 +1,9 @@
-import os
 from dataclasses import dataclass
 
-from core.concept import Concept
-from sdp.sheerkaDataProvider import SheerkaDataProvider
+from core.concept import Concept, ErrorConcept
+from parsers.PythonParser import PythonParser
+from sdp.sheerkaDataProvider import SheerkaDataProvider, Event
+from parsers.DefaultParser import DefaultParser, DefConceptNode
 
 
 class Singleton(type):
@@ -54,6 +55,7 @@ class Sheerka(Concept, metaclass=Singleton):
         self.create_builtin_concepts()
         self.sdp = None
+        self.parsers = []
 
     def create_builtin_concepts(self):
         """
@@ -76,11 +78,38 @@ class Sheerka(Concept, metaclass=Singleton):
 
         try:
             self.sdp = SheerkaDataProvider(root_folder)
+            self.parsers.append(lambda text: DefaultParser(text, PythonParser))
         except IOError as e:
             return ReturnValue(False, self.get_concept(Sheerka.ERROR_CONCEPT_NAME, True), e)
 
         return ReturnValue(True, self.get_concept(Sheerka.SUCCESS_CONCEPT_NAME, True))
 
+    def eval(self, text):
+        # evt_digest = self.sdp.save_event(Event(text))
+        result = self.try_parse(text)
+
+        return_values = []
+        for parser_name, status, node in result:
+            if not status:
+                return_values.append(ReturnValue(False, ErrorConcept(body=node)))
+            elif isinstance(node, DefConceptNode):
+                return_values.append(self.add_concept(node))
+
+        return return_values
+
+    def try_parse(self, text):
+        result = []
+        for parser in self.parsers:
+            p = parser(text)
+            tree = p.parse()
+            result.append((p.name, not p.has_error, p.error_sink if p.has_error else tree))
+        return result
+
     def get_concept(self, name, is_builtin=False):
         """
         Given a concept name, tries to find it
@@ -93,6 +122,22 @@ class Sheerka(Concept, metaclass=Singleton):
                 return concept
         return self.concepts[1]
 
+    def add_concept(self, def_concept_node: DefConceptNode):
+        """
+        Adds a new concept to the system
+        :param def_concept_node: DefConceptNode
+        :return: ReturnValue wrapping the new Concept
+        """
+
+        concept = Concept(def_concept_node.name)
+        for prop in ("where", "pre", "post", "body"):
+            concept_part_node = getattr(def_concept_node, prop)
+            value = concept_part_node.source if hasattr(concept_part_node, "source") else ""
+            setattr(concept, prop, value)
+
+        concept.add_codes(def_concept_node.get_codes())
+        return ReturnValue(True, concept)
+
     @staticmethod
     def concept_equals(concept1, concept2):
         """True if the two concepts refer to the same concept"""
@@ -102,9 +147,4 @@ class Sheerka(Concept, metaclass=Singleton):
         if concept1 is None or concept2 is None:
             return False
 
-        return concept1.id == concept2.id
-
-    def record_event(self, event):
-        self.sdp.save_event(event)
-
-
+        return concept1.key == concept2.key
diff --git a/docs/syntax_v1.md b/docs/syntax_v1.md
index ccd5b39..5458d53 100644
--- a/docs/syntax_v1.md
+++ b/docs/syntax_v1.md
@@ -84,7 +84,7 @@ concept is_the_opposite:
     a, b
     test:
-        a.pre == not b.pre && a.post == b.post
+        a.pre == not b.pre && a.post == not b.post
 
 print all concepts
 
diff --git a/docs/syntax_v2.md b/docs/syntax_v2.md
index 9cfeec6..b5708ec 100644
--- a/docs/syntax_v2.md
+++ b/docs/syntax_v2.md
@@ -17,4 +17,23 @@ def concept a is a number as :
 --> adds concept a is a number
 --> add the pre condition to the concept a plus b
+```
+
+# Define a new concept in one line
+```
+def concept words
+def concept words [where whereclause] [as expression] [pre precond] [post postcond]
+```
+
+# Define a complicated concept
+```
+def concept
+as:
+    ...
+where:
+    ...
+pre:
+    ...
+post:
+    ...
 ```
\ No newline at end of file
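The one-line form documented above maps directly onto the new `DefaultParser`/`PythonParser` pair introduced later in this patch; a quick sketch of what it produces (illustrative usage, not code from the patch):

```python
from parsers.DefaultParser import DefaultParser, DefConceptNode
from parsers.PythonParser import PythonParser

parser = DefaultParser("def concept double x as x * 2", PythonParser)
tree = parser.parse()

assert isinstance(tree, DefConceptNode)
print(tree.name)         # "double x"
print(tree.body.source)  # "x * 2", parsed to a Python AST by the sub parser
```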
diff --git a/docs/syntax_v3.md b/docs/syntax_v3.md
new file mode 100644
index 0000000..7d62626
--- /dev/null
+++ b/docs/syntax_v3.md
@@ -0,0 +1,17 @@
+```
+> "hello
+-> unfinished quote "
+> def concept unfinished quote q
+... where:
+......    q in ("'", '"')
+... desc:
+......    "Error detected by the default parser when the trailing quote is missing"
+...    input = sheerka.last_input
+
+
+> when unfinished quote q as c:
+...    add rule as:
+......    if q in sheerka.input:
+.........    sheerka.resume(c, c.input + input)
+.........    remove rule
+```
\ No newline at end of file
diff --git a/main.py b/main.py
index 18322cf..fd9127a 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
 import sys
 from core.utils import sysarg_to_string
 from core.sheerka import Sheerka
-from sdp.sheerkaDataProvider import Event
+
 
 def main():
@@ -10,12 +10,10 @@ def main():
 
     # first, record the event
     event_as_string = sysarg_to_string(sys.argv[1:])
-    evt_digest = sheerka.record_event(Event(event_as_string))
-
-    # launch the parsers
+    result = sheerka.eval(event_as_string)
 
     # execute the concepts
-    print(event_as_string)
+    print(result)
 
     return True
 
diff --git a/parsers/BaseParser.py b/parsers/BaseParser.py
new file mode 100644
index 0000000..ca00c5c
--- /dev/null
+++ b/parsers/BaseParser.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+from parsers.tokenizer import TokenKind, Keywords
+
+
+@dataclass()
+class Node:
+    pass
+
+
+@dataclass()
+class NopNode(Node):
+    def __repr__(self):
+        return "nop"
+
+
+@dataclass()
+class ErrorNode(Node):
+    pass
+
+
+class BaseParser:
+    def __init__(self, name, text):
+        self.name = name
+        self.text = text
+        self.has_error = False
+        self.error_sink = []
+
+    def parse(self):
+        pass
+
+    @staticmethod
+    def get_text_from_tokens(tokens):
+        if tokens is None:
+            return ""
+        res = ""
+        for token in tokens:
+            value = Keywords(token.value).value if token.type == TokenKind.KEYWORD else token.value
+            res += value
+        return res
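`BaseParser` defines the contract `Sheerka.try_parse` relies on: a `name`, a `parse()` method, and the `has_error`/`error_sink` pair. A hypothetical third-party parser only needs to honor that contract (the `EchoParser` name is made up for illustration):

```python
from parsers.BaseParser import BaseParser, NopNode


class EchoParser(BaseParser):
    """Hypothetical parser that accepts any input and reports no errors."""

    def __init__(self, text):
        BaseParser.__init__(self, "Echo", text)

    def parse(self):
        return NopNode()


# Registration mirrors how initialize() wires up the DefaultParser:
#     sheerka.parsers.append(lambda text: EchoParser(text))
```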
diff --git a/parsers/DefaultParser.py b/parsers/DefaultParser.py
new file mode 100644
index 0000000..d9a4cbb
--- /dev/null
+++ b/parsers/DefaultParser.py
@@ -0,0 +1,383 @@
+from parsers.BaseParser import BaseParser, Node, NopNode, ErrorNode
+from parsers.tokenizer import Tokenizer, TokenKind, Token, Keywords
+from dataclasses import dataclass, field
+
+
+@dataclass()
+class DefaultParserNode(Node):
+    tokens: list = field(compare=False)
+
+    def is_same(self, other):
+        if type(self) != type(other):
+            return False
+
+        if hasattr(self, "value") and self.value != other.value:
+            return False
+
+        return True
+
+
+@dataclass()
+class DefaultParserErrorNode(DefaultParserNode, ErrorNode):
+    pass
+
+
+@dataclass()
+class UnexpectedTokenErrorNode(DefaultParserErrorNode):
+    message: str
+    expected_tokens: list
+
+
+@dataclass()
+class SyntaxErrorNode(DefaultParserErrorNode):
+    message: str
+
+
+@dataclass()
+class DefConceptNode(DefaultParserNode):
+    name: str
+    where: Node = None
+    pre: Node = None
+    post: Node = None
+    body: Node = field(default_factory=NopNode)
+
+    def get_codes(self):
+        codes = {}
+        for prop in ["where", "pre", "post", "body"]:
+            prop_value = getattr(self, prop)
+            if hasattr(prop_value, "ast"):
+                codes[prop] = prop_value.ast
+        return codes
+
+
+@dataclass()
+class NumberNode(DefaultParserNode):
+    value: object
+
+    def __repr__(self):
+        return str(self.value)
+
+
+@dataclass()
+class StringNode(DefaultParserNode):
+    value: str
+    quote: str
+
+    def is_same(self, other):
+        if not super(StringNode, self).is_same(other):
+            return False
+        return self.quote == other.quote
+
+    def __repr__(self):
+        return self.quote + self.value + self.quote
+
+
+@dataclass()
+class VariableNode(DefaultParserNode):
+    value: str
+
+    def __repr__(self):
+        return self.value
+
+
+@dataclass()
+class TrueNode(DefaultParserNode):
+    def __repr__(self):
+        return "true"
+
+
+@dataclass()
+class FalseNode(DefaultParserNode):
+    def __repr__(self):
+        return "false"
+
+
+@dataclass()
+class NullNode(DefaultParserNode):
+    def __repr__(self):
+        return "null"
+
+
+@dataclass()
+class BinaryNode(DefaultParserNode):
+    operator: TokenKind
+    left: Node
+    right: Node
+
+    def is_same(self, other):
+        if not super(BinaryNode, self).is_same(other):
+            return False
+        if self.operator != other.operator:
+            return False
+        if not self.left.is_same(other.left):
+            return False
+        return self.right.is_same(other.right)
+
+    def __repr__(self):
+        return f"({self.left} {self.operator} {self.right})"
+
+
+class DefaultParser(BaseParser):
+    def __init__(self, text, sub_parser):
+        BaseParser.__init__(self, "Default", text)
+        self.sub_parser = sub_parser
+        self.lexer = Tokenizer(text)
+        self.lexer_iter = iter(self.lexer)
+        self._current = None
+
+        self.next_token()
+
+    def collect_tokens(self, *args):
+        result = []
+        for item in args:
+            if isinstance(item, Node):
+                result.extend(item.tokens)
+            else:
+                result.append(item)
+        return result
+
+    def add_error(self, error, next_token=True):
+        self.has_error = True
+        self.error_sink.append(error)
+        if next_token:
+            self.next_token()
+        return error
+
+    def get_token(self) -> Token:
+        return self._current
+
+    def next_token(self, skip_whitespace=True):
+        try:
+            self._current = next(self.lexer_iter)
+            if skip_whitespace:
+                while self._current.type == TokenKind.WHITESPACE or self._current.type == TokenKind.NEWLINE:
+                    self._current = next(self.lexer_iter)
+        except StopIteration:
+            self._current = None
+            return
+
+    @staticmethod
+    def get_concept_name(tokens):
+        name = ""
+        first = True
+        for token in tokens:
+            if token.type == TokenKind.EOF:
+                break
+            if not first:
+                name += " "
+            name += token.value[1:-1] if token.type == TokenKind.STRING else token.value
+            first = False
+
+        return name
+
+    @staticmethod
+    def fix_indentation(tokens):
+        """
+        In the following example
+            def concept add one to a as:
+                def func(x):
+                    return x+1
+                func(a)
+        the indentation in front of 'def func(x)', 'return x+1' and 'func(a)' must be
+        reduced to avoid a Python syntax error
+        :param tokens:
+        :return:
+        """
+        if tokens[1].type != TokenKind.COLON:
+            return tokens[1:]
+
+        if len(tokens) < 4:
+            return UnexpectedTokenErrorNode(tokens[0:2], "Unexpected end of file", [TokenKind.NEWLINE])
+
+        if tokens[2].type != TokenKind.NEWLINE:
+            return UnexpectedTokenErrorNode([tokens[2]], "Unexpected token after colon", [TokenKind.NEWLINE])
+
+        if tokens[3].type != TokenKind.WHITESPACE:
+            return SyntaxErrorNode([tokens[3]], "Indentation not found")
+        indent_size = len(tokens[3].value)
+
+        # now fix the other indentations
+        i = 4
+        while i < len(tokens) - 1:
+            if tokens[i].type == TokenKind.NEWLINE:
+                if tokens[i + 1].type != TokenKind.WHITESPACE:
+                    return UnexpectedTokenErrorNode([tokens[i + 1]], "Unexpected token", [TokenKind.WHITESPACE])
+
+                if len(tokens[i + 1].value) < indent_size:
+                    return SyntaxErrorNode([tokens[i + 1]], "Invalid indentation.")
+
+                tokens[i + 1].value = " " * (len(tokens[i + 1].value) - indent_size)
+            i += 1
+
+        return tokens[4:]
+
+    def parse(self):
+        return self.parse_statement()
+
+    def parse_statement(self):
+        token = self.get_token()
+        if token.value == Keywords.DEF:
+            self.next_token()
+            return self.parse_def_concept()
+        else:
+            return self.parse_expression()
+    def parse_def_concept(self):
+        """
+        def concept name [where xxx] [pre xxx] [post xxx] [as xxx]
+        """
+
+        def_concept_parts = [Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST]
+
+        token = self.get_token()
+        if token.value != Keywords.CONCEPT:
+            return self.add_error(UnexpectedTokenErrorNode([token], "Syntax error.", [Keywords.CONCEPT]))
+
+        self.next_token()
+        token = self.get_token()
+
+        if token.value in (Keywords.AS, Keywords.WHERE, Keywords.PRE, Keywords.POST):
+            return self.add_error(UnexpectedTokenErrorNode([token], "Concept name is missing.", [""]))
+
+        name_as_tokens = []
+        while token.type != TokenKind.EOF and token.value not in def_concept_parts:
+            name_as_tokens.append(token)
+            self.next_token()
+            token = self.get_token()
+        name = self.get_concept_name(name_as_tokens)
+
+        # try to parse as, where, pre and post declarations
+        tokens = {
+            Keywords.AS: None,
+            Keywords.WHERE: None,
+            Keywords.PRE: None,
+            Keywords.POST: None,
+        }
+        current_part = None
+        while token.type != TokenKind.EOF:
+            if token.value in def_concept_parts:
+                keyword = token.value
+                if tokens[keyword]:
+                    return self.add_error(SyntaxErrorNode([token], f"Too many '{keyword.value}' declarations."))
+                tokens[keyword] = [token]  # the first element of the list is the keyword itself
+                current_part = keyword
+                self.next_token()
+            else:
+                if current_part is None:
+                    return self.add_error(UnexpectedTokenErrorNode([token], "Unexpected token", def_concept_parts))
+                else:
+                    tokens[current_part].append(token)
+                    self.next_token(False)
+
+            token = self.get_token()
+
+        asts = {
+            Keywords.AS: NopNode(),
+            Keywords.WHERE: NopNode(),
+            Keywords.PRE: NopNode(),
+            Keywords.POST: NopNode(),
+        }
+
+        # check for empty declarations
+        for keyword in tokens:
+            current_tokens = tokens[keyword]
+            if current_tokens is not None:
+                if len(current_tokens) == 1:  # only the keyword itself means an empty declaration
+                    return self.add_error(SyntaxErrorNode([current_tokens[0]], "Empty declaration"), False)
+                else:
+                    current_tokens = self.fix_indentation(current_tokens)
+                    if isinstance(current_tokens, ErrorNode):
+                        self.add_error(current_tokens)
+                        continue
+
+                    sub_parser = self.sub_parser(current_tokens, source=keyword.value)
+                    sub_tree = sub_parser.parse()
+                    if isinstance(sub_tree, ErrorNode):
+                        self.add_error(sub_tree, False)
+                    asts[keyword] = sub_tree
+
+        return DefConceptNode([], name,
+                              asts[Keywords.WHERE],
+                              asts[Keywords.PRE],
+                              asts[Keywords.POST],
+                              asts[Keywords.AS])
+
+    def parse_expression(self):
+        return self.parse_addition()
+
+    def parse_addition(self):
+        left = self.parse_multiply()
+        token = self.get_token()
+        if token is None or token.type == TokenKind.EOF:
+            return left
+
+        if token.type == TokenKind.NUMBER:  # example: 15 +5 or 15 -5, where the sign was lexed into the number
+            right = self.parse_addition()
+            return BinaryNode(self.collect_tokens(left, token, right), TokenKind.PLUS, left, right)
+
+        if token.type not in (TokenKind.PLUS, TokenKind.MINUS):
+            return left
+
+        self.next_token()
+        right = self.parse_addition()
+        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
+
+    def parse_multiply(self):
+        left = self.parse_atom()
+        token = self.get_token()
+        if token is None or token.type == TokenKind.EOF:
+            return left
+
+        if token.type not in (TokenKind.STAR, TokenKind.SLASH):
+            return left
+
+        self.next_token()
+        right = self.parse_multiply()
+        return BinaryNode(self.collect_tokens(left, token, right), token.type, left, right)
+
+    def parse_atom(self):
+        token = self.get_token()
+        if token.type == TokenKind.NUMBER:
+            self.next_token()
+            return NumberNode([token], float(token.value) if '.' 
in token.value else int(token.value)) + elif token.type == TokenKind.STRING: + self.next_token() + return StringNode([token], token.value[1:-1], token.value[0]) + elif token.type == TokenKind.IDENTIFIER: + if token.value == "true": + self.next_token() + return TrueNode([token]) + elif token.value == "false": + self.next_token() + return FalseNode([token]) + elif token.value == "null": + self.next_token() + return NullNode([token]) + else: + self.next_token() + return VariableNode([token], token.value) + elif token.type == TokenKind.LPAR: + self.next_token() + exp = self.parse_expression() + token = self.get_token() + self.next_token() + + if token.type != TokenKind.RPAR: + error = UnexpectedTokenErrorNode([token], "Right parenthesis not found.", [TokenKind.RPAR]) + self.add_error(error) + return error + + return exp + else: + error = UnexpectedTokenErrorNode([token], "Unexpected token", + [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER, "true", "false", + "null", TokenKind.LPAR]) + return self.add_error(error) diff --git a/parsers/PythonParser.py b/parsers/PythonParser.py new file mode 100644 index 0000000..2dba232 --- /dev/null +++ b/parsers/PythonParser.py @@ -0,0 +1,75 @@ +from parsers.BaseParser import BaseParser, Node, ErrorNode +from dataclasses import dataclass +import ast +import copy + + +@dataclass() +class PythonErrorNode(ErrorNode): + source: str + exception: Exception + + +@dataclass() +class PythonNode(Node): + source: str + ast: ast.AST + + def __repr__(self): + return "PythonNode(" + ast.dump(self.ast) + ")" + #return "PythonNode(" + self.source + ")" + + +class PythonParser(BaseParser): + def __init__(self, text, source=""): + text = text if isinstance(text, str) else self.get_text_from_tokens(text) + text = text.strip() + BaseParser.__init__(self, "PythonParser", text) + self.source = source + + def parse(self): + # first, try to parse an expression + res, tree, error = self.try_parse_expression() + if not res: + # then try to parse a statement + res, tree, error = self.try_parse_statement() + if not res: + self.has_error = True + error_node = PythonErrorNode(self.text, error) + self.error_sink.append(error_node) + return error_node + return PythonNode(self.text, tree) + + def try_parse_expression(self): + try: + return True, ast.parse(self.text, f"<{self.source}>", 'eval'), None + except Exception as error: + return False, None, error + + def try_parse_statement(self): + try: + return True, ast.parse(self.text, f"<{self.source}>", 'exec'), None + except Exception as error: + return False, None, error + + def expr_to_expression(self, expr): + expr.lineno = 0 + expr.col_offset = 0 + result = ast.Expression(expr.value, lineno=0, col_offset=0) + + return result + + def exec_with_return(self, code): + code_ast = ast.parse(code) + + init_ast = copy.deepcopy(code_ast) + init_ast.body = code_ast.body[:-1] + + last_ast = copy.deepcopy(code_ast) + last_ast.body = code_ast.body[-1:] + + exec(compile(init_ast, "", "exec"), globals()) + if type(last_ast.body[0]) == ast.Expr: + return eval(compile(self.expr_to_expression(last_ast.body[0]), "", "eval"), globals()) + else: + exec(compile(last_ast, "", "exec"), globals()) diff --git a/parsers/defaultparser.py b/parsers/defaultparser.py deleted file mode 100644 index b0d4a2e..0000000 --- a/parsers/defaultparser.py +++ /dev/null @@ -1,249 +0,0 @@ -from dataclasses import dataclass - - -@dataclass(frozen=True) -class Token: - type: str - value: str - index: int - line: int - column: int - - -@dataclass(frozen=True) -class 
LexerError(Exception): - message: str - text: str - index: int - line: int - column: int - - -class Tokens: - EOF = "eof" - WHITESPACE = "whitespace" - NEWLINE = "newline" - KEYWORD = "keyword" - IDENTIFIER = "identifier" - STRING = "string" - NUMBER = "number" - TRUE = "true" - FALSE = "false" - LPAR = "lpar" - RPAR = "rpar" - LBRACKET = "lbrace" - RBRACKET = "rbracket" - LBRACE = "lbrace" - RBRACE = "rbrace" - PLUS = "plus" - MINUS = "minus" - STAR = "star" - SLASH = "slash" - PERCENT = "percent" - COMMA = "comma" - SEMICOLON = "semicolon" - COLON = "colon" - DOT = "dot" - QMARK = "qmark" - VBAR = "vbar" - AMPER = "amper" - - -class TokenIter: - KEYWORDS = ("def", "concept", "as", "pre", "post") - - """ - Class that can iterate on the tokens - """ - - def __init__(self, text): - self.text = text - self.text_len = len(text) - - def __iter__(self): - - i = 0 - line = 1 - column = 1 - while i < self.text_len: - c = self.text[i] - if c == "+": - yield Token(Tokens.PLUS, "+", i, line, column) - i += 1 - column += 1 - elif c == "-": - if i + 1 < self.text_len and self.text[i + 1].isdigit(): - number = self.eat_number(i) - yield Token(Tokens.NUMBER, number, i, line, column) - i += len(number) - column += len(number) - else: - yield Token(Tokens.MINUS, "-", i, line, column) - i += 1 - column += 1 - elif c == "/": - yield Token(Tokens.SLASH, "/", i, line, column) - i += 1 - column += 1 - elif c == "*": - yield Token(Tokens.STAR, "*", i, line, column) - i += 1 - column += 1 - elif c == "{": - yield Token(Tokens.LBRACE, "{", i, line, column) - i += 1 - column += 1 - elif c == "}": - yield Token(Tokens.RBRACE, "}", i, line, column) - i += 1 - column += 1 - elif c == "(": - yield Token(Tokens.LPAR, "(", i, line, column) - i += 1 - column += 1 - elif c == ")": - yield Token(Tokens.RPAR, ")", i, line, column) - i += 1 - column += 1 - elif c == "[": - yield Token(Tokens.LBRACKET, "[", i, line, column) - i += 1 - column += 1 - elif c == "]": - yield Token(Tokens.RBRACKET, "]", i, line, column) - i += 1 - column += 1 - elif c == " " or c == "\t": - whitespace = self.eat_whitespace(i) - yield Token(Tokens.WHITESPACE, whitespace, i, line, column) - i += len(whitespace) - column += len(whitespace) - elif c == ",": - yield Token(Tokens.COMMA, ",", i, line, column) - i += 1 - column += 1 - elif c == ".": - yield Token(Tokens.DOT, ".", i, line, column) - i += 1 - column += 1 - elif c == ";": - yield Token(Tokens.SEMICOLON, ";", i, line, column) - i += 1 - column += 1 - elif c == ":": - yield Token(Tokens.COLON, ":", i, line, column) - i += 1 - column += 1 - elif c == "?": - yield Token(Tokens.QMARK, "?", i, line, column) - i += 1 - column += 1 - elif c == "\n" or c == "\r": - newline = self.eat_newline(i) - yield Token(Tokens.NEWLINE, newline, i, line, column) - i += len(newline) - column = 1 - line += 1 - elif c.isalpha() or c == "_": - identifier = self.eat_identifier(i) - type = Tokens.KEYWORD if identifier in self.KEYWORDS else Tokens.IDENTIFIER - yield Token(type, identifier, i, line, column) - i += len(identifier) - column += len(identifier) - elif c.isdigit(): - number = self.eat_number(i) - yield Token(Tokens.NUMBER, number, i, line, column) - i += len(number) - column += len(number) - elif c == "'" or c == '"': - string, newlines = self.eat_string(i) - yield Token(Tokens.STRING, string, i, line, column) - i += len(string) - column = 1 if newlines > 0 else column + len(string) - line += newlines - else: - raise LexerError(f"Unknown token '{c}'", self.text, i, line, column) - - yield Token(Tokens.EOF, 
"", i, line, column) - - def eat_whitespace(self, start): - result = self.text[start] - i = start + 1 - while i < self.text_len: - c = self.text[i] - if c == " " or c == "\t": - result += c - i += 1 - else: - break - - return result - - def eat_newline(self, start): - if start + 1 == self.text_len: - return self.text[start] - - current = self.text[start] - next = self.text[start + 1] - if current == "\n" and next == "\r" or current == "\r" and next == "\n": - return current + next - - return current - - def eat_identifier(self, start): - result = self.text[start] - i = start + 1 - while i < self.text_len: - c = self.text[i] - if c.isalpha() or c == "_" or c == "-" or c.isdigit(): - result += c - i += 1 - else: - break - - return result - - def eat_number(self, start): - result = self.text[start] - i = start + 1 - while i < self.text_len: - c = self.text[i] - if c.isdigit() or c == ".": - result += c - i += 1 - else: - break - - return result - - def eat_string(self, start): - quote = self.text[start] - result = self.text[start] - lines_count = 0 - - i = start + 1 - escape = False - newline = None - while i < self.text_len: - c = self.text[i] - result += c - i += 1 - - if newline: - lines_count += 1 - newline = c if c == newline else None - else: - if c == "\r" or c == "\n": - newline = c - - if c == "\\": - escape = True - elif c == quote and not escape: - break - else: - escape = False - - if newline: - lines_count += 1 - - return result, lines_count diff --git a/parsers/tokenizer.py b/parsers/tokenizer.py new file mode 100644 index 0000000..dc8b5a4 --- /dev/null +++ b/parsers/tokenizer.py @@ -0,0 +1,297 @@ +from dataclasses import dataclass +from enum import Enum + + +class TokenKind(Enum): + EOF = "eof" + WHITESPACE = "whitespace" + NEWLINE = "newline" + KEYWORD = "keyword" + IDENTIFIER = "identifier" + STRING = "string" + NUMBER = "number" + TRUE = "true" + FALSE = "false" + LPAR = "lpar" + RPAR = "rpar" + LBRACKET = "lbrace" + RBRACKET = "rbracket" + LBRACE = "lbrace" + RBRACE = "rbrace" + PLUS = "plus" + MINUS = "minus" + STAR = "star" + SLASH = "slash" + PERCENT = "percent" + COMMA = "comma" + SEMICOLON = "semicolon" + COLON = "colon" + DOT = "dot" + QMARK = "qmark" + VBAR = "vbar" + AMPER = "amper" + EQUALS = "=" + + +@dataclass() +class Token: + type: TokenKind + value: object + index: int + line: int + column: int + + +@dataclass() +class LexerError(Exception): + message: str + text: str + index: int + line: int + column: int + + +class Keywords(Enum): + DEF = "def" + CONCEPT = "concept" + AS = "as" + WHERE = "where" + PRE = "pre" + POST = "post" + + +class Tokenizer: + """ + Class that can iterate on the tokens + """ + + KEYWORDS = set(x.value for x in Keywords) + + def __init__(self, text): + self.text = text + self.text_len = len(text) + self.column = 1 + self.line = 1 + self.i = 0 + + def __iter__(self): + + while self.i < self.text_len: + c = self.text[self.i] + if c == "+": + if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit(): + number = self.eat_number(self.i) + yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column) + self.i += len(number) + self.column += len(number) + else: + yield Token(TokenKind.PLUS, "+", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "-": + if self.i + 1 < self.text_len and self.text[self.i + 1].isdigit(): + number = self.eat_number(self.i) + yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column) + self.i += len(number) + self.column += len(number) + else: + yield 
Token(TokenKind.MINUS, "-", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "/": + yield Token(TokenKind.SLASH, "/", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "*": + yield Token(TokenKind.STAR, "*", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "{": + yield Token(TokenKind.LBRACE, "{", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "}": + yield Token(TokenKind.RBRACE, "}", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "(": + yield Token(TokenKind.LPAR, "(", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == ")": + yield Token(TokenKind.RPAR, ")", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "[": + yield Token(TokenKind.LBRACKET, "[", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "]": + yield Token(TokenKind.RBRACKET, "]", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "=": + yield Token(TokenKind.EQUALS, "=", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == " " or c == "\t": + whitespace = self.eat_whitespace(self.i) + yield Token(TokenKind.WHITESPACE, whitespace, self.i, self.line, self.column) + self.i += len(whitespace) + self.column += len(whitespace) + elif c == ",": + yield Token(TokenKind.COMMA, ",", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == ".": + yield Token(TokenKind.DOT, ".", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == ";": + yield Token(TokenKind.SEMICOLON, ";", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == ":": + yield Token(TokenKind.COLON, ":", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "?": + yield Token(TokenKind.QMARK, "?", self.i, self.line, self.column) + self.i += 1 + self.column += 1 + elif c == "\n" or c == "\r": + newline = self.eat_newline(self.i) + yield Token(TokenKind.NEWLINE, newline, self.i, self.line, self.column) + self.i += len(newline) + self.column = 1 + self.line += 1 + elif c.isalpha() or c == "_": + identifier = self.eat_identifier(self.i) + token_type = TokenKind.KEYWORD if identifier in self.KEYWORDS else TokenKind.IDENTIFIER + value = Keywords(identifier) if identifier in self.KEYWORDS else identifier + yield Token(token_type, value, self.i, self.line, self.column) + self.i += len(identifier) + self.column += len(identifier) + elif c.isdigit(): + number = self.eat_number(self.i) + yield Token(TokenKind.NUMBER, number, self.i, self.line, self.column) + self.i += len(number) + self.column += len(number) + elif c == "'" or c == '"': + string, newlines = self.eat_string(self.i, self.line, self.column) + yield Token(TokenKind.STRING, string, self.i, self.line, self.column) + self.i += len(string) + self.column = 1 if newlines > 0 else self.column + len(string) + self.line += newlines + else: + raise LexerError(f"Unknown token '{c}'", self.text, self.i, self.line, self.column) + + yield Token(TokenKind.EOF, "", self.i, self.line, self.column) + + def eat_whitespace(self, start): + result = self.text[start] + i = start + 1 + while i < self.text_len: + c = self.text[i] + if c == " " or c == "\t": + result += c + i += 1 + else: + break + + return result + + def eat_newline(self, start): + if start + 1 == self.text_len: + return self.text[start] + + current = self.text[start] + next = self.text[start + 1] + if current == "\n" and 
next == "\r" or current == "\r" and next == "\n": + return current + next + + return current + + def eat_identifier(self, start): + result = self.text[start] + i = start + 1 + while i < self.text_len: + c = self.text[i] + if c.isalpha() or c == "_" or c == "-" or c.isdigit(): + result += c + i += 1 + else: + break + + return result + + def eat_number(self, start): + result = self.text[start] + i = start + 1 + while i < self.text_len: + c = self.text[i] + if c.isdigit() or c == ".": + result += c + i += 1 + else: + break + + return result + + def eat_string(self, start_index, start_line, start_column): + quote = self.text[start_index] + result = self.text[start_index] + lines_count = 0 + + i = start_index + 1 + escape = False + newline = None + while i < self.text_len: + c = self.text[i] + result += c + i += 1 + + if newline: + lines_count += 1 + newline = c if c == newline else None + else: + if c == "\r" or c == "\n": + newline = c + + if c == "\\": + escape = True + elif c == quote and not escape: + break + else: + escape = False + + # add trailing new line if needed + if newline: + lines_count += 1 + + if result[-1] != quote: + raise LexerError("Missing Trailing quote", result, i, start_line + lines_count, + 1 if lines_count > 0 else start_column + len(result)) + + return result, lines_count + + def seek(self, words): + if self.i == self.text_len: + return 0 + + # init + offsets = {} + start_index = self.i + + buffer = "" + while self.i < self.text_len: + c = self.text[self.i] + + # skip white space + if c in (" ", "\t"): + self.i += 1 + continue + + for word in words: + if c == word[offset]: + os diff --git a/sdp/readme.md b/sdp/readme.md index 3469897..c7190c5 100644 --- a/sdp/readme.md +++ b/sdp/readme.md @@ -1,5 +1,39 @@ # How to serialize ? +## General rule - 1 byte : type of object code - int : version of the encoder -- data : can be the json representation of the object +- data : can be the json representation of the object + +### Current supported types +- E : events +- O : object (with history management) +- P : pickle + +## How concepts are serialized ? +- get the id of the concept +- get the hash of the concept −> it will be its unique key +structure of the serialisation: +```json +{ + "id" : "id", + "parent": or "", + "name": , + "where": "", + "pre": "", + "post": "", + "body": "", + "desc": "", + ... +} +``` + +## Idea to manage ObjectSerializer +Problem: +During serialization, there is no issue. The match() method is the unique way to get the correct serialier. +During the deserialisation, all Object serializer have type = '0' and version = 1. +So how to choose the correct one ? + A possible solution will be to add the type of the object to deserialize to the saved stream + --> SHA256 for every object. Too much data saved. 
diff --git a/sdp/sheerkaDataProvider.py b/sdp/sheerkaDataProvider.py
index 40d03cf..be47436 100644
--- a/sdp/sheerkaDataProvider.py
+++ b/sdp/sheerkaDataProvider.py
@@ -4,7 +4,7 @@ from datetime import datetime, date
 import hashlib
 import json
 import zlib
-from sdp.sheerkaSerializer import Serializer
+from sdp.sheerkaSerializer import Serializer, SerializerContext
 
 
 def json_default_converter(o):
@@ -38,15 +38,15 @@ class Event(object):
         if not isinstance(self.message, str):
             raise NotImplementedError
 
-        return hashlib.sha256(f"{self.user}{self.date}{self.message}".encode("utf-8")).hexdigest()
+        return hashlib.sha256(f"Event:{self.user}{self.date}{self.message}".encode("utf-8")).hexdigest()
 
-    def to_json(self):
-        return json.dumps(self.__dict__, default=json_default_converter)
+    def to_dict(self):
+        return self.__dict__
 
-    def from_json(self, json_message):
-        self.user = json_message["user"]
-        self.date = datetime.fromisoformat(json_message["date"])
-        self.message = json_message["message"]
+    def from_dict(self, as_dict):
+        self.user = as_dict["user"]
+        self.date = datetime.fromisoformat(as_dict["date"])
+        self.message = as_dict["message"]
 
 
 class State:
@@ -120,6 +120,7 @@ class SheerkaDataProvider:
 
     EventFolder = "events"
     StateFolder = "state"
+    ObjectsFolder = "objects"
     CacheFolder = "cache"
     HeadFile = "HEAD"
     KeysFile = "keys"
@@ -135,6 +136,9 @@ class SheerkaDataProvider:
 
         self.serializer = Serializer()
 
+    def get_obj_path(self, object_type, digest):
+        return path.join(self.root, object_type, digest[:24], digest)
+
     def add(self, event: Event, entry, obj):
         """
         Adds obj to the entry 'entry'
@@ -366,7 +370,7 @@ class SheerkaDataProvider:
             os.makedirs(path.dirname(target_path))
 
         with open(target_path, "wb") as f:
-            f.write(self.serializer.serialize(event).read())
+            f.write(self.serializer.serialize(event, None).read())
 
         return digest
 
@@ -378,7 +382,7 @@ class SheerkaDataProvider:
         """
         target_path = path.join(self.root, SheerkaDataProvider.EventFolder, digest[:24], digest)
         with open(target_path, "rb") as f:
-            return self.serializer.deserialize(f)
+            return self.serializer.deserialize(f, None)
 
     def save_state(self, state: State):
         digest = state.get_digest()
@@ -390,7 +394,7 @@ class SheerkaDataProvider:
             os.makedirs(path.dirname(target_path))
 
         with open(target_path, "wb") as f:
-            f.write(self.serializer.serialize(state).read())
+            f.write(self.serializer.serialize(state, None).read())
 
         return digest
 
@@ -400,7 +404,32 @@ class SheerkaDataProvider:
 
         target_path = path.join(self.root, SheerkaDataProvider.StateFolder, digest[:24], digest)
         with open(target_path, "rb") as f:
-            return self.serializer.deserialize(f)
+            return self.serializer.deserialize(f, None)
+
+    def save_obj(self, obj):
+        if hasattr(obj, "key") and hasattr(obj, "key_name") and obj.key is None:
+            obj.key = self.get_next_key(obj.key_name)
+
+        digest = obj.get_digest()
+        target_path = path.join(self.root, SheerkaDataProvider.ObjectsFolder, digest[:24], digest)
+        if path.exists(target_path):
+            return digest
+
+        if not path.exists(path.dirname(target_path)):
+            os.makedirs(path.dirname(target_path))
+
+        with open(target_path, "wb") as f:
+            f.write(self.serializer.serialize(obj, SerializerContext("kodjo", digest)).read())
+
+        return digest
+
+    def load_obj(self, digest):
+        if digest is None:
+            return State()
+
+        target_path = path.join(self.root, SheerkaDataProvider.ObjectsFolder, digest[:24], digest)
+        with open(target_path, "rb") as f:
+            return self.serializer.deserialize(f, SerializerContext("kodjo", digest))
 
     def get_cache_params(self, category, key):
         digest = hashlib.sha3_256(f"{category}:{key}".encode("utf-8")).hexdigest()
@@ -507,4 +536,3 @@ class SheerkaDataProvider:
             keys[entry] = value
         self.save_keys(keys)
         return str(value)
-
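Usage of the new object store is then a two-liner; a sketch assuming the `ConceptSerializer` registered by default in the serializer patch below:

```python
from sdp.sheerkaDataProvider import SheerkaDataProvider
from core.concept import Concept

sdp = SheerkaDataProvider(".sheerka")
concept = Concept("a plus b", body="a + b")

digest = sdp.save_obj(concept)   # stored under objects/<digest[:24]>/<digest>
loaded = sdp.load_obj(digest)
assert loaded.get_digest() == digest
```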
"rb") as f: + return self.serializer.deserialize(f, SerializerContext("kodjo", digest)) def get_cache_params(self, category, key): digest = hashlib.sha3_256(f"{category}:{key}".encode("utf-8")).hexdigest() @@ -507,4 +536,3 @@ class SheerkaDataProvider: keys[entry] = value self.save_keys(keys) return str(value) - diff --git a/sdp/sheerkaSerializer.py b/sdp/sheerkaSerializer.py index 34dc8e3..8688201 100644 --- a/sdp/sheerkaSerializer.py +++ b/sdp/sheerkaSerializer.py @@ -3,6 +3,7 @@ import pickle import datetime import struct import io +from dataclasses import dataclass def json_default_converter(o): @@ -17,15 +18,27 @@ def json_default_converter(o): return o.isoformat() +@dataclass() +class SerializerContext: + user_name: str + origin: str + + class Serializer: HEADER_FORMAT = "cH" + USERNAME = "user_name" # key to store user that as committed the snapshot + MODIFICATION_DATE = "modification_date" # + PARENTS = "parents" + ORIGIN = "origin" + HISTORY = "##history##" def __init__(self): self._cache = [] # add builtin serializers - self._cache.append(EventSerializer()) - self._cache.append(PickleSerializer()) + self.register(EventSerializer()) + self.register(PickleSerializer()) + self.register(ConceptSerializer()) def register(self, serializer): """ @@ -35,9 +48,10 @@ class Serializer: """ self._cache.append(serializer) - def serialize(self, obj): + def serialize(self, obj, context): """ Get the stream representation of an object + :param context: :param obj: :return: """ @@ -52,11 +66,12 @@ class Serializer: header = struct.pack(Serializer.HEADER_FORMAT, bytes(serializer.name, "utf-8"), serializer.version) stream.write(header) - return serializer.dump(stream, obj) + return serializer.dump(stream, obj, context) - def deserialize(self, stream): + def deserialize(self, stream, context): """ Loads an object from its stream representation + :param context: :param stream: :return: """ @@ -67,7 +82,7 @@ class Serializer: raise TypeError(f"Don't know how serializer name={header[0]}, version={header[1]}") serializer = serializers[0] - return serializer.load(stream) + return serializer.load(stream, context) class BaseSerializer: @@ -82,8 +97,7 @@ class BaseSerializer: self.name = name self.version = version - @staticmethod - def match(obj): + def match(self, obj): """ Returns true if self can serialize obj :param obj: @@ -91,26 +105,32 @@ class BaseSerializer: """ pass - def dump(self, stream, obj): + def dump(self, stream, obj, context): """ Returns the byte representation of how the object should be serialized - :param stream: to write to - :param obj: + :param obj: obj to serialize + :param context: additional info needed to dump :return: stream of bytes """ pass - def load(self, stream): + def load(self, stream, context): """ From a stream of bytes, create the object :param stream: + :param context: additional info needed to load :return: object """ pass @staticmethod def get_class(kls): + """ + Loads a class from its string full qualified name + :param kls: + :return: + """ parts = kls.split('.') module = ".".join(parts[:-1]) m = __import__(module) @@ -120,6 +140,11 @@ class BaseSerializer: @staticmethod def get_full_qualified_name(obj): + """ + Returns the full qualified name of a class (including its module name ) + :param obj: + :return: + """ module = obj.__class__.__module__ if module is None or module == str.__class__.__module__: return obj.__class__.__name__ # Avoid reporting __builtin__ @@ -128,40 +153,73 @@ class BaseSerializer: class EventSerializer(BaseSerializer): - @staticmethod - 
diff --git a/tests/test_defautparser.py b/tests/test_defautparser.py
index ee43553..5407963 100644
--- a/tests/test_defautparser.py
+++ b/tests/test_defautparser.py
@@ -1,39 +1,78 @@
 import pytest
-from parsers.defaultparser import TokenIter, Token, Tokens
+
+from parsers.PythonParser import PythonParser, PythonNode, PythonErrorNode
+from parsers.tokenizer import Tokenizer, Token, TokenKind, Keywords, LexerError
+from parsers.DefaultParser import DefaultParser
+from parsers.DefaultParser import NumberNode, StringNode, VariableNode, TrueNode, FalseNode, NullNode, BinaryNode
+from parsers.DefaultParser import Node, UnexpectedTokenErrorNode, DefConceptNode, NopNode
+import ast
+
+
+def nop():
+    return NopNode()
+
+
+def n(number):
+    return NumberNode([], number)
+
+
+def s(string, quote="'"):
+    return StringNode([], string, quote)
+
+
+def v(name):
+    return VariableNode([], name)
+
+
+def t():
+    return TrueNode([])
+
+
+def f():
+    return FalseNode([])
+ + +def null(): + return NullNode([]) + + +def b(operator, left, right): + return BinaryNode([], operator, left, right) def test_i_can_tokenize(): - source = "+*-/{}[]() ,;:.?\n\n\r\r\r\nidentifier_0\t \t10.15 10 'string\n' \"another string\"" - tokens = list(TokenIter(source)) - assert tokens[0] == Token(Tokens.PLUS, "+", 0, 1, 1) - assert tokens[1] == Token(Tokens.STAR, "*", 1, 1, 2) - assert tokens[2] == Token(Tokens.MINUS, "-", 2, 1, 3) - assert tokens[3] == Token(Tokens.SLASH, "/", 3, 1, 4) - assert tokens[4] == Token(Tokens.LBRACE, "{", 4, 1, 5) - assert tokens[5] == Token(Tokens.RBRACE, "}", 5, 1, 6) - assert tokens[6] == Token(Tokens.LBRACKET, "[", 6, 1, 7) - assert tokens[7] == Token(Tokens.RBRACKET, "]", 7, 1, 8) - assert tokens[8] == Token(Tokens.LPAR, "(", 8, 1, 9) - assert tokens[9] == Token(Tokens.RPAR, ")", 9, 1, 10) - assert tokens[10] == Token(Tokens.WHITESPACE, " ", 10, 1, 11) - assert tokens[11] == Token(Tokens.COMMA, ",", 14, 1, 15) - assert tokens[12] == Token(Tokens.SEMICOLON, ";", 15, 1, 16) - assert tokens[13] == Token(Tokens.COLON, ":", 16, 1, 17) - assert tokens[14] == Token(Tokens.DOT, ".", 17, 1, 18) - assert tokens[15] == Token(Tokens.QMARK, "?", 18, 1, 19) - assert tokens[16] == Token(Tokens.NEWLINE, "\n", 19, 1, 20) - assert tokens[17] == Token(Tokens.NEWLINE, "\n\r", 20, 2, 1) - assert tokens[18] == Token(Tokens.NEWLINE, "\r", 22, 3, 1) - assert tokens[19] == Token(Tokens.NEWLINE, "\r\n", 23, 4, 1) - assert tokens[20] == Token(Tokens.IDENTIFIER, "identifier_0", 25, 5, 1) - assert tokens[21] == Token(Tokens.WHITESPACE, "\t \t", 37, 5, 13) - assert tokens[22] == Token(Tokens.NUMBER, "10.15", 41, 5, 17) - assert tokens[23] == Token(Tokens.WHITESPACE, " ", 46, 5, 22) - assert tokens[24] == Token(Tokens.NUMBER, "10", 47, 5, 23) - assert tokens[25] == Token(Tokens.WHITESPACE, " ", 49, 5, 25) - assert tokens[26] == Token(Tokens.STRING, "'string\n'", 50, 5, 26) - assert tokens[27] == Token(Tokens.WHITESPACE, " ", 59, 6, 1) - assert tokens[28] == Token(Tokens.STRING, '"another string"', 60, 6, 2) + source = "+*-/{}[]() ,;:.?\n\n\r\r\r\nidentifier_0\t \t10.15 10 'string\n' \"another string\"=" + tokens = list(Tokenizer(source)) + assert tokens[0] == Token(TokenKind.PLUS, "+", 0, 1, 1) + assert tokens[1] == Token(TokenKind.STAR, "*", 1, 1, 2) + assert tokens[2] == Token(TokenKind.MINUS, "-", 2, 1, 3) + assert tokens[3] == Token(TokenKind.SLASH, "/", 3, 1, 4) + assert tokens[4] == Token(TokenKind.LBRACE, "{", 4, 1, 5) + assert tokens[5] == Token(TokenKind.RBRACE, "}", 5, 1, 6) + assert tokens[6] == Token(TokenKind.LBRACKET, "[", 6, 1, 7) + assert tokens[7] == Token(TokenKind.RBRACKET, "]", 7, 1, 8) + assert tokens[8] == Token(TokenKind.LPAR, "(", 8, 1, 9) + assert tokens[9] == Token(TokenKind.RPAR, ")", 9, 1, 10) + assert tokens[10] == Token(TokenKind.WHITESPACE, " ", 10, 1, 11) + assert tokens[11] == Token(TokenKind.COMMA, ",", 14, 1, 15) + assert tokens[12] == Token(TokenKind.SEMICOLON, ";", 15, 1, 16) + assert tokens[13] == Token(TokenKind.COLON, ":", 16, 1, 17) + assert tokens[14] == Token(TokenKind.DOT, ".", 17, 1, 18) + assert tokens[15] == Token(TokenKind.QMARK, "?", 18, 1, 19) + assert tokens[16] == Token(TokenKind.NEWLINE, "\n", 19, 1, 20) + assert tokens[17] == Token(TokenKind.NEWLINE, "\n\r", 20, 2, 1) + assert tokens[18] == Token(TokenKind.NEWLINE, "\r", 22, 3, 1) + assert tokens[19] == Token(TokenKind.NEWLINE, "\r\n", 23, 4, 1) + assert tokens[20] == Token(TokenKind.IDENTIFIER, "identifier_0", 25, 5, 1) + assert tokens[21] == Token(TokenKind.WHITESPACE, 
"\t \t", 37, 5, 13) + assert tokens[22] == Token(TokenKind.NUMBER, "10.15", 41, 5, 17) + assert tokens[23] == Token(TokenKind.WHITESPACE, " ", 46, 5, 22) + assert tokens[24] == Token(TokenKind.NUMBER, "10", 47, 5, 23) + assert tokens[25] == Token(TokenKind.WHITESPACE, " ", 49, 5, 25) + assert tokens[26] == Token(TokenKind.STRING, "'string\n'", 50, 5, 26) + assert tokens[27] == Token(TokenKind.WHITESPACE, " ", 59, 6, 1) + assert tokens[28] == Token(TokenKind.STRING, '"another string"', 60, 6, 2) + assert tokens[29] == Token(TokenKind.EQUALS, '=', 76, 6, 18) @pytest.mark.parametrize("text, expected", [ @@ -48,11 +87,26 @@ def test_i_can_tokenize(): ("-abcd", False) ]) def test_i_can_tokenize_identifiers(text, expected): - tokens = list(TokenIter(text)) - comparison = tokens[0].type == Tokens.IDENTIFIER + tokens = list(Tokenizer(text)) + comparison = tokens[0].type == TokenKind.IDENTIFIER assert comparison == expected +@pytest.mark.parametrize("text, error_text, index, line, column", [ + ("'string", "'string", 7, 1, 8), + ('"string', '"string', 7, 1, 8), + ('"a" + "string', '"string', 13, 1, 14), + ('"a"\n\n"string', '"string', 12, 3, 8), +]) +def test_i_can_detect_unfinished_strings(text, error_text, index, line, column): + with pytest.raises(LexerError) as e: + list(Tokenizer(text)) + assert e.value.text == error_text + assert e.value.index == index + assert e.value.line == line + assert e.value.column == column + + @pytest.mark.parametrize("text, expected_text, expected_newlines", [ ("'foo'", "'foo'", 0), ('"foo"', '"foo"', 0), @@ -72,8 +126,8 @@ def test_i_can_tokenize_identifiers(text, expected): ("'foo'bar'", "'foo'", 0), ]) def test_i_can_parse_strings(text, expected_text, expected_newlines): - lexer = TokenIter(text) - text_found, nb_of_newlines = lexer.eat_string(0) + lexer = Tokenizer(text) + text_found, nb_of_newlines = lexer.eat_string(0, 1, 1) assert nb_of_newlines == expected_newlines assert text_found == expected_text @@ -83,14 +137,201 @@ def test_i_can_parse_strings(text, expected_text, expected_newlines): "1", "3.1415", "0.5", "01", "-5", "-5.10" ]) def test_i_can_parse_numbers(text): - tokens = list(TokenIter(text)) - assert tokens[0].type == Tokens.NUMBER + tokens = list(Tokenizer(text)) + assert tokens[0].type == TokenKind.NUMBER assert tokens[0].value == text -@pytest.mark.parametrize("text", [ - "def", "concept", "as", "pre", "post" +@pytest.mark.parametrize("text, expected", [ + ("def", Keywords.DEF), + ("concept", Keywords.CONCEPT), + ("as", Keywords.AS), + ("pre", Keywords.PRE), + ("post", Keywords.POST) ]) -def test_i_can_recognize_keywords(text): - tokens = list(TokenIter(text)) - assert tokens[0].type == Tokens.KEYWORD +def test_i_can_recognize_keywords(text, expected): + tokens = list(Tokenizer(text)) + assert tokens[0].type == TokenKind.KEYWORD + assert tokens[0].value == expected + + +@pytest.mark.parametrize("text, expected", [ + ("1", n(1)), + ("+1", n(1)), + ("-1", n(-1)), + ("'foo'", s("foo")), + ("identifier", v("identifier")), + ("true", t()), + ("false", f()), + ("null", null()), + ("1 * 2", b(TokenKind.STAR, n(1), n(2))), + ("1 * 2/3", b(TokenKind.STAR, n(1), b(TokenKind.SLASH, n(2), n(3)))), + ("1 + 2", b(TokenKind.PLUS, n(1), n(2))), + ("1 + 2 - 3", b(TokenKind.PLUS, n(1), b(TokenKind.MINUS, n(2), n(3)))), + ("1 + 2-3", b(TokenKind.PLUS, n(1), b(TokenKind.PLUS, n(2), n(-3)))), + ("1 + 2 +-3", b(TokenKind.PLUS, n(1), b(TokenKind.PLUS, n(2), n(-3)))), + ("1 + 2 * 3", b(TokenKind.PLUS, n(1), b(TokenKind.STAR, n(2), n(3)))), + ("1 * 2 + 3", 
b(TokenKind.PLUS, b(TokenKind.STAR, n(1), n(2)), n(3))), + ("(1 + 2) * 3", b(TokenKind.STAR, b(TokenKind.PLUS, n(1), n(2)), n(3))), + ("1 * (2 + 3)", b(TokenKind.STAR, n(1), b(TokenKind.PLUS, n(2), n(3)))), +]) +def test_i_can_parse_simple_expression(text, expected): + parser = DefaultParser(text, None) + ast = parser.parse() + assert ast.is_same(expected) + + +@pytest.mark.parametrize("text, token_found, expected_tokens", [ + ("1+", TokenKind.EOF, + [TokenKind.NUMBER, TokenKind.STRING, TokenKind.IDENTIFIER, 'true', 'false', 'null', TokenKind.LPAR]), + ("(1+1", TokenKind.EOF, [TokenKind.RPAR]) +]) +def test_i_can_detect_unexpected_end_of_code(text, token_found, expected_tokens): + parser = DefaultParser(text, None) + parser.parse() + + assert parser.has_error + assert parser.error_sink[0].tokens[0].type == token_found + assert parser.error_sink[0].expected_tokens == expected_tokens + + +@pytest.mark.parametrize("text, expected_name, expected_expr", [ + ("def concept hello", "hello", nop()), + ("def concept hello ", "hello", nop()), + ("def concept a+b", "a + b", nop()), + ("def concept 'a+b'", "a+b", nop()), + ("def concept 'a+b'+c", "a+b + c", nop()), + ("def concept 'as if'", "as if", nop()), + ("def concept 'as' if", "as if", nop()), + ("def concept hello as 'hello'", "hello", ast.Expression(body=ast.Str(s='hello'))), + ("def concept hello as 1", "hello", ast.Expression(body=ast.Num(n=1))), + ("def concept h as 1 + 1", "h", ast.Expression(ast.BinOp(left=ast.Num(n=1), op=ast.Add(), right=ast.Num(n=1)))), +]) +def test_i_can_parse_def_concept(text, expected_name, expected_expr): + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert isinstance(tree, DefConceptNode) + assert tree.name == expected_name + if isinstance(tree.body, PythonNode): + assert ast.dump(tree.body.ast) == ast.dump(expected_expr) + else: + assert tree.body == expected_expr + + +def compare_ast(left, right): + left_as_string = ast.dump(left) + left_as_string = left_as_string.replace(", ctx=Load()", "") + + right_as_string = right if isinstance(right, str) else ast.dump(right) + right_as_string = right_as_string.replace(", ctx=Load()", "") + + return left_as_string == right_as_string + + +def test_i_can_parse_complex_def_concept_statement(): + text = """def concept a plus b + where a,b + pre isinstance(a, int) and isinstance(b, float) + post isinstance(res, int) + as res = a + b + """ + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert not parser.has_error + assert isinstance(tree, DefConceptNode) + assert tree.name == "a plus b" + assert tree.where.source == "a,b" + assert isinstance(tree.where.ast, ast.Expression) + assert tree.pre.source == "isinstance(a, int) and isinstance(b, float)" + assert isinstance(tree.pre.ast, ast.Expression) + assert tree.post.source == "isinstance(res, int)" + assert isinstance(tree.post.ast, ast.Expression) + assert tree.body.source == "res = a + b" + assert isinstance(tree.body.ast, ast.Module) + + +def test_i_can_use_colon_to_declare_indentation(): + text = """ +def concept add one to a as: + def func(x): + return x+1 + func(a) + """ + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert not parser.has_error + assert isinstance(tree, DefConceptNode) + +def test_i_can_use_colon_to_declare_indentation2(): + text = """ +def concept add one to a as: + def func(x): + return x+1 + """ + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert not parser.has_error + assert isinstance(tree, DefConceptNode) 
+ + +def test_without_colon_i_get_an_indent_error(): + text = """ +def concept add one to a as + def func(x): + return x+1 + func(a) + """ + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert parser.has_error + assert isinstance(tree, DefConceptNode) + assert isinstance(parser.error_sink[0].exception, IndentationError) + + +def test_i_can_detect_error(): + """ + In this test, func(b) is not correctly indented while colon is specified after the 'as' keyword + """ + + text = """ +def concept add one to a as: + def func(x): + return x+1 + func(a) +func(b) + """ + parser = DefaultParser(text, PythonParser) + tree = parser.parse() + assert parser.has_error + assert isinstance(tree, DefConceptNode) + assert isinstance(parser.error_sink[0], UnexpectedTokenErrorNode) + # check that the error is caused by 'func(b)' + assert parser.error_sink[0].tokens[0].line == 6 + assert parser.error_sink[0].tokens[0].column == 1 + + +@pytest.mark.parametrize("text, token_found, expected_tokens", [ + ("def hello as 'hello'", "hello", [Keywords.CONCEPT]), + ("def concept as", Keywords.AS, [""]), +]) +def test_i_can_detect_unexpected_token_error_in_def_concept(text, token_found, expected_tokens): + parser = DefaultParser(text, PythonParser) + parser.parse() + + assert parser.has_error + assert isinstance(parser.error_sink[0], UnexpectedTokenErrorNode) + assert parser.error_sink[0].tokens[0].value == token_found + assert parser.error_sink[0].expected_tokens == expected_tokens + + +@pytest.mark.parametrize("text", [ + "def concept hello where 1+", + "def concept hello pre 1+", + "def concept hello post 1+", + "def concept hello as 1+" +]) +def test_i_can_detect_error_in_declaration(text): + parser = DefaultParser(text, PythonParser) + parser.parse() + assert parser.has_error + assert isinstance(parser.error_sink[0], PythonErrorNode) diff --git a/tests/test_sheerka.py b/tests/test_sheerka.py index 226e8fe..5b4fecb 100644 --- a/tests/test_sheerka.py +++ b/tests/test_sheerka.py @@ -1,12 +1,17 @@ +import ast + import pytest import os from os import path import shutil -from core.concept import Concept +from core.concept import Concept, ConceptParts from core.sheerka import Sheerka +from parsers.DefaultParser import DefConceptNode, DefaultParser +from parsers.PythonParser import PythonParser tests_root = path.abspath("../build/tests") +root_folder = "init_folder" @pytest.fixture(autouse=True) @@ -25,8 +30,6 @@ def init_test(): def test_root_folder_is_created_after_initialization(): - root_folder = "init_folder" - return_value = Sheerka().initialize(root_folder) assert return_value.status, "initialisation should be successful" assert Sheerka().concept_equals(return_value.value, Sheerka().get_concept("success")) @@ -34,22 +37,56 @@ def test_root_folder_is_created_after_initialization(): def test_lists_of_concepts_is_initialized(): - root_folder = "init_folder" - Sheerka().initialize(root_folder) assert len(Sheerka().concepts) > 1 -def test_null_concept_are_equals(): - concept1 = Concept("test1") - concept2 = Concept("test2") - concept3 = Concept("test3") +# def test_null_concept_are_equals(): +# concept1 = Concept("test1") +# concept2 = Concept("test2") +# concept3 = Concept("test3") +# +# assert not Sheerka.concept_equals(concept1, None) +# assert not Sheerka.concept_equals(None, concept1) +# assert not Sheerka.concept_equals(concept1, concept2) +# assert not Sheerka.concept_equals(concept1, concept3) +# +# assert Sheerka.concept_equals(None, None) +# assert Sheerka.concept_equals(concept1, 
diff --git a/tests/test_sheerka.py b/tests/test_sheerka.py
index 226e8fe..5b4fecb 100644
--- a/tests/test_sheerka.py
+++ b/tests/test_sheerka.py
@@ -1,12 +1,17 @@
+import ast
+
 import pytest
 import os
 from os import path
 import shutil
 
-from core.concept import Concept
+from core.concept import Concept, ConceptParts
 from core.sheerka import Sheerka
+from parsers.DefaultParser import DefConceptNode, DefaultParser
+from parsers.PythonParser import PythonParser
 
 tests_root = path.abspath("../build/tests")
+root_folder = "init_folder"
 
 
 @pytest.fixture(autouse=True)
@@ -25,8 +30,6 @@ def init_test():
 
 
 def test_root_folder_is_created_after_initialization():
-    root_folder = "init_folder"
-
     return_value = Sheerka().initialize(root_folder)
     assert return_value.status, "initialisation should be successful"
     assert Sheerka().concept_equals(return_value.value, Sheerka().get_concept("success"))
@@ -34,22 +37,56 @@
 
 
 def test_lists_of_concepts_is_initialized():
-    root_folder = "init_folder"
-
     Sheerka().initialize(root_folder)
 
     assert len(Sheerka().concepts) > 1
 
 
-def test_null_concept_are_equals():
-    concept1 = Concept("test1")
-    concept2 = Concept("test2")
-    concept3 = Concept("test3")
+# def test_null_concept_are_equals():
+#     concept1 = Concept("test1")
+#     concept2 = Concept("test2")
+#     concept3 = Concept("test3")
+#
+#     assert not Sheerka.concept_equals(concept1, None)
+#     assert not Sheerka.concept_equals(None, concept1)
+#     assert not Sheerka.concept_equals(concept1, concept2)
+#     assert not Sheerka.concept_equals(concept1, concept3)
+#
+#     assert Sheerka.concept_equals(None, None)
+#     assert Sheerka.concept_equals(concept1, concept1)
 
-    assert not Sheerka.concept_equals(concept1, None)
-    assert not Sheerka.concept_equals(None, concept1)
-    assert not Sheerka.concept_equals(concept1, concept2)
-    assert not Sheerka.concept_equals(concept1, concept3)
+def get_concept():
+    text = """
+    def concept a+b
+        where isinstance(a, int) and isinstance(b, int)
+        pre isinstance(a, int) and isinstance(b, int)
+        post isinstance(res, int)
+        as:
+            def func(x,y):
+                return x+y
+            func(a,b)
+    """
+    parser = DefaultParser(text, PythonParser)
+    return parser.parse()
 
-    assert Sheerka.concept_equals(None, None)
-    assert Sheerka.concept_equals(concept1, concept1)
+def test_i_can_add_a_concept():
+    concept = get_concept()
+    sheerka = Sheerka()
+    sheerka.initialize(root_folder)
+    res = sheerka.add_concept(concept)
+
+    assert res.status
+    assert res.value == Concept(
+        name="a + b",
+        where="isinstance(a, int) and isinstance(b, int)",
+        pre="isinstance(a, int) and isinstance(b, int)",
+        post="isinstance(res, int)",
+        body="def func(x,y):\n    return x+y\nfunc(a,b)")
+    assert isinstance(res.value.codes[ConceptParts.WHERE], ast.Expression)
+    assert isinstance(res.value.codes[ConceptParts.PRE], ast.Expression)
+    assert isinstance(res.value.codes[ConceptParts.POST], ast.Expression)
+    assert isinstance(res.value.codes[ConceptParts.BODY], ast.Module)
+
+
+# def test_i_cannot_add_the_same_concept_twice():
+#     concept1 = DefConceptNode(name="concept")
+#     sheerka = Sheerka
diff --git a/tests/test_sheerkaDataProvider.py b/tests/test_sheerkaDataProvider.py
index 0072ddd..39c0898 100644
--- a/tests/test_sheerkaDataProvider.py
+++ b/tests/test_sheerkaDataProvider.py
@@ -1,3 +1,5 @@
+import hashlib
+
 import pytest
 import os
 from os import path
@@ -6,6 +8,8 @@
 from datetime import date, datetime
 import shutil
 import json
 
+from sdp.sheerkaSerializer import ObjectSerializer, BaseSerializer, Serializer
+
 tests_root = path.abspath("../build/tests")
@@ -70,6 +74,33 @@
         return f"ObjNoKey({self.a}, {self.b})"
 
 
+class ObjDumpJson:
+    def __init__(self, key, value):
+        self.key = key
+        self.value = value
+
+    def __eq__(self, obj):
+        return isinstance(obj, ObjDumpJson) and \
+               self.key == obj.key and \
+               self.value == obj.value
+
+    def __repr__(self):
+        return f"ObjDumpJson({self.key}, {self.value})"
+
+    def get_key(self):
+        return self.key
+
+    def get_digest(self):
+        return hashlib.sha256(f"Concept:{self.key}{self.value}".encode("utf-8")).hexdigest()
+
+    def to_dict(self):
+        return self.__dict__
+
+    def from_dict(self, as_dict):
+        self.value = as_dict["value"]
+        self.key = as_dict["key"]
+        # returned for parity with the other from_dict implementations
+        return self
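+
+# Note: get_key/get_digest/to_dict/from_dict is the duck-typed contract these
+# data-provider tests rely on for persisted objects (an observation from the
+# tests in this file, not a formally declared interface).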
+
+
 @pytest.fixture(autouse=True)
 def init_test():
     if path.exists(tests_root):
@@ -570,4 +601,56 @@ def test_i_can_test_than_an_entry_exits():
     assert not sdp.exists("entry")
 
     sdp.add(Event("event"), "entry", "value")
-    assert sdp.exists("entry")
\ No newline at end of file
+    assert sdp.exists("entry")
+
+
+def test_i_can_save_and_load_object_with_history():
+    sdp = SheerkaDataProvider(".sheerka")
+    obj = ObjDumpJson("my_key", "value1")
+    sdp.serializer.register(ObjectSerializer(BaseSerializer.get_full_qualified_name(obj)))
+
+    entry, key = sdp.add_ref("Obj", obj)
+    loaded = sdp.get(entry, key)
+    history = getattr(loaded, Serializer.HISTORY)
+
+    assert key == obj.key
+    assert entry == "Obj"
+    assert loaded.key == obj.key
+    assert loaded.value == obj.value
+
+    assert getattr(history, Serializer.USERNAME) == "kodjo"
+    assert getattr(history, Serializer.MODIFICATION_DATE) != ""
+    assert getattr(history, Serializer.PARENTS) == []
+
+    assert os.path.exists(sdp.get_obj_path(sdp.ObjectsFolder, obj.get_digest()))
+
+    # save a second time with no modification
+    previous_modification_time = getattr(history, Serializer.MODIFICATION_DATE)
+    previous_parents = getattr(history, Serializer.PARENTS)
+
+    sdp.add_ref("Obj", loaded)
+    loaded = sdp.get(entry, key)
+    history = getattr(loaded, Serializer.HISTORY)
+
+    assert getattr(history, Serializer.MODIFICATION_DATE) == previous_modification_time
+    assert getattr(history, Serializer.PARENTS) == previous_parents
+
+    # save again, but with a modification
+    previous_digest = loaded.get_digest()
+    loaded.value = "value2"
+
+    sdp.add_ref("Obj", loaded)
+    loaded2 = sdp.get(entry, key)
+    history2 = getattr(loaded2, Serializer.HISTORY)
+
+    assert loaded2.key == loaded.key
+    assert loaded2.value == loaded.value
+
+    assert getattr(history2, Serializer.USERNAME) == "kodjo"
+    assert getattr(history2, Serializer.MODIFICATION_DATE) != ""
+    assert getattr(history2, Serializer.PARENTS) == [previous_digest]
diff --git a/tests/test_sheerkaSerializer.py b/tests/test_sheerkaSerializer.py
index 4d267ee..892389d 100644
--- a/tests/test_sheerkaSerializer.py
+++ b/tests/test_sheerkaSerializer.py
@@ -1,16 +1,56 @@
+import pytest
+from dataclasses import dataclass
+
 from sdp.sheerkaDataProvider import Event
-from sdp.sheerkaSerializer import Serializer
+from sdp.sheerkaSerializer import Serializer, ObjectSerializer, SerializerContext, BaseSerializer
 from datetime import datetime
 
 
+@dataclass
+class Obj:
+    key: str = ""
+    prop1: str = ""
+
+    def from_dict(self, json_object):
+        self.prop1 = json_object["prop1"]
+        self.key = json_object["key"]
+        return self
+
+    def to_dict(self):
+        return self.__dict__
+
+
 def test_i_can_serialize_an_event():
     event = Event("test", user="user", date=datetime.fromisoformat("2019-10-21T10:20:30.999"))
     serializer = Serializer()
 
-    stream = serializer.serialize(event)
-    loaded = serializer.deserialize(stream)
+    stream = serializer.serialize(event, None)
+    loaded = serializer.deserialize(stream, None)
 
     assert event.version == loaded.version
     assert event.user == loaded.user
     assert event.date == loaded.date
     assert event.message == loaded.message
+
+
+def test_i_can_serialize_an_object():
+    obj = Obj("10", "value")
+    serializer = Serializer()
+    serializer.register(ObjectSerializer("tests.test_sheerkaSerializer.Obj"))
+    context = SerializerContext("kodjo", "6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b")
+
+    stream = serializer.serialize(obj, context)
+    loaded = serializer.deserialize(stream, context)
+
+    assert getattr(loaded, Serializer.HISTORY)[Serializer.USERNAME] == "kodjo"
+    assert getattr(loaded, Serializer.HISTORY)[Serializer.MODIFICATION_DATE] != ""
+    assert getattr(loaded, Serializer.HISTORY)[Serializer.PARENTS] == []
+    assert loaded.key == "10"
+    assert loaded.prop1 == "value"
+
+
+@pytest.mark.parametrize("obj, expected", [
+    (Obj("10", "value"), "tests.test_sheerkaSerializer.Obj"),
+])
+def test_get_full_qualified_name(obj, expected):
+    assert expected == BaseSerializer.get_full_qualified_name(obj)
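+
+
+# Minimal sketch (an assumption, not the library's code) of how a fully
+# qualified name such as "tests.test_sheerkaSerializer.Obj" can be derived;
+# BaseSerializer.get_full_qualified_name in sdp/sheerkaSerializer.py is the
+# authoritative implementation and may differ.
+def _full_qualified_name_sketch(obj):
+    cls = type(obj)
+    return f"{cls.__module__}.{cls.__qualname__}"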