import datetime
import hashlib
import io
import json
import logging
import os
import pickle
from threading import RLock

from core.serializer import Serializer, DebugSerializer
from core.utils import get_stream_digest

TYPE_KEY = "__type__"
TAG_PARENT = "__parent__"
TAG_USER = "__user_id__"
TAG_DATE = "__date__"

BUFFER_SIZE = 4096
FAKE_USER_ID = "FakeUserId"

logger = logging.getLogger(__name__)


class DbException(Exception):
    pass


class RefHelper:
    def __init__(self, get_obj_path):
        self.get_obj_path = get_obj_path

    def save_ref(self, obj):
        """
        Pickle an object and store it under its digest.

        :param obj: object to store
        :return: digest identifying the stored object
        """
        buffer = io.BytesIO()
        pickler = pickle.Pickler(buffer)
        pickler.dump(obj)

        digest = get_stream_digest(buffer)
        target_path = self.get_obj_path(digest)
        if not os.path.exists(os.path.dirname(target_path)):
            os.makedirs(os.path.dirname(target_path))

        buffer.seek(0)
        with open(target_path, "wb") as file:
            while chunk := buffer.read(BUFFER_SIZE):
                file.write(chunk)

        logger.debug(f"Saved object type '{type(obj).__name__}' with digest {digest}")
        return digest

    def load_ref(self, digest):
        """
        Load a previously stored object.

        :param digest: digest returned by save_ref()
        :return: the unpickled object
        """
        with open(self.get_obj_path(digest), 'rb') as file:
            return pickle.load(file)


class DbEngine:
    """
    Personal implementation of a DB engine.
    Inspired by the way git manages its files.
    Designed to keep a history of the modifications.
    """
    ObjectsFolder = "objects"  # groups objects in the same folder
    HeadFile = "head"  # used to keep track of the latest version of all entries

    def __init__(self, root: str = None):
        self.root = root or ".mytools_db"
        self.serializer = Serializer(RefHelper(self._get_obj_path))
        self.debug_serializer = DebugSerializer(RefHelper(self.debug_load))
        self.lock = RLock()

    def is_initialized(self):
        """
        :return: True if the root folder exists
        """
        return os.path.exists(self.root)

    def init(self):
        """
        Make sure that the DbEngine is properly initialized
        :return:
        """
        if not os.path.exists(self.root):
            logger.debug(f"Creating root folder in {os.path.abspath(self.root)}.")
            os.mkdir(self.root)

    def save(self, user_id: str, entry: str, obj: object) -> str:
        """
        Save a snapshot of an entry

        :param user_id: author of the snapshot
        :param entry: name of the entry
        :param obj: snapshot to save
        :return: digest of the snapshot
        """
        with self.lock:
            logger.info(f"Saving {user_id=}, {entry=}, {obj=}")

            # prepare the data
            as_dict = self._serialize(obj)
            as_dict[TAG_PARENT] = [self._get_entry_digest(entry)]
            as_dict[TAG_USER] = user_id
            as_dict[TAG_DATE] = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S %z')

            # transform into a stream
            as_str = json.dumps(as_dict, sort_keys=True, indent=4)
            logger.debug(f"Serialized object : {as_str}")
            byte_stream = as_str.encode("utf-8")

            # compute the digest to know where to store it
            digest = hashlib.sha256(byte_stream).hexdigest()
            target_path = self._get_obj_path(digest)
            if os.path.exists(target_path):
                # the same object is already saved. Nothing to do
                return digest

            # save the new value
            if not os.path.exists(os.path.dirname(target_path)):
                os.makedirs(os.path.dirname(target_path))
            with open(target_path, "wb") as file:
                file.write(byte_stream)

            # update the head to remember where the latest entry is
            self._update_head(entry, digest)
            logger.debug(f"New head for entry '{entry}' is {digest}")
            return digest

    def load(self, user_id: str, entry, digest=None):
        """
        Loads a snapshot

        :param user_id: author of the request
        :param entry: name of the entry
        :param digest: specific snapshot to load; defaults to the latest one
        :return: the deserialized snapshot
        """
        with self.lock:
            logger.info(f"Loading {user_id=}, {entry=}, {digest=}")
            digest_to_use = digest or self._get_entry_digest(entry)
            logger.debug(f"Using digest {digest_to_use}.")
            if digest_to_use is None:
                raise DbException(entry)

            target_file = self._get_obj_path(digest_to_use)
            with open(target_file, 'r', encoding='utf-8') as file:
                as_dict = json.load(file)
            return self._deserialize(as_dict)

    def put(self, user_id: str, entry, key: str, value: object):
        """
        Save a specific record. This will create a new snapshot if the record is new or different.
        You should not mix the usage of put_many() and save(), as they are two different ways to manage the db.

        :param user_id: author of the change
        :param entry: name of the entry
        :param key: key of the record
        :param value: value of the record
        :return: True if a new snapshot was created, False otherwise
        """
        with self.lock:
            logger.info(f"Adding {user_id=}, {entry=}, {key=}, {value=}")
            try:
                entry_content = self.load(user_id, entry)
            except DbException:
                entry_content = {}

            # Do not save if the record is unchanged
            if key in entry_content:
                old_value = entry_content[key]
                if old_value == value:
                    return False

            entry_content[key] = value
            self.save(user_id, entry, entry_content)
            return True

    def put_many(self, user_id: str, entry, items: list):
        """
        Save a list of items as one single snapshot.
        A new snapshot will not be created if all the items already exist.
        You should not mix the usage of put_many() and save(), as they are two different ways to manage the db.

        :param user_id: author of the change
        :param entry: name of the entry
        :param items: records to save; each one must expose a get_key() method
        :return: True if a new snapshot was created, False otherwise
        """
        with self.lock:
            logger.info(f"Adding many {user_id=}, {entry=}, {items=}")
            try:
                entry_content = self.load(user_id, entry)
            except DbException:
                entry_content = {}

            is_dirty = False
            for item in items:
                key = item.get_key()
                if key in entry_content and entry_content[key] == item:
                    continue
                entry_content[key] = item
                is_dirty = True

            if is_dirty:
                self.save(user_id, entry, entry_content)
                return True
            return False

    def exists(self, entry: str):
        """
        Tells whether an entry exists

        :param entry: name of the entry
        :return: True if the entry has at least one snapshot
        """
        with self.lock:
            return self._get_entry_digest(entry) is not None

    def get(self, user_id: str, entry: str, key: str | None = None, digest=None):
        """
        Retrieve an item from the snapshot

        :param user_id: author of the request
        :param entry: name of the entry
        :param key: key of the record; if None, all records are returned
        :param digest: specific snapshot to read; defaults to the latest one
        :return: the requested record, or all records as a list when key is None
        """
        with self.lock:
            logger.info(f"Getting {user_id=}, {entry=}, {key=}, {digest=}")
            entry_content = self.load(user_id, entry, digest)
            if key is None:
                # return all items as a list, skipping the internal "__" tags
                return [v for k, v in entry_content.items() if not k.startswith("__")]
            return entry_content[key]

    def debug_head(self):
        """
        Return the raw content of the head file (debug helper)
        """
        with self.lock:
            # load
            try:
                with open(self._get_head_path(), 'r') as file:
                    head = json.load(file)
            except FileNotFoundError:
                head = {}
            return head

    def debug_load(self, digest):
        """
        Load a raw snapshot and deserialize it with the debug serializer
        """
        with self.lock:
            target_file = self._get_obj_path(digest)
            with open(target_file, 'r', encoding='utf-8') as file:
                as_dict = json.load(file)
            return self.debug_serializer.deserialize(as_dict)
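    # Hypothetical helper, not part of the original engine: a minimal sketch of
    # walking an entry's history. It assumes only what save() already does,
    # namely that every snapshot's JSON carries a TAG_PARENT list whose first
    # element is the digest of the previous snapshot (or None for the first one).
    def debug_history(self, entry):
        """
        List the digests of all snapshots of an entry, newest first
        """
        with self.lock:
            digests = []
            digest = self._get_entry_digest(entry)
            while digest is not None:
                digests.append(digest)
                with open(self._get_obj_path(digest), 'r', encoding='utf-8') as file:
                    as_dict = json.load(file)
                # follow the parent link written by save()
                digest = (as_dict.get(TAG_PARENT) or [None])[0]
            return digests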
"use_refs") else None return self.serializer.serialize(obj, use_refs) def _deserialize(self, as_dict): return self.serializer.deserialize(as_dict) def _update_head(self, entry, digest): """ Actually dumps the snapshot in file system :param entry: :param digest: :return: """ head_path = os.path.join(self.root, self.HeadFile) # load try: with open(head_path, 'r') as file: head = json.load(file) except FileNotFoundError: head = {} # update head[entry] = digest # and save with open(head_path, 'w') as file: json.dump(head, file) def _get_entry_digest(self, entry): """ Search for the latest digest, for a given entry :param entry: :return: """ head_path = os.path.join(self.root, self.HeadFile) try: with open(head_path, 'r') as file: head = json.load(file) return head[str(entry)] except FileNotFoundError: return None except KeyError: return None def _get_head_path(self): """ Location of the Head file :return: """ return os.path.join(self.root, self.HeadFile) def _get_obj_path(self, digest): """ Location of objects :param digest: :return: """ return os.path.join(self.root, "objects", digest[:24], digest)