first commit

2025-10-17 21:08:20 +02:00
commit 878064b140
20 changed files with 1871 additions and 0 deletions
--- a/src/init.py
+++ b/src/init.py
--- a/src/core/init.py
+++ b/src/core/init.py
--- a/src/core/dbengine.py
+++ b/src/core/dbengine.py
@@ -0,0 +1,391 @@
+import datetime
+import hashlib
+import io
+import json
+import logging
+import os
+import pickle
+from threading import RLock
+
+from core.serializer import Serializer
+from core.utils import get_stream_digest
+
+# NOTE(review): TYPE_KEY is not referenced by the code visible in this module — confirm it is still needed.
+TYPE_KEY = "__type__"
+# Metadata keys added by DbEngine.save() to every stored snapshot
+TAG_PARENT = "__parent__"  # one-element list: digest of the previous head of the entry (None if there is none)
+TAG_USER = "__user_id__"  # user_id given to save()
+TAG_DATE = "__date__"  # timestamp string built in save()
+BUFFER_SIZE = 4096  # chunk size, in bytes, used by RefHelper.save_ref() to copy the pickle buffer to disk
+
+logger = logging.getLogger(__name__)
+
+
+class DbException(Exception):
+  """Error raised by DbEngine: missing mandatory argument, unknown entry or unknown key."""
+
+
+class RefHelper:
+  """
+  Stores and reloads pickled objects ("refs").
+  Each ref lives in the file designated by get_ref_path(digest), where digest is computed
+  from the pickled bytes.
+  """
+
+  def __init__(self, get_ref_path):
+    """
+    :param get_ref_path: callable giving the file path of a ref from its digest
+    """
+    self.get_ref_path = get_ref_path
+
+  def save_ref(self, obj):
+    """
+    Pickle an object and write it to the file designated by its digest
+    :param obj: object to pickle
+    :return: digest of the pickled bytes, as computed by get_stream_digest()
+    """
+    buffer = io.BytesIO()
+    pickle.Pickler(buffer).dump(obj)
+
+    digest = get_stream_digest(buffer)
+
+    # resolve the path only once; exist_ok removes the race between the existence
+    # check and the creation of the folder
+    target_path = self.get_ref_path(digest)
+    os.makedirs(os.path.dirname(target_path), exist_ok=True)
+
+    buffer.seek(0)
+    with open(target_path, "wb") as file:
+      while chunk := buffer.read(BUFFER_SIZE):
+        file.write(chunk)
+
+    logger.debug(f"Saved object type '{type(obj).__name__}' with digest {digest}")
+    return digest
+
+  def load_ref(self, digest):
+    """
+    Reload an object saved by save_ref()
+    :param digest: digest returned by save_ref()
+    :return: the unpickled object
+    """
+    # SECURITY: pickle.load() can execute arbitrary code. Only load refs from a
+    # folder that cannot be written by untrusted parties.
+    with open(self.get_ref_path(digest), 'rb') as file:
+      return pickle.load(file)
+
+
+class DbEngine:
+  """
+  Personal implementation of DB engine
+  Inspired by the way git manages its files
+  Designed to keep history of the modifications
+
+  Every tenant has its own folder under the root. It holds a head file (JSON mapping
+  entry name -> digest of the latest snapshot) and an objects folder with one JSON file
+  per snapshot, named after the SHA-256 of its content. Refs are stored outside the
+  tenant folders.
+  """
+  ObjectsFolder = "objects"  # group objects in the same folder
+  HeadFile = "head"  # used to keep track of the latest version of all entries
+
+  def __init__(self, root: str | None = None):
+    """
+    :param root: folder holding the whole database. Defaults to ".mytools_db"
+    """
+    self.root = root or ".mytools_db"
+    self.lock = RLock()  # reentrant: public methods call each other while holding it
+
+  def is_initialized(self, tenant_id: str):
+    """
+    Tells if the folder of the tenant exists
+    :param tenant_id:
+    :return: True if the folder exists
+    """
+    return os.path.exists(self._get_user_root(tenant_id))
+
+  def init(self, tenant_id: str):
+    """
+    Make sure that the DbEngine is properly initialized (creates the folder of the tenant)
+    :param tenant_id:
+    :return:
+    """
+    user_root = self._get_user_root(tenant_id)
+    if not os.path.exists(user_root):
+      logger.debug(f"Creating root folder in {os.path.abspath(user_root)}.")
+      # exist_ok: another thread / process may have created it since the check
+      os.makedirs(user_root, exist_ok=True)
+
+  def save(self, tenant_id: str, user_id: str, entry: str, obj: object) -> str:
+    """
+    Save a snapshot of an entry
+    The serialized object must be a dictionary, as the parent digest, the user and the date
+    are added to it.
+    :param tenant_id:
+    :param user_id:
+    :param entry:
+    :param obj: snapshot to save
+    :return: digest of the snapshot
+    :raises DbException: if tenant_id, user_id or entry is empty
+    """
+    with self.lock:
+      logger.info(f"Saving {tenant_id=}, {entry=}, {obj=}")
+
+      if not tenant_id:
+        raise DbException("tenant_id is None")
+
+      if not user_id:
+        raise DbException("user_id is None")
+
+      if not entry:
+        raise DbException("entry is None")
+      # prepare the data
+      as_dict = self._serialize(obj)
+      as_dict[TAG_PARENT] = [self._get_entry_digest(tenant_id, entry)]
+      as_dict[TAG_USER] = user_id
+      # astimezone() makes the datetime timezone aware. With a naive datetime,
+      # '%z' is replaced by an empty string and the offset is lost
+      as_dict[TAG_DATE] = datetime.datetime.now().astimezone().strftime('%Y%m%d %H:%M:%S %z')
+
+      # transform into a stream
+      as_str = json.dumps(as_dict, sort_keys=True, indent=4)
+      logger.debug(f"Serialized object : {as_str}")
+      byte_stream = as_str.encode("utf-8")
+
+      # compute the digest to know where to store it
+      digest = hashlib.sha256(byte_stream).hexdigest()
+
+      target_path = self._get_obj_path(tenant_id, digest)
+      if os.path.exists(target_path):
+        # the same object is already saved. Nothing to do
+        return digest
+
+      # save the new value
+      os.makedirs(os.path.dirname(target_path), exist_ok=True)
+      with open(target_path, "wb") as file:
+        file.write(byte_stream)
+
+      # update the head to remember where the latest entry is
+      self._update_head(tenant_id, entry, digest)
+      logger.debug(f"New head for entry '{entry}' is {digest}")
+      return digest
+
+  def load(self, tenant_id: str, entry, digest=None):
+    """
+    Loads a snapshot
+    :param tenant_id:
+    :param entry:
+    :param digest: snapshot to load. Defaults to the latest snapshot of the entry
+    :return: the deserialized snapshot
+    :raises DbException: if no digest is given and the entry has no head
+    """
+    with self.lock:
+      logger.info(f"Loading {tenant_id=}, {entry=}, {digest=}")
+
+      digest_to_use = digest or self._get_entry_digest(tenant_id, entry)
+      logger.debug(f"Using digest {digest_to_use}.")
+
+      if digest_to_use is None:
+        raise DbException(entry)
+
+      target_file = self._get_obj_path(tenant_id, digest_to_use)
+      with open(target_file, 'r', encoding='utf-8') as file:
+        as_dict = json.load(file)
+
+      return self._deserialize(as_dict)
+
+  def put(self, tenant_id: str, user_id, entry, key: str, value: object):
+    """
+    Save a specific record.
+    This will create a new snapshot if the record is new or different
+
+    You should not mix the usage of put() and save() as they are two different ways to manage the db
+    :param user_id:
+    :param tenant_id:
+    :param entry:
+    :param key:
+    :param value:
+    :return: True if a new snapshot was created, False if the record was already there
+    """
+    with self.lock:
+      logger.info(f"Adding {tenant_id=}, {entry=}, {key=}, {value=}")
+      try:
+        entry_content = self.load(tenant_id, entry)
+      except DbException:
+        # the entry does not exist yet
+        entry_content = {}
+
+      # Do not save if the entry is the same
+      if key in entry_content and entry_content[key] == value:
+        return False
+
+      entry_content[key] = value
+      self.save(tenant_id, user_id, entry, entry_content)
+      return True
+
+  def put_many(self, tenant_id: str, user_id, entry, items: list | dict):
+    """
+    Save a list of items as one single snapshot
+    A new snapshot will not be created if all the items already exist
+
+    You should not mix the usage of put_many() and save() as they are two different ways to manage the db
+    :param tenant_id:
+    :param user_id:
+    :param entry:
+    :param items: dictionary key -> item, or iterable of items providing a get_key() method
+    :return: True if a new snapshot was created, False if nothing changed
+    """
+    with self.lock:
+      logger.info(f"Adding many {tenant_id=}, {entry=}, {items=}")
+      try:
+        entry_content = self.load(tenant_id, entry)
+      except DbException:
+        # the entry does not exist yet
+        entry_content = {}
+
+      # both kinds of input are reduced to (key, item) pairs
+      if isinstance(items, dict):
+        pairs = items.items()
+      else:
+        pairs = ((item.get_key(), item) for item in items)
+
+      is_dirty = False
+      for key, item in pairs:
+        if key in entry_content and entry_content[key] == item:
+          continue
+        entry_content[key] = item
+        is_dirty = True
+
+      if is_dirty:
+        self.save(tenant_id, user_id, entry, entry_content)
+
+      return is_dirty
+
+  def exists(self, tenant_id, entry: str):
+    """
+    Tells if an entry exists (i.e. if it is listed in the head file)
+    :param tenant_id:
+    :param entry:
+    :return:
+    """
+    with self.lock:
+      return self._get_entry_digest(tenant_id, entry) is not None
+
+  def get(self, tenant_id: str, entry: str, key: str | None = None, digest=None):
+    """
+    Retrieve an item from the snapshot
+    :param tenant_id:
+    :param entry:
+    :param key: key of the item. When None, all the items are returned
+    :param digest: snapshot to use. Defaults to the latest snapshot of the entry
+    :return: the item, or the list of the values whose key does not start with "__"
+    :raises DbException: if the entry or the key is not found
+    """
+    with self.lock:
+      logger.info(f"Getting {tenant_id=}, {entry=}, {key=}, {digest=}")
+      entry_content = self.load(tenant_id, entry, digest)
+
+      if key is None:
+        # return all items as list (the "__xxx__" metadata are skipped)
+        return [v for k, v in entry_content.items() if not k.startswith("__")]
+
+      try:
+        return entry_content[key]
+      except KeyError as err:
+        raise DbException(f"Key '{key}' not found in entry '{entry}'") from err
+
+  def history(self, tenant_id, entry, digest=None, max_items=1000):
+    """
+    Gives the current digest and all its ancestors
+    :param tenant_id:
+    :param entry:
+    :param digest: digest to start from. Defaults to the latest snapshot of the entry
+    :param max_items: maximum number of digests to return
+    :return: list of digests, most recent first
+    """
+    with self.lock:
+      logger.info(f"History for {tenant_id=}, {entry=}, {digest=}")
+
+      digest_to_use = digest or self._get_entry_digest(tenant_id, entry)
+      logger.debug(f"Using digest {digest_to_use}.")
+
+      history = []
+
+      while digest_to_use is not None and len(history) < max_items:
+        history.append(digest_to_use)
+
+        try:
+          target_file = self._get_obj_path(tenant_id, digest_to_use)
+          with open(target_file, 'r', encoding='utf-8') as file:
+            as_dict = json.load(file)
+        except FileNotFoundError:
+          # the digest is kept in the history, but its ancestors cannot be found
+          break
+
+        digest_to_use = as_dict[TAG_PARENT][0]
+
+      return history
+
+  def get_digest(self, tenant_id, entry):
+    """
+    Gives the digest of the latest snapshot of an entry
+    :param tenant_id:
+    :param entry:
+    :return: the digest, or None if the entry is not known
+    """
+    return self._get_entry_digest(tenant_id, entry)
+
+  def _serialize(self, obj):
+    """
+    Just call the serializer
+    If the object has a use_refs() method, its result is given to the serializer
+    :param obj:
+    :return:
+    """
+    with self.lock:
+      serializer = Serializer(RefHelper(self._get_ref_path))
+      use_refs = obj.use_refs() if hasattr(obj, "use_refs") else None
+      return serializer.serialize(obj, use_refs)
+
+  def _deserialize(self, as_dict):
+    """
+    Just call the serializer
+    :param as_dict:
+    :return:
+    """
+    with self.lock:
+      serializer = Serializer(RefHelper(self._get_ref_path))
+      return serializer.deserialize(as_dict)
+
+  def _update_head(self, tenant_id, entry, digest):
+    """
+    Records digest as the latest snapshot of the entry, in the head file of the tenant
+    :param tenant_id:
+    :param entry:
+    :param digest:
+    :return:
+    """
+    head_path = self._get_head_path(tenant_id)
+    # load
+    try:
+      with open(head_path, 'r') as file:
+        head = json.load(file)
+    except FileNotFoundError:
+      head = {}
+
+    # update
+    head[entry] = digest
+
+    # and save
+    with open(head_path, 'w') as file:
+      json.dump(head, file)
+
+  def _get_user_root(self, tenant_id):
+    """
+    Location of the folder of a tenant
+    :param tenant_id:
+    :return:
+    """
+    return os.path.join(self.root, tenant_id)
+
+  def _get_entry_digest(self, tenant_id, entry):
+    """
+    Search for the latest digest, for a given entry
+    :param tenant_id:
+    :param entry:
+    :return: the digest, or None if the head file or the entry does not exist
+    """
+    try:
+      with open(self._get_head_path(tenant_id), 'r') as file:
+        head = json.load(file)
+    except FileNotFoundError:
+      return None
+
+    return head.get(str(entry))
+
+  def _get_head_path(self, tenant_id: str):
+    """
+    Location of the Head file
+    :param tenant_id:
+    :return:
+    """
+    return os.path.join(self._get_user_root(tenant_id), self.HeadFile)
+
+  def _get_obj_path(self, tenant_id, digest):
+    """
+    Location of objects
+    The first 24 characters of the digest are used as intermediate folder
+    :param tenant_id:
+    :param digest:
+    :return:
+    """
+    return os.path.join(self._get_user_root(tenant_id), self.ObjectsFolder, digest[:24], digest)
+
+  def _get_ref_path(self, digest):
+    """
+    Location of reference. They are not linked to the user folder
+    :param digest:
+    :return:
+    """
+    return os.path.join(self.root, "refs", digest[:24], digest)
--- a/src/core/handlers.py
+++ b/src/core/handlers.py
@@ -0,0 +1,59 @@
+# The complexity of (de)serializing some data types is delegated to dedicated handlers
+
+import datetime
+
+from core.utils import has_tag
+
+# Key marking a dictionary produced by a handler; its value is the tag() of that handler
+TAG_SPECIAL = "__special__"
+
+
+class BaseHandler:
+    """
+    Interface of the handlers.
+    Every method does nothing and returns None here; subclasses override them.
+    """
+
+    def is_eligible_for(self, obj):
+        """Tell whether this handler manages ``obj``."""
+
+    def tag(self):
+        """Give the tag identifying the data produced by this handler."""
+
+    def serialize(self, obj) -> dict:
+        """Turn ``obj`` into a dictionary."""
+
+    def deserialize(self, data: dict) -> object:
+        """Rebuild the object from the dictionary made by ``serialize()``."""
+
+
+class DateHandler(BaseHandler):
+    """Handler storing a date as its year, month and day."""
+
+    def is_eligible_for(self, obj):
+        # NOTE(review): datetime.datetime is a subclass of datetime.date, so datetimes are
+        # eligible too and serialize() keeps only their date part — confirm this is intended.
+        return isinstance(obj, datetime.date)
+
+    def tag(self):
+        return "Date"
+
+    def serialize(self, obj):
+        data = {TAG_SPECIAL: self.tag()}
+        for field in ("year", "month", "day"):
+            data[field] = getattr(obj, field)
+        return data
+
+    def deserialize(self, data: dict) -> object:
+        year, month, day = data["year"], data["month"], data["day"]
+        return datetime.date(year, month, day)
+
+
+class Handlers:
+    """Registry that finds the handler in charge of an object or of a serialized dictionary."""
+
+    def __init__(self, handlers_):
+        """
+        :param handlers_: handlers to look into, in order of priority
+        """
+        self.handlers = handlers_
+
+    def get_handler(self, obj):
+        """
+        Find the handler for ``obj``.
+
+        A plain dict carrying TAG_SPECIAL is matched against the tag() of the handlers;
+        anything else is matched with is_eligible_for().
+
+        :param obj: object to serialize, or data to deserialize
+        :return: the first matching handler, or None when no handler is eligible
+        :raises IndexError: if ``obj`` carries a TAG_SPECIAL value that no handler declares
+        """
+        if has_tag(obj, TAG_SPECIAL):
+            tag = obj[TAG_SPECIAL]
+            for h in self.handlers:
+                if h.tag() == tag:
+                    return h
+            # same exception type as the former "[...][0]" lookup, with a usable message
+            raise IndexError(f"No handler registered for tag '{tag}'")
+
+        for h in self.handlers:
+            if h.is_eligible_for(obj):
+                return h
+
+        return None
+
+
+# Module-level registry, created at import time; DateHandler is the only registered handler
+handlers = Handlers([DateHandler()])
--- a/src/core/serializer.py
+++ b/src/core/serializer.py
@@ -0,0 +1,201 @@
+import copy
+
+from core.handlers import handlers
+from core.utils import has_tag, is_dictionary, is_list, is_object, is_set, is_tuple, is_primitive, importable_name, \
+  get_class, get_full_qualified_name, is_enum
+
+# Keys of the one-entry / marker dictionaries produced by the Serializer
+TAG_ID = "__id__"  # number of an object or enum member that was already serialized
+TAG_OBJECT = "__object__"  # importable name of the class of a flattened object
+TAG_TUPLE = "__tuple__"  # list of the serialized items of a tuple
+TAG_SET = "__set__"  # list of the serialized items of a set
+TAG_REF = "__ref__"  # digest returned by the ref helper for a value stored apart
+TAG_ENUM = "__enum__"  # "<qualified name of the enum class>.<member name>"
+
+
+class Serializer:
+  """
+  Turns objects into JSON friendly data (dict / list / primitives) and back.
+
+  Objects and enum members are numbered in order of first appearance. When the same instance
+  is met again, it is written as {TAG_ID: number}. Deserialization rebuilds the numbering in
+  self.objs, so both directions must register the same values, in the same order.
+  """
+
+  def __init__(self, ref_helper=None):
+    """
+    :param ref_helper: object providing save_ref(obj) and load_ref(digest), used for TAG_REF
+    """
+    self.ref_helper = ref_helper
+
+    self.ids = {}  # id(obj) -> number given to obj while serializing
+    self.objs = []  # registered values, indexed by their number
+    self.id_count = 0
+
+  def serialize(self, obj, use_refs=None):
+    """
+    From object to dictionary
+    :param obj:
+    :param use_refs: Sometimes it easier / quicker to use pickle !
+                     Paths (relative to obj, like "attr.sub_attr") of the values to save as refs
+    :return:
+    """
+    if use_refs:
+      use_refs = {"root." + path for path in use_refs}
+
+    return self._serialize(obj, use_refs or set(), "root")
+
+  def deserialize(self, obj: dict):
+    """
+    From dictionary to object (or primitive)
+    :param obj:
+    :return:
+    """
+    if has_tag(obj, TAG_REF):
+      return self.ref_helper.load_ref(obj[TAG_REF])
+
+    if has_tag(obj, TAG_ID):
+      return self._restore_id(obj)
+
+    if has_tag(obj, TAG_TUPLE):
+      return tuple(self.deserialize(v) for v in obj[TAG_TUPLE])
+
+    if has_tag(obj, TAG_SET):
+      return {self.deserialize(v) for v in obj[TAG_SET]}
+
+    if has_tag(obj, TAG_ENUM):
+      return self._deserialize_enum(obj)
+
+    if has_tag(obj, TAG_OBJECT):
+      return self._deserialize_obj_instance(obj)
+
+    if (handler := handlers.get_handler(obj)) is not None:
+      # values managed by a handler got a number in _serialize_obj_instance():
+      # register them too, otherwise the following TAG_ID point to the wrong object
+      value = handler.deserialize(obj)
+      self.objs.append(value)
+      return value
+
+    if is_list(obj):
+      return [self.deserialize(v) for v in obj]
+
+    if is_dictionary(obj):
+      return {k: self.deserialize(v) for k, v in obj.items()}
+
+    return obj
+
+  def _serialize(self, obj, use_refs: set | None, path):
+    """
+    Serialize one value
+    :param obj:
+    :param use_refs: full paths ("root.attr...") of the values to save as refs
+    :param path: path of obj. It only changes when going down into the attributes of an object
+    :return:
+    """
+    if use_refs is not None and path in use_refs:
+      digest = self.ref_helper.save_ref(obj)
+      return {TAG_REF: digest}
+
+    if is_primitive(obj):
+      return obj
+
+    if is_tuple(obj):
+      return {TAG_TUPLE: [self._serialize(v, use_refs, path) for v in obj]}
+
+    if is_set(obj):
+      return {TAG_SET: [self._serialize(v, use_refs, path) for v in obj]}
+
+    if is_list(obj):
+      return [self._serialize(v, use_refs, path) for v in obj]
+
+    if is_dictionary(obj):
+      return {k: self._serialize(v, use_refs, path) for k, v in obj.items()}
+
+    if is_enum(obj):
+      return self._serialize_enum(obj, use_refs, path)
+
+    if is_object(obj):
+      return self._serialize_obj_instance(obj, use_refs, path)
+
+    raise Exception(f"Cannot serialize '{obj}'")
+
+  def _serialize_enum(self, obj, use_refs: set | None, path):
+    """
+    Serialize an enum member as {TAG_ENUM: "<class name>.<member name>"}
+    use_refs and path are not used; they keep the signature aligned with the other serializers
+    """
+    # check if the object was already seen
+    if (seen := self._check_already_seen(obj)) is not None:
+      return seen
+
+    return {TAG_ENUM: get_full_qualified_name(obj) + "." + obj.name}
+
+  def _serialize_obj_instance(self, obj, use_refs: set | None, path):
+    """
+    Serialize an object: through its handler when there is one, else by flattening its __dict__
+    """
+    # check if the object was already seen
+    if (seen := self._check_already_seen(obj)) is not None:
+      return seen
+
+    # try to manage use_refs: the object may ask for some of its attributes to be saved as refs
+    current_obj_use_refs = obj.use_refs() if hasattr(obj, "use_refs") else None
+    if current_obj_use_refs:
+      use_refs.update(f"{path}.{sub_path}" for sub_path in current_obj_use_refs)
+
+    if (handler := handlers.get_handler(obj)) is not None:
+      return handler.serialize(obj)
+
+    # flatten
+    data = {TAG_OBJECT: importable_name(obj.__class__)}
+
+    if hasattr(obj, "__dict__"):
+      for k, v in obj.__dict__.items():
+        data[k] = self._serialize(v, use_refs, f"{path}.{k}")
+
+    return data
+
+  def _check_already_seen(self, obj):
+    """
+    Gives the {TAG_ID: number} of an object already met, or registers the object
+    :param obj:
+    :return: the TAG_ID dictionary, or None if the object has just been registered
+    """
+    _id = self._exist(obj)
+    if _id is not None:
+      return {TAG_ID: _id}
+
+    # else: give it the next number. Keeping obj in self.objs also keeps it alive,
+    # so its id() cannot be reused by another object
+    self.ids[id(obj)] = self.id_count
+    self.objs.append(obj)
+    self.id_count = self.id_count + 1
+
+    return None
+
+  def _deserialize_enum(self, obj):
+    """
+    Rebuild an enum member from {TAG_ENUM: "<class name>.<member name>"}
+    """
+    cls_name, enum_name = obj[TAG_ENUM].rsplit(".", 1)
+    cls = get_class(cls_name)
+    member = getattr(cls, enum_name)
+    self.objs.append(member)
+    return member
+
+  def _deserialize_obj_instance(self, obj):
+    """
+    Rebuild an object flattened by _serialize_obj_instance()
+    The instance is created without calling __init__, then its attributes are restored
+    """
+    cls = get_class(obj[TAG_OBJECT])
+    instance = cls.__new__(cls)
+    # registered before its attributes, as they may refer to it
+    self.objs.append(instance)
+
+    for k, v in obj.items():
+      if k == TAG_OBJECT:
+        # the class name is serialization metadata, not an attribute of the instance
+        continue
+      setattr(instance, k, self.deserialize(v))
+
+    return instance
+
+  def _restore_id(self, obj):
+    """
+    Gives the value already deserialized for a {TAG_ID: number}
+    :return: the value, or None when the number is unknown
+    """
+    try:
+      return self.objs[obj[TAG_ID]]
+    except IndexError:
+      return None
+
+  def _exist(self, obj):
+    """
+    Gives the number of an object already registered, or None
+    """
+    return self.ids.get(id(obj))
+
+
+class DebugSerializer(Serializer):
+  """
+  Serializer that deserializes without importing the classes:
+  objects stay dictionaries and enum members are reduced to their name.
+  """
+
+  def _deserialize_obj_instance(self, obj):
+    # the dictionary is registered before it is filled, as in the parent class
+    restored = {TAG_OBJECT: obj[TAG_OBJECT]}
+    self.objs.append(restored)
+
+    for attr_name, raw_value in obj.items():
+      restored[attr_name] = self.deserialize(raw_value)
+
+    return restored
+
+  def _deserialize_enum(self, obj):
+    _, member_name = obj[TAG_ENUM].rsplit(".", 1)
+    self.objs.append(member_name)
+    return member_name
+
+  def _restore_id(self, obj):
+    # a copy is returned, not the registered value itself
+    try:
+      return copy.deepcopy(self.objs[obj[TAG_ID]])
+    except IndexError:
+      return None
--- a/src/core/utils.py
+++ b/src/core/utils.py
@@ -0,0 +1,195 @@
+import ast
+import hashlib
+import importlib
+import types
+from enum import Enum
+
+# Types recognized by is_primitive()
+PRIMITIVES = (str, bool, type(None), int, float)
+
+
+def get_stream_digest(stream):
+  """
+  Hash the whole content of a binary stream with SHA256
+  The stream is rewound first, then read by blocks of 4096 bytes until its end
+  :param stream: seekable binary stream
+  :return: hexadecimal digest
+  :rtype: str
+  """
+  hasher = hashlib.sha256()
+  stream.seek(0)
+  while (block := stream.read(4096)) != b"":
+    hasher.update(block)
+
+  return hasher.hexdigest()
+
+
+def has_tag(obj, tag):
+  """
+  Tell whether obj is a plain dict (subclasses of dict are rejected) holding the key tag
+  :param obj: anything
+  :param tag: key to look for
+  :return: True or False
+  """
+  if type(obj) is not dict:
+    return False
+
+  return tag in obj
+
+
+def is_primitive(obj):
+  """
+  Tell whether obj is an instance of one of the PRIMITIVES types
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, PRIMITIVES)
+
+
+def is_dictionary(obj):
+  """
+  Tell whether obj is a dict (or an instance of a subclass of dict)
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, dict)
+
+
+def is_list(obj):
+  """
+  Tell whether obj is a list (or an instance of a subclass of list)
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, list)
+
+
+def is_set(obj):
+  """
+  Tell whether obj is a set (or an instance of a subclass of set)
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, set)
+
+
+def is_tuple(obj):
+  """
+  Tell whether obj is a tuple (or an instance of a subclass of tuple)
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, tuple)
+
+
+def is_enum(obj):
+  """
+  Tell whether obj is a member of an Enum
+  :param obj: value to test
+  :return: True or False
+  """
+  return isinstance(obj, Enum)
+
+
+def is_object(obj):
+  """Returns True if obj is an object instance: not a class, a function, a builtin function or a generator."""
+  not_instances = (type, types.FunctionType, types.BuiltinFunctionType, types.GeneratorType)
+  return not isinstance(obj, not_instances)
+
+
+def get_full_qualified_name(obj):
+  """
+  Returns the full qualified name of a class (including its module name)
+  Builtins are reported without their module
+  :param obj: a class, or an instance (its class is then used)
+  :return: "<module>.<class name>", or "<class name>" for a builtin
+  """
+  # isinstance() also recognizes the classes created by a metaclass (Enum, ABC...).
+  # Comparing obj.__class__ with type would take them for instances and report the metaclass
+  cls = obj if isinstance(obj, type) else obj.__class__
+
+  module = cls.__module__
+  if module is None or module == type.__module__:
+    return cls.__name__  # Avoid reporting __builtin__
+
+  return module + '.' + cls.__name__
+
+
+def importable_name(cls):
+  """
+  Name of a class, prefixed by its module ("builtins" for the builtin classes)
+  :param cls: the class
+  :return: "<module>.<qualified name>"
+  """
+  # __qualname__ when the class has one, __name__ otherwise
+  qualified_name = getattr(cls, '__qualname__', cls.__name__)
+
+  # modules that only existed in python 2 are reported as "builtins"
+  python2_modules = {"__builtin__": "builtins", "exceptions": "builtins"}
+  module_name = python2_modules.get(cls.__module__, cls.__module__)
+
+  return f"{module_name}.{qualified_name}"
+
+
+def get_class(qualified_class_name: str):
+  """
+    Dynamically loads and returns a class type from its fully qualified name.
+    Note that the class is not instantiated.
+
+    Nested classes ('some.module.Outer.Inner') are supported: the longest prefix that can be
+    imported is used as the module, and the remaining parts are resolved as attributes.
+
+    :param qualified_class_name: Fully qualified name of the class (e.g., 'some.module.ClassName').
+    :return: The class object.
+    :raises ValueError: If the name does not contain a module part.
+    :raises ImportError: If the module cannot be imported.
+    :raises AttributeError: If the class cannot be resolved in the module.
+    """
+  parts = qualified_class_name.split(".")
+  if len(parts) < 2:
+    raise ValueError(f"'{qualified_class_name}' is not a qualified name (expected 'module.ClassName').")
+
+  module = None
+  failure = None  # (module name, error) of the first import that failed
+  split_at = len(parts) - 1
+  for split_at in range(len(parts) - 1, 0, -1):
+    module_name = ".".join(parts[:split_at])
+    try:
+      module = importlib.import_module(module_name)
+    except ModuleNotFoundError as e:
+      if failure is None:
+        failure = (module_name, e)
+      missing = e.name or ""
+      if module_name == missing or module_name.startswith(missing + "."):
+        # the prefix itself is not a module: its last part may be an enclosing class
+        continue
+      # the module exists, but one of its own imports is missing
+      break
+    else:
+      break
+
+  if module is None:
+    module_name, error = failure
+    raise ImportError(f"Could not import module '{module_name}' for '{qualified_class_name}': {error}") from error
+
+  # resolve what follows the module, one attribute at a time
+  component = module
+  for attr_name in parts[split_at:]:
+    if not hasattr(component, attr_name):
+      owner = getattr(component, "__name__", component)
+      raise AttributeError(f"Component '{attr_name}' not found in '{owner}'.")
+    component = getattr(component, attr_name)
+
+  return component
+
+
+class UnreferencedNamesVisitor(ast.NodeVisitor):
+  """
+  Try to find symbols that will be requested by the ast
+  It can be variable names, but also the names of the keyword arguments of the calls
+  """
+
+  def __init__(self):
+    self.names = set()
+
+  def get_names(self, node):
+    """
+    Visit the node and give all the names found so far
+    :param node: ast to visit
+    :return: set of names (the same set is kept and completed between calls)
+    """
+    self.visit(node)
+    return self.names
+
+  def visit_Name(self, node):
+    self.names.add(node.id)
+
+  def visit_For(self, node: ast.For):
+    # only the body and the else clause: the loop target and the iterated expression are skipped
+    self.visit_selected(node, ["body", "orelse"])
+
+  def visit_selected(self, node, to_visit):
+    """
+    Visit only some fields of a node
+    :param node:
+    :param to_visit: names of the fields to visit
+    """
+    for field in to_visit:
+      value = getattr(node, field)
+      if isinstance(value, list):
+        for item in value:
+          if isinstance(item, ast.AST):
+            self.visit(item)
+      elif isinstance(value, ast.AST):
+        self.visit(value)
+
+  def visit_Call(self, node: ast.Call):
+    # only the arguments: the expression giving the called function is skipped
+    self.visit_selected(node, ["args", "keywords"])
+
+  def visit_keyword(self, node: ast.keyword):
+    """
+    A keyword is a named argument of a call
+    ex: fun(positional, name=value, **mapping)
+    The name of the argument is recorded, then its value is visited
+    :param node:
+    :type node:
+    :return:
+    :rtype:
+    """
+    # arg is None for '**mapping': there is no name to record
+    if node.arg is not None:
+      self.names.add(node.arg)
+    self.visit_selected(node, ["value"])