first commit

This commit is contained in:
2025-10-17 21:08:20 +02:00
commit 878064b140
20 changed files with 1871 additions and 0 deletions

0
src/__init__.py Normal file
View File

0
src/core/__init__.py Normal file
View File

391
src/core/dbengine.py Normal file
View File

@@ -0,0 +1,391 @@
import datetime
import hashlib
import io
import json
import logging
import os
import pickle
from threading import RLock
from core.serializer import Serializer
from core.utils import get_stream_digest
# NOTE(review): TYPE_KEY is not referenced in the visible code - confirm it is still needed.
TYPE_KEY = "__type__"
# Metadata keys that DbEngine.save() adds next to the serialized payload of a snapshot.
TAG_PARENT = "__parent__"  # one-element list holding the digest of the previous head (None if there was none)
TAG_USER = "__user_id__"  # id of the user who saved the snapshot
TAG_DATE = "__date__"  # date/time at which the snapshot was saved
# Chunk size (in bytes) used by RefHelper.save_ref() when copying a pickled object to disk.
BUFFER_SIZE = 4096
logger = logging.getLogger(__name__)
class DbException(Exception):
    """Error raised by the DB engine (missing entry, missing key, invalid argument)."""
class RefHelper:
    """
    Stores and retrieves pickled objects ("references") in content-addressed files.

    The file location of a reference is delegated to the ``get_ref_path`` callable,
    which receives the digest of the pickled object and returns the target path.
    """

    def __init__(self, get_ref_path):
        """
        :param get_ref_path: callable taking a digest and returning the path of the reference file
        """
        self.get_ref_path = get_ref_path

    def save_ref(self, obj):
        """
        Pickle an object and store it under the digest of its pickled form.

        :param obj: object to pickle
        :return: the digest identifying the stored reference
        """
        buffer = io.BytesIO()
        pickle.Pickler(buffer).dump(obj)
        digest = get_stream_digest(buffer)

        target_path = self.get_ref_path(digest)
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            # exist_ok avoids failing when the folder is created between the check and the call
            os.makedirs(parent_dir, exist_ok=True)

        buffer.seek(0)
        with open(target_path, "wb") as file:
            while chunk := buffer.read(BUFFER_SIZE):
                file.write(chunk)

        logger.debug(f"Saved object type '{type(obj).__name__}' with digest {digest}")
        return digest

    def load_ref(self, digest):
        """
        Load a previously saved reference.

        :param digest: digest returned by save_ref()
        :return: the unpickled object
        :raises FileNotFoundError: if no reference file exists for this digest
        """
        # NOTE(security): pickle.load executes code embedded in the file.
        # Only load references from a storage folder that is fully trusted.
        with open(self.get_ref_path(digest), 'rb') as file:
            return pickle.load(file)
class DbEngine:
"""
Personal implementation of DB engine
Inspired by the way git manage its files
Designed to keep history of the modifications
"""
ObjectsFolder = "objects" # group objects in the same folder
HeadFile = "head" # used to keep track of the latest version of all entries
def __init__(self, root: str = None):
self.root = root or ".mytools_db"
self.lock = RLock()
def is_initialized(self, tenant_id: str):
"""
:return:
"""
return os.path.exists(self._get_user_root(tenant_id))
def init(self, tenant_id: str):
"""
Make sure that the DbEngine is properly initialized
:return:
"""
if not os.path.exists(self._get_user_root(tenant_id)):
logger.debug(f"Creating root folder in {os.path.abspath(self._get_user_root(tenant_id))}.")
os.makedirs(self._get_user_root(tenant_id))
def save(self, tenant_id: str, user_id: str, entry: str, obj: object) -> str:
"""
Save a snapshot of an entry
:param tenant_id:
:param user_id:
:param entry:
:param obj: snapshot to save
:return:
"""
with self.lock:
logger.info(f"Saving {tenant_id=}, {entry=}, {obj=}")
if not tenant_id:
raise DbException("tenant_id is None")
if not user_id:
raise DbException("user_id is None")
if not entry:
raise DbException("entry is None")
# prepare the data
as_dict = self._serialize(obj)
as_dict[TAG_PARENT] = [self._get_entry_digest(tenant_id, entry)]
as_dict[TAG_USER] = user_id
as_dict[TAG_DATE] = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S %z')
# transform into a stream
as_str = json.dumps(as_dict, sort_keys=True, indent=4)
logger.debug(f"Serialized object : {as_str}")
byte_stream = as_str.encode("utf-8")
# compute the digest to know where to store it
digest = hashlib.sha256(byte_stream).hexdigest()
target_path = self._get_obj_path(tenant_id, digest)
if os.path.exists(target_path):
# the same object is already saved. Noting to do
return digest
# save the new value
if not os.path.exists(os.path.dirname(target_path)):
os.makedirs(os.path.dirname(target_path))
with open(target_path, "wb") as file:
file.write(byte_stream)
# update the head to remember where the latest entry is
self._update_head(tenant_id, entry, digest)
logger.debug(f"New head for entry '{entry}' is {digest}")
return digest
def load(self, tenant_id: str, entry, digest=None):
"""
Loads a snapshot
:param tenant_id:
:param entry:
:param digest:
:return:
"""
with self.lock:
logger.info(f"Loading {tenant_id=}, {entry=}, {digest=}")
digest_to_use = digest or self._get_entry_digest(tenant_id, entry)
logger.debug(f"Using digest {digest_to_use}.")
if digest_to_use is None:
raise DbException(entry)
target_file = self._get_obj_path(tenant_id, digest_to_use)
with open(target_file, 'r', encoding='utf-8') as file:
as_dict = json.load(file)
return self._deserialize(as_dict)
def put(self, tenant_id: str, user_id, entry, key: str, value: object):
"""
Save a specific record.
This will create a new snapshot is the record is new or different
You should not mix the usage of put_many() and save() as it's two different way to manage the db
:param user_id:
:param tenant_id:
:param entry:
:param key:
:param value:
:return:
"""
with self.lock:
logger.info(f"Adding {tenant_id=}, {entry=}, {key=}, {value=}")
try:
entry_content = self.load(tenant_id, entry)
except DbException:
entry_content = {}
# Do not save if the entry is the same
if key in entry_content:
old_value = entry_content[key]
if old_value == value:
return False
entry_content[key] = value
self.save(tenant_id, user_id, entry, entry_content)
return True
def put_many(self, tenant_id: str, user_id, entry, items: list | dict):
"""
Save a list of item as one single snapshot
A new snapshot will not be created if all the items already exist
You should not mix the usage of put_many() and save() as it's two different way to manage the db
:param tenant_id:
:param user_id:
:param entry:
:param items:
:return:
"""
with self.lock:
logger.info(f"Adding many {tenant_id=}, {entry=}, {items=}")
try:
entry_content = self.load(tenant_id, entry)
except DbException:
entry_content = {}
is_dirty = False
if isinstance(items, dict):
for key, item in items.items():
if key in entry_content and entry_content[key] == item:
continue
else:
entry_content[key] = item
is_dirty = True
else:
for item in items:
key = item.get_key()
if key in entry_content and entry_content[key] == item:
continue
else:
entry_content[key] = item
is_dirty = True
if is_dirty:
self.save(tenant_id, user_id, entry, entry_content)
return True
return False
def exists(self, tenant_id, entry: str):
"""
Tells if an entry exist
:param tenant_id:
:param entry:
:return:
"""
with self.lock:
return self._get_entry_digest(tenant_id, entry) is not None
def get(self, tenant_id: str, entry: str, key: str | None = None, digest=None):
"""
Retrieve an item from the snapshot
:param tenant_id:
:param entry:
:param key:
:param digest:
:return:
"""
with self.lock:
logger.info(f"Getting {tenant_id=}, {entry=}, {key=}, {digest=}")
entry_content = self.load(tenant_id, entry, digest)
if key is None:
# return all items as list
return [v for k, v in entry_content.items() if not k.startswith("__")]
try:
return entry_content[key]
except KeyError:
raise DbException(f"Key '{key}' not found in entry '{entry}'")
def history(self, tenant_id, entry, digest=None, max_items=1000):
"""
Gives the current digest and all its ancestors
:param tenant_id:
:param entry:
:param digest:
:param max_items:
:return:
"""
with self.lock:
logger.info(f"History for {tenant_id=}, {entry=}, {digest=}")
digest_to_use = digest or self._get_entry_digest(tenant_id, entry)
logger.debug(f"Using digest {digest_to_use}.")
count = 0
history = []
while True:
if count >= max_items or digest_to_use is None:
break
history.append(digest_to_use)
count += 1
try:
target_file = self._get_obj_path(tenant_id, digest_to_use)
with open(target_file, 'r', encoding='utf-8') as file:
as_dict = json.load(file)
digest_to_use = as_dict[TAG_PARENT][0]
except FileNotFoundError:
break
return history
def get_digest(self, tenant_id, entry):
return self._get_entry_digest(tenant_id, entry)
def _serialize(self, obj):
"""
Just call the serializer
:param obj:
:return:
"""
with self.lock:
serializer = Serializer(RefHelper(self._get_ref_path))
use_refs = getattr(obj, "use_refs")() if hasattr(obj, "use_refs") else None
return serializer.serialize(obj, use_refs)
def _deserialize(self, as_dict):
with self.lock:
serializer = Serializer(RefHelper(self._get_ref_path))
return serializer.deserialize(as_dict)
def _update_head(self, tenant_id, entry, digest):
"""
Actually dumps the snapshot in file system
:param entry:
:param digest:
:return:
"""
head_path = os.path.join(self.root, tenant_id, self.HeadFile)
# load
try:
with open(head_path, 'r') as file:
head = json.load(file)
except FileNotFoundError:
head = {}
# update
head[entry] = digest
# and save
with open(head_path, 'w') as file:
json.dump(head, file)
def _get_user_root(self, tenant_id):
return os.path.join(self.root, tenant_id)
def _get_entry_digest(self, tenant_id, entry):
"""
Search for the latest digest, for a given entry
:param entry:
:return:
"""
head_path = os.path.join(self._get_user_root(tenant_id), self.HeadFile)
try:
with open(head_path, 'r') as file:
head = json.load(file)
return head[str(entry)]
except FileNotFoundError:
return None
except KeyError:
return None
def _get_head_path(self, tenant_id: str):
"""
Location of the Head file
:return:
"""
return os.path.join(self._get_user_root(tenant_id), self.HeadFile)
def _get_obj_path(self, tenant_id, digest):
"""
Location of objects
:param digest:
:return:
"""
return os.path.join(self._get_user_root(tenant_id), "objects", digest[:24], digest)
def _get_ref_path(self, digest):
"""
Location of reference. They are not linked to the user folder
:param digest:
:return:
"""
return os.path.join(self.root, "refs", digest[:24], digest)

59
src/core/handlers.py Normal file
View File

@@ -0,0 +1,59 @@
# The complexity of some data types is delegated to specific handlers
import datetime
from core.utils import has_tag
# Key under which a handler stores its tag() in the serialized dict;
# Handlers.get_handler() reads it to find the handler able to deserialize the dict.
TAG_SPECIAL = "__special__"
class BaseHandler:
    """
    Base class of the handlers.
    Every method is a no-op returning None; subclasses override them.
    """

    def is_eligible_for(self, obj):
        """
        Tell whether this handler manages the given object.

        :param obj:
        :return: None in the base implementation
        """
        return None

    def tag(self):
        """
        Name identifying the handler in the serialized data.

        :return: None in the base implementation
        """
        return None

    def serialize(self, obj) -> dict:
        """
        From object to dictionary.

        :param obj:
        :return: None in the base implementation
        """
        return None

    def deserialize(self, data: dict) -> object:
        """
        From dictionary to object.

        :param data:
        :return: None in the base implementation
        """
        return None
class DateHandler(BaseHandler):
    """
    Handler for datetime.date values, stored as year / month / day.

    NOTE(review): datetime.datetime is a subclass of datetime.date, so datetime
    values are eligible too and only their date part is kept - confirm this is intended.
    """

    def is_eligible_for(self, obj):
        """
        :param obj:
        :return: True if obj is a datetime.date (or an instance of a subclass)
        """
        return isinstance(obj, datetime.date)

    def tag(self):
        """
        :return: the tag identifying this handler in the serialized data
        """
        return "Date"

    def serialize(self, obj):
        """
        :param obj: the date to serialize
        :return: dictionary holding the handler tag and the year, month and day of the date
        """
        payload = {TAG_SPECIAL: self.tag()}
        payload.update(year=obj.year, month=obj.month, day=obj.day)
        return payload

    def deserialize(self, data: dict) -> object:
        """
        :param data: dictionary produced by serialize()
        :return: the corresponding datetime.date
        """
        return datetime.date(data["year"], data["month"], data["day"])
class Handlers:
    """
    Registry used to look up the handler of an object or of its serialized form.
    """

    def __init__(self, handlers_):
        """
        :param handlers_: the handlers to search, in priority order
        """
        self.handlers = handlers_

    def get_handler(self, obj):
        """
        Find the handler to use for obj.

        :param obj: either a serialized dictionary carrying TAG_SPECIAL, or any other object
        :return: the handler whose tag matches TAG_SPECIAL when obj carries it,
                 otherwise the first handler eligible for obj, or None
        :raises IndexError: if obj carries a TAG_SPECIAL that no handler declares
        """
        if has_tag(obj, TAG_SPECIAL):
            # serialized form: the tag names the handler that produced it
            matching = [candidate for candidate in self.handlers if candidate.tag() == obj[TAG_SPECIAL]]
            return matching[0]

        return next((candidate for candidate in self.handlers if candidate.is_eligible_for(obj)), None)
# Module-level registry of the available handlers (only dates for now).
handlers = Handlers([DateHandler()])

201
src/core/serializer.py Normal file
View File

@@ -0,0 +1,201 @@
import copy
from core.handlers import handlers
from core.utils import has_tag, is_dictionary, is_list, is_object, is_set, is_tuple, is_primitive, importable_name, \
get_class, get_full_qualified_name, is_enum
# Keys marking the special dictionaries produced by the Serializer.
TAG_ID = "__id__"  # back-reference to an object already seen (index in first-visit order)
TAG_OBJECT = "__object__"  # importable class name of a flattened object instance
TAG_TUPLE = "__tuple__"  # list of the serialized items of a tuple
TAG_SET = "__set__"  # list of the serialized items of a set
TAG_REF = "__ref__"  # digest of an object stored through the ref helper
TAG_ENUM = "__enum__"  # "<qualified class name>.<member name>" of an enum member
class Serializer:
    """
    Transforms objects into JSON-compatible structures (and back).

    Object instances and enum members are numbered in the order they are first
    visited. When one is met again, it is replaced by a ``{TAG_ID: number}``
    back-reference, which keeps shared and cyclic references intact.
    The deserializer rebuilds the same numbering in ``objs``, so both sides
    must register exactly the same objects in the same order.
    """

    def __init__(self, ref_helper=None):
        """
        :param ref_helper: helper used to save / load the parts stored as references
        """
        self.ref_helper = ref_helper
        self.ids = {}  # id(obj) -> number given at the first visit (serialization)
        self.objs = []  # registered objects, indexed by their number
        self.id_count = 0

    def serialize(self, obj, use_refs=None):
        """
        From object to dictionary

        :param obj:
        :param use_refs: attribute paths (relative to obj) to store through the ref helper.
                         Sometimes it is easier / quicker to use pickle !
        :return:
        """
        if use_refs:
            use_refs = set("root." + path for path in use_refs)

        return self._serialize(obj, use_refs or set(), "root")

    def deserialize(self, obj: dict):
        """
        From dictionary to object (or primitive)

        :param obj:
        :return:
        """
        if has_tag(obj, TAG_REF):
            return self.ref_helper.load_ref(obj[TAG_REF])

        if has_tag(obj, TAG_ID):
            return self._restore_id(obj)

        if has_tag(obj, TAG_TUPLE):
            return tuple(self.deserialize(v) for v in obj[TAG_TUPLE])

        if has_tag(obj, TAG_SET):
            return {self.deserialize(v) for v in obj[TAG_SET]}

        if has_tag(obj, TAG_ENUM):
            return self._deserialize_enum(obj)

        if has_tag(obj, TAG_OBJECT):
            return self._deserialize_obj_instance(obj)

        if (handler := handlers.get_handler(obj)) is not None:
            restored = handler.deserialize(obj)
            # handler-managed objects are numbered during serialization:
            # register them here as well, otherwise every following TAG_ID is shifted
            self.objs.append(restored)
            return restored

        if is_list(obj):
            return [self.deserialize(v) for v in obj]

        if is_dictionary(obj):
            return {k: self.deserialize(v) for k, v in obj.items()}

        return obj

    def _serialize(self, obj, use_refs: set | None, path):
        """
        Serialize one node of the object graph.

        :param obj:
        :param use_refs: full paths of the nodes to store through the ref helper
        :param path: path of the current node (items of a container share the path of the container)
        :return:
        :raises TypeError: if obj cannot be serialized
        """
        if use_refs is not None and path in use_refs:
            digest = self.ref_helper.save_ref(obj)
            return {TAG_REF: digest}

        if is_primitive(obj):
            return obj

        if is_tuple(obj):
            return {TAG_TUPLE: [self._serialize(v, use_refs, path) for v in obj]}

        if is_set(obj):
            return {TAG_SET: [self._serialize(v, use_refs, path) for v in obj]}

        if is_list(obj):
            return [self._serialize(v, use_refs, path) for v in obj]

        if is_dictionary(obj):
            return {k: self._serialize(v, use_refs, path) for k, v in obj.items()}

        if is_enum(obj):
            return self._serialize_enum(obj, use_refs, path)

        if is_object(obj):
            return self._serialize_obj_instance(obj, use_refs, path)

        raise TypeError(f"Cannot serialize '{obj}'")

    def _serialize_enum(self, obj, use_refs: set | None, path):
        """
        Serialize an enum member as "<qualified class name>.<member name>".

        :param obj:
        :param use_refs: not used
        :param path: not used
        :return:
        """
        # check if the object was already seen
        if (seen := self._check_already_seen(obj)) is not None:
            return seen

        return {TAG_ENUM: get_full_qualified_name(obj) + "." + obj.name}

    def _serialize_obj_instance(self, obj, use_refs: set | None, path):
        """
        Serialize an object instance, through its handler when there is one,
        otherwise by flattening its __dict__.

        :param obj:
        :param use_refs:
        :param path:
        :return:
        """
        # check if the object was already seen
        if (seen := self._check_already_seen(obj)) is not None:
            return seen

        # try to manage use_refs
        current_obj_use_refs = getattr(obj, "use_refs")() if hasattr(obj, "use_refs") else None
        if current_obj_use_refs and use_refs is not None:
            use_refs.update(f"{path}.{sub_path}" for sub_path in current_obj_use_refs)

        if (handler := handlers.get_handler(obj)) is not None:
            return handler.serialize(obj)

        # flatten
        cls = obj.__class__ if hasattr(obj, '__class__') else type(obj)
        data = {TAG_OBJECT: importable_name(cls)}

        if hasattr(obj, "__dict__"):
            for k, v in obj.__dict__.items():
                data[k] = self._serialize(v, use_refs, f"{path}.{k}")

        return data

    def _check_already_seen(self, obj):
        """
        Give the back-reference of an object already seen, or register a new object.

        :param obj:
        :return: {TAG_ID: number} if the object was already seen, else None
        """
        _id = self._exist(obj)
        if _id is not None:
            return {TAG_ID: _id}

        # first visit: give it the next number
        self.ids[id(obj)] = self.id_count
        self.objs.append(obj)
        self.id_count = self.id_count + 1
        return None

    def _deserialize_enum(self, obj):
        """
        Restore an enum member and register it.

        :param obj:
        :return:
        """
        cls_name, enum_name = obj[TAG_ENUM].rsplit(".", 1)
        member = getattr(get_class(cls_name), enum_name)
        self.objs.append(member)
        return member

    def _deserialize_obj_instance(self, obj):
        """
        Restore an object instance, without calling its __init__.

        :param obj:
        :return:
        """
        cls = get_class(obj[TAG_OBJECT])
        instance = cls.__new__(cls)
        # register before restoring the attributes, so that cyclic references can be resolved
        self.objs.append(instance)

        for k, v in obj.items():
            if k == TAG_OBJECT:
                # the class name is serialization metadata, not an attribute of the instance
                continue
            setattr(instance, k, self.deserialize(v))

        return instance

    def _restore_id(self, obj):
        """
        Resolve a back-reference.

        :param obj:
        :return: the object already restored, or None if the number is unknown
        """
        try:
            return self.objs[obj[TAG_ID]]
        except IndexError:
            # best effort: an unknown back-reference is restored as None
            return None

    def _exist(self, obj):
        """
        :param obj:
        :return: the number given to obj, or None if it was never seen
        """
        return self.ids.get(id(obj))
class DebugSerializer(Serializer):
    """
    Serializer variant whose deserialization does not import any class:
    object instances are restored as dictionaries and enum members as their name.
    """

    def __init__(self, ref_helper=None):
        super().__init__(ref_helper)

    def _deserialize_obj_instance(self, obj):
        """
        Restore an object instance as a plain dictionary (class name kept under TAG_OBJECT).

        :param obj:
        :return:
        """
        restored = {TAG_OBJECT: obj[TAG_OBJECT]}
        # registered first, like the real instances are
        self.objs.append(restored)
        for name, raw_value in obj.items():
            restored[name] = self.deserialize(raw_value)
        return restored

    def _deserialize_enum(self, obj):
        """
        Restore an enum member as its bare member name.

        :param obj:
        :return:
        """
        _, member_name = obj[TAG_ENUM].rsplit(".", 1)
        self.objs.append(member_name)
        return member_name

    def _restore_id(self, obj):
        """
        Resolve a back-reference with a deep copy of the object already restored.

        :param obj:
        :return: the copy, or None if the number is unknown
        """
        try:
            already_restored = self.objs[obj[TAG_ID]]
        except IndexError:
            return None
        return copy.deepcopy(already_restored)

195
src/core/utils.py Normal file
View File

@@ -0,0 +1,195 @@
import ast
import hashlib
import importlib
import types
from enum import Enum
# Types that is_primitive() accepts.
PRIMITIVES = (str, bool, type(None), int, float)
def get_stream_digest(stream):
    """
    Compute the SHA256 of the whole content of a binary stream.
    The stream is rewound first and is left positioned at its end.

    :param stream: seekable binary stream
    :return: hexadecimal digest
    :rtype: str
    """
    hasher = hashlib.sha256()
    stream.seek(0)
    while True:
        block = stream.read(4096)
        if block == b"":
            break
        hasher.update(block)
    return hasher.hexdigest()
def has_tag(obj, tag):
    """
    Tell whether obj is a plain dict (subclasses excluded) containing the key tag.

    :param obj:
    :param tag:
    :return:
    """
    if type(obj) is not dict:
        return False
    return tag in obj
def is_primitive(obj):
    """
    Tell whether obj is an instance of one of the PRIMITIVES types.

    :param obj:
    :return:
    """
    primitive = isinstance(obj, PRIMITIVES)
    return primitive
def is_dictionary(obj):
    """
    Tell whether obj is a dict (or an instance of a dict subclass).

    :param obj:
    :return:
    """
    return isinstance(obj, (dict,))
def is_list(obj):
    """
    Tell whether obj is a list (or an instance of a list subclass).

    :param obj:
    :return:
    """
    return isinstance(obj, (list,))
def is_set(obj):
    """
    Tell whether obj is a set (or an instance of a set subclass).

    :param obj:
    :return:
    """
    return isinstance(obj, (set,))
def is_tuple(obj):
    """
    Tell whether obj is a tuple (or an instance of a tuple subclass).

    :param obj:
    :return:
    """
    return isinstance(obj, (tuple,))
def is_enum(obj):
    """
    Tell whether obj is an enum member.

    :param obj:
    :return:
    """
    return isinstance(obj, (Enum,))
def is_object(obj):
    """Returns True if obj is neither a class, a function, a builtin function nor a generator."""
    excluded = (
        type,
        types.FunctionType,
        types.BuiltinFunctionType,
        types.GeneratorType,
    )
    # every Python value is an instance of object, only the exclusions discriminate
    return not isinstance(obj, excluded)
def get_full_qualified_name(obj):
    """
    Returns the full qualified name of a class (including its module name).
    The module is omitted for builtins.

    :param obj: a class, or an instance whose class name is wanted
    :return: "<module>.<class name>", or "<class name>" for builtins
    """
    # isinstance() also recognizes the classes built by a custom metaclass
    # (Enum, ABC, ...), whose __class__ is not 'type' itself
    cls = obj if isinstance(obj, type) else obj.__class__

    module = cls.__module__
    if module is None or module == str.__module__:
        return cls.__name__  # Avoid reporting builtins
    return module + '.' + cls.__name__
def importable_name(cls):
    """
    Name of a class prefixed by its module (builtins included): "<module>.<qualified name>"

    :param cls:
    :return:
    """
    # Python 2 module names that now live in 'builtins'
    legacy_modules = {"__builtin__": "builtins", "exceptions": "builtins"}

    # Use the fully-qualified name if available (Python >= 3.3)
    qualified = getattr(cls, '__qualname__', cls.__name__)
    module = legacy_modules.get(cls.__module__, cls.__module__)
    return f"{module}.{qualified}"
def get_class(qualified_class_name: str):
    """
    Dynamically loads and returns a class type from its fully qualified name.
    Note that the class is not instantiated.

    Nested classes are supported ('some.module.Outer.Inner'): when the part before
    the last dot is not a module, shorter module prefixes are tried and the
    remaining parts are resolved as attributes.

    :param qualified_class_name: Fully qualified name of the class (e.g., 'some.module.ClassName').
    :return: The class object.
    :raises ImportError: If the module cannot be imported.
    :raises AttributeError: If the class cannot be resolved in the module.
    """
    module_name, class_name = qualified_class_name.rsplit(".", 1)
    try:
        module = importlib.import_module(module_name)
    except ModuleNotFoundError as e:
        # maybe a nested class: look for the longest importable module prefix
        parts = qualified_class_name.split(".")
        for split_at in range(len(parts) - 2, 0, -1):
            try:
                target = importlib.import_module(".".join(parts[:split_at]))
            except ModuleNotFoundError:
                continue
            try:
                for attribute in parts[split_at:]:
                    target = getattr(target, attribute)
            except AttributeError:
                # not a nested class either: report the original import failure
                break
            return target
        raise ImportError(f"Could not import module '{module_name}' for '{qualified_class_name}': {e}") from e

    if not hasattr(module, class_name):
        raise AttributeError(f"Component '{class_name}' not found in '{module.__name__}'.")

    return getattr(module, class_name)
class UnreferencedNamesVisitor(ast.NodeVisitor):
    """
    Try to find the symbols that will be requested by the ast.

    Collects the identifier of every Name node visited, plus the names of the
    keyword arguments of calls. Some parts of the tree are deliberately skipped:
    the callee of a call, and the target and iterable of a for loop.
    """

    def __init__(self):
        self.names = set()

    def get_names(self, node):
        """
        Visit a tree and give the names found.

        :param node: root of the tree to visit
        :return: the set of names collected so far by this visitor
        """
        self.visit(node)
        return self.names

    def visit_Name(self, node):
        self.names.add(node.id)

    def visit_For(self, node: ast.For):
        # only the bodies are visited: the loop target and the iterable are skipped
        self.visit_selected(node, ["body", "orelse"])

    def visit_selected(self, node, to_visit):
        """
        Visit only the given fields of a node.

        :param node:
        :param to_visit: names of the fields to visit
        """
        for field in to_visit:
            value = getattr(node, field)
            if isinstance(value, list):
                for item in value:
                    if isinstance(item, ast.AST):
                        self.visit(item)
            elif isinstance(value, ast.AST):
                self.visit(value)

    def visit_Call(self, node: ast.Call):
        # only the arguments are visited: the callee (node.func) is skipped
        self.visit_selected(node, ["args", "keywords"])

    def visit_keyword(self, node: ast.keyword):
        """
        Keywords are the named arguments of a call
        ex: fun(positional, name=value, **others)

        The name of the argument is collected, then its value is visited.

        :param node:
        :type node:
        :return:
        :rtype:
        """
        # node.arg is None for the '**others' form: there is no name to collect
        if node.arg is not None:
            self.names.add(node.arg)
        self.visit_selected(node, ["value"])