Refactored DocumentService to save document content on the filesystem. Fixed the Docker application.

This commit is contained in:
2025-09-20 21:06:27 +02:00
parent f1b551d243
commit 9564cfadd5
28 changed files with 1577 additions and 2442 deletions

View File

@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
"""
import hashlib
import magic
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
from typing import List, Optional, Dict, Any
from motor.motor_asyncio import AsyncIOMotorClientSession
import magic
from pymongo.errors import PyMongoError
from app.database.connection import get_database
from app.config.settings import get_objects_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import (
FileDocument,
DocumentContent,
FileType,
ProcessingStatus
)
from app.models.types import PyObjectId
@@ -34,13 +31,25 @@ class DocumentService:
and their content while ensuring data consistency through transactions.
"""
def __init__(self):
"""Initialize the document service with repository dependencies."""
self.db = get_database()
self.file_repository = FileDocumentRepository(self.db)
self.content_repository = DocumentContentRepository(self.db)
def __init__(self, database, objects_folder: str = None):
    """
    Set up the service with its database handle, repository and storage folder.

    Args:
        database: Database instance; also handed to the document repository
        objects_folder: folder to store files by their hash. When omitted
            (or otherwise falsy) the value returned by get_objects_folder()
            is used instead.
    """
    self.db = database
    self.document_repository = FileDocumentRepository(database)
    if objects_folder:
        self.objects_folder = objects_folder
    else:
        # No explicit folder given: fall back to the configured one.
        self.objects_folder = get_objects_folder()
def _calculate_file_hash(self, file_bytes: bytes) -> str:
async def initialize(self):
    """
    Run the asynchronous initialization of the document repository.

    Awaits ``self.document_repository.initialize()`` and then returns this
    service instance, so the call can be chained right after construction.
    """
    repository = self.document_repository
    await repository.initialize()
    return self
@staticmethod
def _calculate_file_hash(file_bytes: bytes) -> str:
"""
Calculate SHA256 hash of file content.
@@ -52,7 +61,8 @@ class DocumentService:
"""
return hashlib.sha256(file_bytes).hexdigest()
def _detect_file_type(self, file_path: str) -> FileType:
@staticmethod
def _detect_file_type(file_path: str) -> FileType:
"""
Detect file type from file extension.
@@ -72,7 +82,8 @@ class DocumentService:
except ValueError:
raise ValueError(f"Unsupported file type: {extension}")
def _detect_mime_type(self, file_bytes: bytes) -> str:
@staticmethod
def _detect_mime_type(file_bytes: bytes) -> str:
"""
Detect MIME type from file content.
@@ -84,6 +95,25 @@ class DocumentService:
"""
return magic.from_buffer(file_bytes, mime=True)
def _get_document_path(self, file_hash):
    """
    Build the on-disk location of the object identified by ``file_hash``.

    The path is ``self.objects_folder``, then a sub-directory named after
    the first 24 characters of the hash, then a file named with the full hash.

    :param file_hash: hash string identifying the stored content
    :return: path string ``<objects_folder>/<file_hash[:24]>/<file_hash>``
    """
    segments = (self.objects_folder, file_hash[:24], file_hash)
    return os.path.join(*segments)
def save_content_if_needed(self, file_hash, content: bytes):
    """
    Store ``content`` on disk under its hash unless it is already stored.

    The destination is resolved with ``self._get_document_path(file_hash)``.
    If a file already exists there, nothing is written.

    Args:
        file_hash: hash string identifying the content
        content: raw bytes to persist

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    import uuid  # local import: only needed to build a unique temp name

    target_path = self._get_document_path(file_hash)
    if os.path.exists(target_path):
        # Same hash already stored: nothing to do.
        return

    # exist_ok=True avoids a FileExistsError when another writer creates the
    # directory between an existence check and the creation.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # Write to a unique sibling first and publish it with os.replace, so an
    # interrupted write never leaves a truncated file at target_path that
    # later calls would mistake for already-stored content.
    temp_path = f"{target_path}.{uuid.uuid4().hex}.tmp"
    try:
        with open(temp_path, "wb") as f:
            f.write(content)
        os.replace(temp_path, target_path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up the partial file left behind by a failed write.
        if os.path.exists(temp_path):
            os.remove(temp_path)
async def create_document(
self,
file_path: str,
@@ -115,50 +145,32 @@ class DocumentService:
mime_type = self._detect_mime_type(file_bytes)
file_size = len(file_bytes)
filename = Path(file_path).name
detected_at = datetime.utcnow()
detected_at = datetime.now()
# Start MongoDB transaction
async with await self.db.client.start_session() as session:
async with session.start_transaction():
try:
# Check if content already exists
existing_content = await self.content_repository.find_document_content_by_file_hash(
file_hash, session=session
)
# Create DocumentContent if it doesn't exist
if not existing_content:
content_data = DocumentContent(
file_hash=file_hash,
content="", # Will be populated by processing workers
encoding=encoding,
file_size=file_size,
mime_type=mime_type
)
await self.content_repository.create_document_content(
content_data, session=session
)
# Create FileDocument
file_data = FileDocument(
filename=filename,
filepath=file_path,
file_type=file_type,
extraction_method=None, # Will be set by processing workers
metadata={}, # Empty for now
detected_at=detected_at,
file_hash=file_hash
)
created_file = await self.file_repository.create_document(
file_data, session=session
)
return created_file
except Exception as e:
# Transaction will automatically rollback
raise PyMongoError(f"Failed to create document: {str(e)}")
try:
self.save_content_if_needed(file_hash, file_bytes)
# Create FileDocument
file_data = FileDocument(
filename=filename,
filepath=file_path,
file_type=file_type,
extraction_method=None, # Will be set by processing workers
metadata={}, # Empty for now
detected_at=detected_at,
file_hash=file_hash,
encoding=encoding,
file_size=file_size,
mime_type=mime_type
)
created_file = await self.document_repository.create_document(file_data)
return created_file
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to create document: {str(e)}")
async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
"""
@@ -170,7 +182,7 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_id(document_id)
return await self.document_repository.find_document_by_id(str(document_id))
async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
"""
@@ -182,7 +194,7 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_hash(file_hash)
return await self.document_repository.find_document_by_hash(file_hash)
async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
@@ -194,32 +206,15 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_filepath(filepath)
return await self.document_repository.find_document_by_filepath(filepath)
async def get_document_with_content(
self,
document_id: PyObjectId
) -> Optional[Tuple[FileDocument, DocumentContent]]:
"""
Retrieve a document with its associated content.
Args:
document_id: Document ObjectId
Returns:
Tuple of (FileDocument, DocumentContent) if found, None otherwise
"""
document = await self.get_document_by_id(document_id)
if not document:
async def get_document_content_by_hash(self, file_hash):
target_path = self._get_document_path(file_hash)
if not os.path.exists(target_path):
return None
content = await self.content_repository.find_document_content_by_file_hash(
document.file_hash
)
if not content:
return None
return (document, content)
with open(target_path, "rb") as f:
return f.read()
async def list_documents(
self,
@@ -236,7 +231,7 @@ class DocumentService:
Returns:
List of FileDocument instances
"""
return await self.file_repository.list_documents(skip=skip, limit=limit)
return await self.document_repository.list_documents(skip=skip, limit=limit)
async def count_documents(self) -> int:
"""
@@ -245,7 +240,7 @@ class DocumentService:
Returns:
Total document count
"""
return await self.file_repository.count_documents()
return await self.document_repository.count_documents()
async def update_document(
self,
@@ -262,7 +257,12 @@ class DocumentService:
Returns:
Updated FileDocument if found, None otherwise
"""
return await self.file_repository.update_document(document_id, update_data)
if "file_bytes" in update_data:
file_hash = self._calculate_file_hash(update_data["file_bytes"])
update_data["file_hash"] = file_hash
self.save_content_if_needed(file_hash, update_data["file_bytes"])
return await self.document_repository.update_document(document_id, update_data)
async def delete_document(self, document_id: PyObjectId) -> bool:
"""
@@ -281,100 +281,31 @@ class DocumentService:
Raises:
PyMongoError: If database operation fails
"""
# Start MongoDB transaction
async with await self.db.client.start_session() as session:
async with session.start_transaction():
# Start transaction
try:
# Get document to find its hash
document = await self.document_repository.find_document_by_id(document_id)
if not document:
return False
# Delete the document
deleted = await self.document_repository.delete_document(document_id)
if not deleted:
return False
# Check if content is orphaned
remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)
# If no other files reference this content, delete it
if not remaining_files:
try:
# Get document to find its hash
document = await self.file_repository.find_document_by_id(
document_id, session=session
)
if not document:
return False
# Delete the document
deleted = await self.file_repository.delete_document(
document_id, session=session
)
if not deleted:
return False
# Check if content is orphaned
remaining_files = await self.file_repository.find_document_by_hash(
document.file_hash, session=session
)
# If no other files reference this content, delete it
if not remaining_files:
content = await self.content_repository.find_document_content_by_file_hash(
document.file_hash, session=session
)
if content:
await self.content_repository.delete_document_content(
content.id, session=session
)
return True
except Exception as e:
# Transaction will automatically rollback
raise PyMongoError(f"Failed to delete document: {str(e)}")
async def content_exists(self, file_hash: str) -> bool:
"""
Check if content with given hash exists.
Args:
file_hash: SHA256 hash of file content
Returns:
True if content exists, False otherwise
"""
return await self.content_repository.content_exists(file_hash)
async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
"""
Retrieve content by file hash.
Args:
file_hash: SHA256 hash of file content
Returns:
DocumentContent if found, None otherwise
"""
return await self.content_repository.find_document_content_by_file_hash(file_hash)
async def update_document_content(
self,
file_hash: str,
content: str,
encoding: str = "utf-8"
) -> Optional[DocumentContent]:
"""
Update the extracted content for a document.
This method is typically called by processing workers to store
the extracted text content.
Args:
file_hash: SHA256 hash of file content
content: Extracted text content
encoding: Character encoding
Returns:
Updated DocumentContent if found, None otherwise
"""
existing_content = await self.content_repository.find_document_content_by_file_hash(
file_hash
)
if not existing_content:
return None
os.remove(self._get_document_path(document.file_hash))
except Exception:
pass
return True
update_data = {
"content": content,
"encoding": encoding
}
return await self.content_repository.update_document_content(
existing_content.id, update_data
)
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to delete document: {str(e)}")