"""
Document service for orchestrated file and content management.

This service coordinates between FileDocument and DocumentContent repositories
while maintaining data consistency through MongoDB transactions.
"""
import hashlib
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any

import magic
from pymongo.errors import PyMongoError

from app.config.settings import get_objects_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import (
    FileDocument,
    FileType,
)
from app.models.types import PyObjectId
class DocumentService:
    """
    Service for orchestrated document and content management.

    Document records are persisted through ``FileDocumentRepository`` while the
    raw file bytes are written to disk below ``objects_folder``, addressed by
    their SHA256 hash so that identical content is stored only once.

    NOTE(review): this class does not open a database transaction itself; the
    filesystem write and the repository call are two independent steps.
    """

    def __init__(self, database, objects_folder: Optional[str] = None):
        """
        Initialize the document service with repository dependencies.

        Args:
            database: Database instance
            objects_folder: Folder to store files by their hash. When omitted
                (or empty), the value returned by ``get_objects_folder()`` is
                used.
        """
        self.db = database
        self.document_repository = FileDocumentRepository(self.db)
        self.objects_folder = objects_folder or get_objects_folder()

    async def initialize(self):
        """
        Initialize the underlying document repository.

        Returns:
            This service instance, so the call can be chained
        """
        await self.document_repository.initialize()
        return self

    @staticmethod
    def _calculate_file_hash(file_bytes: bytes) -> str:
        """
        Calculate SHA256 hash of file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            Hexadecimal SHA256 hash string
        """
        return hashlib.sha256(file_bytes).hexdigest()

    @staticmethod
    def _detect_file_type(file_path: str) -> FileType:
        """
        Detect file type from file extension.

        The extension is lower-cased and stripped of its leading dot before
        being looked up in ``FileType``.

        Args:
            file_path: Path to the file

        Returns:
            FileType enum value

        Raises:
            ValueError: If file type is not supported
        """
        extension = Path(file_path).suffix.lower().lstrip('.')

        try:
            return FileType(extension)
        except ValueError as exc:
            # Keep the original lookup failure attached as the cause.
            raise ValueError(f"Unsupported file type: {extension}") from exc

    @staticmethod
    def _detect_mime_type(file_bytes: bytes) -> str:
        """
        Detect MIME type from file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            MIME type string
        """
        return magic.from_buffer(file_bytes, mime=True)

    def _get_document_path(self, file_hash: str) -> str:
        """
        Build the on-disk path of the content identified by ``file_hash``.

        The layout is ``<objects_folder>/<first 24 chars of hash>/<hash>``.

        Args:
            file_hash: Hash string identifying the content

        Returns:
            Path of the content file (the file may or may not exist)
        """
        return os.path.join(self.objects_folder, file_hash[:24], file_hash)

    def save_content_if_needed(self, file_hash, content: bytes):
        """
        Store ``content`` on disk unless a file for ``file_hash`` already exists.

        The bytes are first written to a temporary file in the destination
        directory and then renamed into place, so a failed or interrupted
        write never leaves a truncated file at the final path (which later
        calls would otherwise mistake for valid, already-stored content).

        Args:
            file_hash: Hash string identifying the content
            content: Raw bytes to store
        """
        target_path = self._get_document_path(file_hash)
        if os.path.exists(target_path):
            return

        # exist_ok avoids failing when another writer creates the directory
        # between a check and the creation.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        # Random suffix keeps concurrent writers of the same hash apart.
        tmp_path = f"{target_path}.{os.urandom(8).hex()}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                f.write(content)
            os.replace(tmp_path, target_path)
        except BaseException:
            # Do not leave a partial temp file behind; then re-raise.
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # temp file was never created or is already gone
            raise

    async def create_document(
        self,
        file_path: str,
        file_bytes: bytes,
        encoding: str = "utf-8"
    ) -> FileDocument:
        """
        Create a new document with automatic content deduplication.

        The file bytes are written to the objects folder only when no file for
        the same SHA256 hash exists yet; a new FileDocument is created through
        the repository in every case.

        Args:
            file_path: Full path to the file
            file_bytes: Raw file content as bytes
            encoding: Character encoding, stored on the document as given

        Returns:
            Created FileDocument instance

        Raises:
            ValueError: If file type is not supported
            PyMongoError: If storing the content or the database operation
                fails (the original exception is attached as the cause)
        """
        # Calculate automatic attributes
        file_hash = self._calculate_file_hash(file_bytes)
        file_type = self._detect_file_type(file_path)
        mime_type = self._detect_mime_type(file_bytes)
        file_size = len(file_bytes)
        filename = Path(file_path).name
        detected_at = datetime.now()  # naive local time (no tzinfo)

        try:
            self.save_content_if_needed(file_hash, file_bytes)

            # Create FileDocument
            file_data = FileDocument(
                filename=filename,
                filepath=file_path,
                file_type=file_type,
                extraction_method=None,  # Will be set by processing workers
                metadata={},  # Empty for now
                detected_at=detected_at,
                file_hash=file_hash,
                encoding=encoding,
                file_size=file_size,
                mime_type=mime_type
            )

            created_file = await self.document_repository.create_document(file_data)

            return created_file

        except Exception as e:
            # Every failure in this step is reported as PyMongoError; the
            # original exception stays reachable through __cause__.
            raise PyMongoError(f"Failed to create document: {str(e)}") from e

    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
        """
        Retrieve a document by its ID.

        Args:
            document_id: Document ObjectId

        Returns:
            FileDocument if found, None otherwise
        """
        # NOTE(review): the id is stringified here, while delete_document
        # passes it to the same repository method unchanged - confirm which
        # form the repository expects.
        return await self.document_repository.find_document_by_id(str(document_id))

    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.document_repository.find_document_by_hash(file_hash)

    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file path.

        Args:
            filepath: Full path to the file

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.document_repository.find_document_by_filepath(filepath)

    async def get_document_content_by_hash(self, file_hash):
        """
        Read the stored content for ``file_hash`` from the objects folder.

        The file is opened directly instead of being checked for existence
        first, so a file removed between a check and the read cannot raise.
        The read itself is a blocking file operation.

        Args:
            file_hash: Hash string identifying the content

        Returns:
            The file content as bytes, or None when no such file exists
        """
        target_path = self._get_document_path(file_hash)
        try:
            with open(target_path, "rb") as f:
                return f.read()
        except (FileNotFoundError, NotADirectoryError):
            return None

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
    ) -> List[FileDocument]:
        """
        List documents with pagination.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return

        Returns:
            List of FileDocument instances
        """
        return await self.document_repository.list_documents(skip=skip, limit=limit)

    async def count_documents(self) -> int:
        """
        Get total number of documents.

        Returns:
            Total document count
        """
        return await self.document_repository.count_documents()

    async def update_document(
        self,
        document_id: PyObjectId,
        update_data: Dict[str, Any]
    ) -> Optional[FileDocument]:
        """
        Update document metadata.

        When ``update_data`` contains ``"file_bytes"``, the new content is
        hashed, stored in the objects folder if needed, and ``"file_hash"`` is
        added to the data sent to the repository. The caller's dictionary is
        left untouched.

        Args:
            document_id: Document ObjectId
            update_data: Dictionary with fields to update

        Returns:
            Updated FileDocument if found, None otherwise
        """
        payload = update_data
        if "file_bytes" in update_data:
            # Copy first so the caller's dict is not modified as a side effect.
            payload = dict(update_data)
            file_hash = self._calculate_file_hash(payload["file_bytes"])
            payload["file_hash"] = file_hash
            self.save_content_if_needed(file_hash, payload["file_bytes"])
            # NOTE(review): "file_bytes" is still forwarded to the repository
            # and file_size / mime_type are not refreshed - confirm whether
            # that is intended.

        return await self.document_repository.update_document(document_id, payload)

    async def delete_document(self, document_id: PyObjectId) -> bool:
        """
        Delete a document and its orphaned content.

        The FileDocument is removed through the repository. If no remaining
        document references the same file hash afterwards, the content file is
        removed from the objects folder as well. Removing the file is
        best-effort: a failure there is logged and does not fail the call.

        Args:
            document_id: Document ObjectId

        Returns:
            True if document was deleted, False otherwise

        Raises:
            PyMongoError: If a repository operation fails (the original
                exception is attached as the cause)
        """
        try:
            # Get document to find its hash
            document = await self.document_repository.find_document_by_id(document_id)
            if not document:
                return False

            # Delete the document
            deleted = await self.document_repository.delete_document(document_id)
            if not deleted:
                return False

            # Check if content is orphaned
            remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)

            # If no other files reference this content, delete it
            if not remaining_files:
                try:
                    os.remove(self._get_document_path(document.file_hash))
                except FileNotFoundError:
                    pass  # content already gone, nothing to clean up
                except Exception as exc:
                    # The record is already deleted; keep the cleanup
                    # best-effort but leave a trace instead of hiding it.
                    logging.getLogger(__name__).warning(
                        "Could not remove orphaned content for hash %s: %s",
                        document.file_hash,
                        exc,
                    )

            return True

        except Exception as e:
            raise PyMongoError(f"Failed to delete document: {str(e)}") from e