Refactored DocumentService to save document in the filesystem. Fixed docker application
This commit is contained in:
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import magic
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClientSession
|
||||
import magic
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.database.connection import get_database
|
||||
from app.config.settings import get_objects_folder
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
DocumentContent,
|
||||
FileType,
|
||||
ProcessingStatus
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
@@ -34,13 +31,25 @@ class DocumentService:
|
||||
and their content while ensuring data consistency through transactions.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the document service with repository dependencies."""
|
||||
self.db = get_database()
|
||||
self.file_repository = FileDocumentRepository(self.db)
|
||||
self.content_repository = DocumentContentRepository(self.db)
|
||||
def __init__(self, database, objects_folder: str = None):
|
||||
"""
|
||||
Initialize the document service with repository dependencies.
|
||||
|
||||
Args:
|
||||
database: Database instance
|
||||
objects_folder: folder to store files by their hash
|
||||
"""
|
||||
|
||||
self.db = database
|
||||
self.document_repository = FileDocumentRepository(self.db)
|
||||
self.objects_folder = objects_folder or get_objects_folder()
|
||||
|
||||
def _calculate_file_hash(self, file_bytes: bytes) -> str:
|
||||
async def initialize(self):
|
||||
await self.document_repository.initialize()
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def _calculate_file_hash(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Calculate SHA256 hash of file content.
|
||||
|
||||
@@ -52,7 +61,8 @@ class DocumentService:
|
||||
"""
|
||||
return hashlib.sha256(file_bytes).hexdigest()
|
||||
|
||||
def _detect_file_type(self, file_path: str) -> FileType:
|
||||
@staticmethod
|
||||
def _detect_file_type(file_path: str) -> FileType:
|
||||
"""
|
||||
Detect file type from file extension.
|
||||
|
||||
@@ -72,7 +82,8 @@ class DocumentService:
|
||||
except ValueError:
|
||||
raise ValueError(f"Unsupported file type: {extension}")
|
||||
|
||||
def _detect_mime_type(self, file_bytes: bytes) -> str:
|
||||
@staticmethod
|
||||
def _detect_mime_type(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Detect MIME type from file content.
|
||||
|
||||
@@ -84,6 +95,25 @@ class DocumentService:
|
||||
"""
|
||||
return magic.from_buffer(file_bytes, mime=True)
|
||||
|
||||
def _get_document_path(self, file_hash):
|
||||
"""
|
||||
|
||||
:param file_hash:
|
||||
:return:
|
||||
"""
|
||||
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
|
||||
|
||||
def save_content_if_needed(self, file_hash, content: bytes):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if os.path.exists(target_path):
|
||||
return
|
||||
|
||||
if not os.path.exists(os.path.dirname(target_path)):
|
||||
os.makedirs(os.path.dirname(target_path))
|
||||
|
||||
with open(target_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
async def create_document(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -115,50 +145,32 @@ class DocumentService:
|
||||
mime_type = self._detect_mime_type(file_bytes)
|
||||
file_size = len(file_bytes)
|
||||
filename = Path(file_path).name
|
||||
detected_at = datetime.utcnow()
|
||||
detected_at = datetime.now()
|
||||
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
try:
|
||||
# Check if content already exists
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash, session=session
|
||||
)
|
||||
|
||||
# Create DocumentContent if it doesn't exist
|
||||
if not existing_content:
|
||||
content_data = DocumentContent(
|
||||
file_hash=file_hash,
|
||||
content="", # Will be populated by processing workers
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
await self.content_repository.create_document_content(
|
||||
content_data, session=session
|
||||
)
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash
|
||||
)
|
||||
|
||||
created_file = await self.file_repository.create_document(
|
||||
file_data, session=session
|
||||
)
|
||||
|
||||
return created_file
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
try:
|
||||
self.save_content_if_needed(file_hash, file_bytes)
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash,
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
|
||||
created_file = await self.document_repository.create_document(file_data)
|
||||
|
||||
return created_file
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
|
||||
async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -170,7 +182,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_id(document_id)
|
||||
return await self.document_repository.find_document_by_id(str(document_id))
|
||||
|
||||
async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -182,7 +194,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_hash(file_hash)
|
||||
return await self.document_repository.find_document_by_hash(file_hash)
|
||||
|
||||
async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -194,32 +206,15 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_filepath(filepath)
|
||||
return await self.document_repository.find_document_by_filepath(filepath)
|
||||
|
||||
async def get_document_with_content(
|
||||
self,
|
||||
document_id: PyObjectId
|
||||
) -> Optional[Tuple[FileDocument, DocumentContent]]:
|
||||
"""
|
||||
Retrieve a document with its associated content.
|
||||
|
||||
Args:
|
||||
document_id: Document ObjectId
|
||||
|
||||
Returns:
|
||||
Tuple of (FileDocument, DocumentContent) if found, None otherwise
|
||||
"""
|
||||
document = await self.get_document_by_id(document_id)
|
||||
if not document:
|
||||
async def get_document_content_by_hash(self, file_hash):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if not os.path.exists(target_path):
|
||||
return None
|
||||
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash
|
||||
)
|
||||
if not content:
|
||||
return None
|
||||
|
||||
return (document, content)
|
||||
with open(target_path, "rb") as f:
|
||||
return f.read()
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
@@ -236,7 +231,7 @@ class DocumentService:
|
||||
Returns:
|
||||
List of FileDocument instances
|
||||
"""
|
||||
return await self.file_repository.list_documents(skip=skip, limit=limit)
|
||||
return await self.document_repository.list_documents(skip=skip, limit=limit)
|
||||
|
||||
async def count_documents(self) -> int:
|
||||
"""
|
||||
@@ -245,7 +240,7 @@ class DocumentService:
|
||||
Returns:
|
||||
Total document count
|
||||
"""
|
||||
return await self.file_repository.count_documents()
|
||||
return await self.document_repository.count_documents()
|
||||
|
||||
async def update_document(
|
||||
self,
|
||||
@@ -262,7 +257,12 @@ class DocumentService:
|
||||
Returns:
|
||||
Updated FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.update_document(document_id, update_data)
|
||||
if "file_bytes" in update_data:
|
||||
file_hash = self._calculate_file_hash(update_data["file_bytes"])
|
||||
update_data["file_hash"] = file_hash
|
||||
self.save_content_if_needed(file_hash, update_data["file_bytes"])
|
||||
|
||||
return await self.document_repository.update_document(document_id, update_data)
|
||||
|
||||
async def delete_document(self, document_id: PyObjectId) -> bool:
|
||||
"""
|
||||
@@ -281,100 +281,31 @@ class DocumentService:
|
||||
Raises:
|
||||
PyMongoError: If database operation fails
|
||||
"""
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
# Start transaction
|
||||
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.document_repository.find_document_by_id(document_id)
|
||||
if not document:
|
||||
return False
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.document_repository.delete_document(document_id)
|
||||
if not deleted:
|
||||
return False
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.file_repository.find_document_by_id(
|
||||
document_id, session=session
|
||||
)
|
||||
if not document:
|
||||
return False
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.file_repository.delete_document(
|
||||
document_id, session=session
|
||||
)
|
||||
if not deleted:
|
||||
return False
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.file_repository.find_document_by_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
if content:
|
||||
await self.content_repository.delete_document_content(
|
||||
content.id, session=session
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
async def content_exists(self, file_hash: str) -> bool:
|
||||
"""
|
||||
Check if content with given hash exists.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
True if content exists, False otherwise
|
||||
"""
|
||||
return await self.content_repository.content_exists(file_hash)
|
||||
|
||||
async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Retrieve content by file hash.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
DocumentContent if found, None otherwise
|
||||
"""
|
||||
return await self.content_repository.find_document_content_by_file_hash(file_hash)
|
||||
|
||||
async def update_document_content(
|
||||
self,
|
||||
file_hash: str,
|
||||
content: str,
|
||||
encoding: str = "utf-8"
|
||||
) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Update the extracted content for a document.
|
||||
|
||||
This method is typically called by processing workers to store
|
||||
the extracted text content.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
content: Extracted text content
|
||||
encoding: Character encoding
|
||||
|
||||
Returns:
|
||||
Updated DocumentContent if found, None otherwise
|
||||
"""
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash
|
||||
)
|
||||
if not existing_content:
|
||||
return None
|
||||
os.remove(self._get_document_path(document.file_hash))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return True
|
||||
|
||||
update_data = {
|
||||
"content": content,
|
||||
"encoding": encoding
|
||||
}
|
||||
|
||||
return await self.content_repository.update_document_content(
|
||||
existing_content.id, update_data
|
||||
)
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
Reference in New Issue
Block a user