Adding document service

This commit is contained in:
2025-09-19 22:59:41 +02:00
parent e8b306ac4a
commit f1b551d243
13 changed files with 1734 additions and 24 deletions

View File

@@ -0,0 +1,214 @@
import logging
from datetime import datetime
from typing import List, Optional

from bson import ObjectId
from motor.motor_asyncio import (
    AsyncIOMotorClientSession,
    AsyncIOMotorCollection,
    AsyncIOMotorDatabase,
)
from pymongo import ReturnDocument
from pymongo.errors import DuplicateKeyError, PyMongoError

from app.models.document import DocumentContent
class DocumentContentRepository:
    """
    Repository class for document content CRUD operations in MongoDB.

    This class handles all database operations related to document content,
    following the repository pattern with dependency injection and async/await.

    Every data-access method takes an optional ``session`` argument which is
    forwarded to the underlying collection call, so the operation can take
    part in a caller-managed MongoDB transaction. Omitting it keeps the
    previous non-transactional behavior.
    """

    def __init__(self, database: AsyncIOMotorDatabase):
        """
        Initialize repository with database dependency.

        Index creation is NOT triggered here: ``_ensure_indexes`` is a
        coroutine and cannot be awaited from a synchronous constructor
        (calling it without ``await`` only creates a coroutine object that is
        never run). Call ``await repository.initialize()`` instead.

        Args:
            database (AsyncIOMotorDatabase): MongoDB database instance
        """
        self.db = database
        self.collection: AsyncIOMotorCollection = database.document_contents

    async def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
        await self._ensure_indexes()

    async def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on file_hash field to prevent duplicates.
        Failures are logged and otherwise ignored (best-effort setup).
        """
        try:
            await self.collection.create_index("file_hash", unique=True)
        except PyMongoError as exc:
            # Stay best-effort, but leave a trace: a failure here means the
            # uniqueness guarantee on file_hash may not be in place.
            logging.getLogger(__name__).warning(
                "Could not ensure unique index on file_hash: %s", exc
            )

    async def create_document_content(
        self,
        document_content: DocumentContent,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> DocumentContent:
        """
        Create a new document content in the database.

        Args:
            document_content (DocumentContent): Document content data
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the insert in

        Returns:
            DocumentContent: Created document content with database ID

        Raises:
            DuplicateKeyError: If file_hash already exists
            ValueError: If the insert fails with any other PyMongoError
        """
        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

        # Remove _id if it's None to let MongoDB generate it
        if document_dict.get("_id") is None:
            document_dict.pop("_id", None)

        try:
            result = await self.collection.insert_one(document_dict, session=session)
        except DuplicateKeyError as e:
            raise DuplicateKeyError(
                f"Document content with file_hash '{document_content.file_hash}' already exists: {e}"
            ) from e
        except PyMongoError as e:
            raise ValueError(f"Failed to create document content: {e}") from e

        document_dict["_id"] = result.inserted_id
        return DocumentContent(**document_dict)

    async def find_document_content_by_id(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by ID.

        Args:
            document_id (str): Document content ID to search for
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the query in

        Returns:
            DocumentContent or None: Document content if found; None if the ID
            is not a valid ObjectId, nothing matches, or the query fails
        """
        if not ObjectId.is_valid(document_id):
            return None

        try:
            document_doc = await self.collection.find_one(
                {"_id": ObjectId(document_id)}, session=session
            )
        except PyMongoError:
            return None

        if document_doc:
            return DocumentContent(**document_doc)
        return None

    async def find_document_content_by_file_hash(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by file hash.

        Args:
            file_hash (str): File hash to search for
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the query in

        Returns:
            DocumentContent or None: Document content if found, None otherwise
            (including when the query fails)
        """
        try:
            document_doc = await self.collection.find_one(
                {"file_hash": file_hash}, session=session
            )
        except PyMongoError:
            return None

        if document_doc:
            return DocumentContent(**document_doc)
        return None

    async def content_exists(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Check if document content exists by file hash.

        Args:
            file_hash (str): File hash to check
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the query in

        Returns:
            bool: True if document content exists, False otherwise
            (including when the query fails)
        """
        try:
            # limit=1: only existence matters, so stop at the first match.
            count = await self.collection.count_documents(
                {"file_hash": file_hash}, limit=1, session=session
            )
        except PyMongoError:
            return False
        return count > 0

    async def update_document_content(
        self,
        document_id: str,
        update_data: dict,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Update document content information.

        Keys whose value is None, and the ``_id`` key, are dropped from
        ``update_data``; if nothing is left the current document is returned
        unchanged.

        Args:
            document_id (str): Document content ID to update
            update_data (dict): Updated document content data
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the update in

        Returns:
            DocumentContent or None: Updated document content if found, None otherwise
        """
        if not ObjectId.is_valid(document_id):
            return None

        # Remove None values and _id from update data
        clean_update_data = {
            k: v for k, v in update_data.items() if v is not None and k != "_id"
        }
        if not clean_update_data:
            return await self.find_document_content_by_id(document_id, session=session)

        try:
            result = await self.collection.find_one_and_update(
                {"_id": ObjectId(document_id)},
                {"$set": clean_update_data},
                # Return the document as it looks after the update.
                return_document=ReturnDocument.AFTER,
                session=session,
            )
        except PyMongoError:
            return None

        if result:
            return DocumentContent(**result)
        return None

    async def delete_document_content(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Delete document content from database.

        Args:
            document_id (str): Document content ID to delete
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the delete in

        Returns:
            bool: True if document content was deleted, False otherwise
        """
        if not ObjectId.is_valid(document_id):
            return False

        try:
            result = await self.collection.delete_one(
                {"_id": ObjectId(document_id)}, session=session
            )
        except PyMongoError:
            return False
        return result.deleted_count > 0

    async def list_document_contents(
        self,
        skip: int = 0,
        limit: int = 100,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> List[DocumentContent]:
        """
        List document contents with pagination, newest ``_id`` first.

        Args:
            skip (int): Number of document contents to skip (default: 0)
            limit (int): Maximum number of document contents to return (default: 100)
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the query in

        Returns:
            List[DocumentContent]: List of document contents (empty if the
            query fails)
        """
        try:
            cursor = (
                self.collection.find({}, session=session)
                .skip(skip)
                .limit(limit)
                .sort("_id", -1)
            )
            document_docs = await cursor.to_list(length=limit)
        except PyMongoError:
            return []
        return [DocumentContent(**document_doc) for document_doc in document_docs]

    async def count_document_contents(
        self,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> int:
        """
        Count total number of document contents.

        Args:
            session (AsyncIOMotorClientSession, optional): Session/transaction
                to run the query in

        Returns:
            int: Total number of document contents in database (0 if the
            query fails)
        """
        try:
            return await self.collection.count_documents({}, session=session)
        except PyMongoError:
            return 0

View File

@@ -8,11 +8,9 @@ in MongoDB with proper error handling and type safety.
from typing import Optional, List
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from difflib import SequenceMatcher
from motor.motor_asyncio import AsyncIOMotorCollection
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
from app.models.document import FileDocument
from app.database.connection import get_database
from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
from app.utils.document_matching import fuzzy_matching, subsequence_matching
class MatchMethodBase:
@@ -36,9 +34,9 @@ class FileDocumentRepository:
with proper error handling and data validation.
"""
def __init__(self):
def __init__(self, database: AsyncIOMotorDatabase):
"""Initialize file repository with database connection."""
self.db = get_database()
self.db = database
self.collection: AsyncIOMotorCollection = self.db.files
self._ensure_indexes()

View File

@@ -86,7 +86,7 @@ class DocumentContent(BaseModel):
"""Model for document content."""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_hash: Optional[str] = Field(..., description="SHA256 hash of file content")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
content: str = Field(..., description="File content")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")

View File

View File

@@ -0,0 +1,380 @@
"""
Document service for orchestrated file and content management.
This service coordinates between FileDocument and DocumentContent repositories
while maintaining data consistency through MongoDB transactions.
"""
import hashlib
import magic
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
from motor.motor_asyncio import AsyncIOMotorClientSession
from pymongo.errors import PyMongoError
from app.database.connection import get_database
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import (
FileDocument,
DocumentContent,
FileType,
ProcessingStatus
)
from app.models.types import PyObjectId
class DocumentService:
    """
    Service for orchestrated document and content management.

    Provides high-level operations that coordinate between file documents
    and their content while ensuring data consistency through transactions.
    """

    def __init__(self):
        """Initialize the document service with repository dependencies."""
        self.db = get_database()
        self.file_repository = FileDocumentRepository(self.db)
        self.content_repository = DocumentContentRepository(self.db)

    def _calculate_file_hash(self, file_bytes: bytes) -> str:
        """
        Calculate SHA256 hash of file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            Hexadecimal SHA256 hash string
        """
        return hashlib.sha256(file_bytes).hexdigest()

    def _detect_file_type(self, file_path: str) -> FileType:
        """
        Detect file type from file extension.

        The extension is lower-cased and stripped of its leading dot before
        being looked up in ``FileType``.

        Args:
            file_path: Path to the file

        Returns:
            FileType enum value

        Raises:
            ValueError: If file type is not supported
        """
        extension = Path(file_path).suffix.lower().lstrip('.')
        try:
            return FileType(extension)
        except ValueError as exc:
            raise ValueError(f"Unsupported file type: {extension}") from exc

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """
        Detect MIME type from file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            MIME type string
        """
        return magic.from_buffer(file_bytes, mime=True)

    async def create_document(
        self,
        file_path: str,
        file_bytes: bytes,
        encoding: str = "utf-8"
    ) -> FileDocument:
        """
        Create a new document with automatic deduplication.

        This method handles the creation of both FileDocument and DocumentContent
        with proper deduplication based on file hash. If content with the same
        hash already exists, only a new FileDocument is created.

        Args:
            file_path: Full path to the file
            file_bytes: Raw file content as bytes
            encoding: Character encoding for text content

        Returns:
            Created FileDocument instance

        Raises:
            ValueError: If file type is not supported
            PyMongoError: If anything fails inside the transaction; the
                original exception is attached as ``__cause__``
        """
        # Calculate automatic attributes
        file_hash = self._calculate_file_hash(file_bytes)
        file_type = self._detect_file_type(file_path)
        mime_type = self._detect_mime_type(file_bytes)
        file_size = len(file_bytes)
        filename = Path(file_path).name
        # NOTE(review): utcnow() returns a naive datetime and is deprecated
        # since Python 3.12 - confirm what the model expects before switching
        # to a timezone-aware timestamp.
        detected_at = datetime.utcnow()

        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Check if content already exists
                    existing_content = await self.content_repository.find_document_content_by_file_hash(
                        file_hash, session=session
                    )

                    # Create DocumentContent if it doesn't exist
                    if not existing_content:
                        content_data = DocumentContent(
                            file_hash=file_hash,
                            content="",  # Will be populated by processing workers
                            encoding=encoding,
                            file_size=file_size,
                            mime_type=mime_type
                        )
                        await self.content_repository.create_document_content(
                            content_data, session=session
                        )

                    # Create FileDocument
                    file_data = FileDocument(
                        filename=filename,
                        filepath=file_path,
                        file_type=file_type,
                        extraction_method=None,  # Will be set by processing workers
                        metadata={},  # Empty for now
                        detected_at=detected_at,
                        file_hash=file_hash
                    )
                    created_file = await self.file_repository.create_document(
                        file_data, session=session
                    )
                    return created_file
                except Exception as e:
                    # Transaction will automatically rollback.
                    # Chain the cause so the original traceback is preserved.
                    raise PyMongoError(f"Failed to create document: {str(e)}") from e

    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
        """
        Retrieve a document by its ID.

        Args:
            document_id: Document ObjectId

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_id(document_id)

    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_hash(file_hash)

    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file path.

        Args:
            filepath: Full path to the file

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_filepath(filepath)

    async def get_document_with_content(
        self,
        document_id: PyObjectId
    ) -> Optional[Tuple[FileDocument, DocumentContent]]:
        """
        Retrieve a document with its associated content.

        Args:
            document_id: Document ObjectId

        Returns:
            Tuple of (FileDocument, DocumentContent) if both are found,
            None if either the document or its content is missing
        """
        document = await self.get_document_by_id(document_id)
        if not document:
            return None

        content = await self.content_repository.find_document_content_by_file_hash(
            document.file_hash
        )
        if not content:
            return None

        return (document, content)

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
    ) -> List[FileDocument]:
        """
        List documents with pagination.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return

        Returns:
            List of FileDocument instances
        """
        return await self.file_repository.list_documents(skip=skip, limit=limit)

    async def count_documents(self) -> int:
        """
        Get total number of documents.

        Returns:
            Total document count
        """
        return await self.file_repository.count_documents()

    async def update_document(
        self,
        document_id: PyObjectId,
        update_data: Dict[str, Any]
    ) -> Optional[FileDocument]:
        """
        Update document metadata.

        Args:
            document_id: Document ObjectId
            update_data: Dictionary with fields to update

        Returns:
            Updated FileDocument if found, None otherwise
        """
        return await self.file_repository.update_document(document_id, update_data)

    async def delete_document(self, document_id: PyObjectId) -> bool:
        """
        Delete a document and its orphaned content.

        This method removes the FileDocument and checks if the associated
        DocumentContent is orphaned (no other files reference it). If orphaned,
        the content is also deleted.

        Args:
            document_id: Document ObjectId

        Returns:
            True if document was deleted, False otherwise

        Raises:
            PyMongoError: If anything fails inside the transaction; the
                original exception is attached as ``__cause__``
        """
        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Get document to find its hash
                    document = await self.file_repository.find_document_by_id(
                        document_id, session=session
                    )
                    if not document:
                        return False

                    # Delete the document
                    deleted = await self.file_repository.delete_document(
                        document_id, session=session
                    )
                    if not deleted:
                        return False

                    # Check if content is orphaned
                    remaining_files = await self.file_repository.find_document_by_hash(
                        document.file_hash, session=session
                    )

                    # If no other files reference this content, delete it
                    if not remaining_files:
                        content = await self.content_repository.find_document_content_by_file_hash(
                            document.file_hash, session=session
                        )
                        if content:
                            await self.content_repository.delete_document_content(
                                content.id, session=session
                            )

                    return True
                except Exception as e:
                    # Transaction will automatically rollback.
                    # Chain the cause so the original traceback is preserved.
                    raise PyMongoError(f"Failed to delete document: {str(e)}") from e

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if content with given hash exists.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            True if content exists, False otherwise
        """
        return await self.content_repository.content_exists(file_hash)

    async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Retrieve content by file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            DocumentContent if found, None otherwise
        """
        return await self.content_repository.find_document_content_by_file_hash(file_hash)

    async def update_document_content(
        self,
        file_hash: str,
        content: str,
        encoding: str = "utf-8"
    ) -> Optional[DocumentContent]:
        """
        Update the extracted content for a document.

        This method is typically called by processing workers to store
        the extracted text content.

        Args:
            file_hash: SHA256 hash of file content
            content: Extracted text content
            encoding: Character encoding

        Returns:
            Updated DocumentContent if found, None otherwise
        """
        existing_content = await self.content_repository.find_document_content_by_file_hash(
            file_hash
        )
        if not existing_content:
            return None

        update_data = {
            "content": content,
            "encoding": encoding
        }
        return await self.content_repository.update_document_content(
            existing_content.id, update_data
        )

View File

@@ -8,3 +8,4 @@ pymongo==4.15.0
pydantic==2.11.9
redis==6.4.0
uvicorn==0.35.0
python-magic==0.4.27