Adding document service
This commit is contained in:
@@ -0,0 +1,214 @@
|
||||
from datetime import datetime
from typing import List, Optional

from bson import ObjectId
from motor.motor_asyncio import (
    AsyncIOMotorClientSession,
    AsyncIOMotorCollection,
    AsyncIOMotorDatabase,
)
from pymongo.errors import DuplicateKeyError, PyMongoError

from app.models.document import DocumentContent
|
||||
|
||||
|
||||
class DocumentContentRepository:
    """
    Repository class for document content CRUD operations in MongoDB.

    This class handles all database operations related to document content,
    following the repository pattern with dependency injection and async/await.

    Every data-access method accepts an optional ``session`` argument so the
    operation can participate in a multi-document MongoDB transaction driven
    by a higher-level service (e.g. ``DocumentService``).
    """

    def __init__(self, database: AsyncIOMotorDatabase):
        """
        Initialize repository with database dependency.

        Args:
            database (AsyncIOMotorDatabase): MongoDB database instance

        Note:
            Index creation is asynchronous and cannot run in ``__init__``;
            callers must ``await initialize()`` after construction. (The
            previous implementation invoked the async ``_ensure_indexes``
            here without awaiting it, so the coroutine never executed.)
        """
        self.db = database
        self.collection: AsyncIOMotorCollection = database.document_contents

    async def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
        await self._ensure_indexes()

    async def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on file_hash field to prevent duplicates.
        """
        try:
            await self.collection.create_index("file_hash", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

    async def create_document_content(
        self,
        document_content: DocumentContent,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> DocumentContent:
        """
        Create a new document content in the database.

        Args:
            document_content (DocumentContent): Document content data
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            DocumentContent: Created document content with database ID

        Raises:
            DuplicateKeyError: If file_hash already exists
            ValueError: If document content creation fails due to validation
        """
        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

        # Remove _id if it's None to let MongoDB generate it
        if document_dict.get("_id") is None:
            document_dict.pop("_id", None)

        try:
            result = await self.collection.insert_one(document_dict, session=session)
            document_dict["_id"] = result.inserted_id
            return DocumentContent(**document_dict)
        except DuplicateKeyError as e:
            raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
        except PyMongoError as e:
            raise ValueError(f"Failed to create document content: {e}")

    async def find_document_content_by_id(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by ID.

        Args:
            document_id (str): Document content ID to search for
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            document_doc = await self.collection.find_one(
                {"_id": ObjectId(document_id)}, session=session
            )
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def find_document_content_by_file_hash(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by file hash.

        Args:
            file_hash (str): File hash to search for
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            document_doc = await self.collection.find_one(
                {"file_hash": file_hash}, session=session
            )
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def content_exists(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Check if document content exists by file hash.

        Args:
            file_hash (str): File hash to check
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            bool: True if document content exists, False otherwise
        """
        try:
            count = await self.collection.count_documents(
                {"file_hash": file_hash}, session=session
            )
            return count > 0
        except PyMongoError:
            return False

    async def update_document_content(
        self,
        document_id: str,
        update_data: dict,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Update document content information.

        Args:
            document_id (str): Document content ID to update
            update_data (dict): Updated document content data
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            DocumentContent or None: Updated document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            # Remove None values and _id from update data
            clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}

            if not clean_update_data:
                return await self.find_document_content_by_id(document_id, session=session)

            result = await self.collection.find_one_and_update(
                {"_id": ObjectId(document_id)},
                {"$set": clean_update_data},
                return_document=True,
                session=session,
            )

            if result:
                return DocumentContent(**result)
            return None

        except PyMongoError:
            return None

    async def delete_document_content(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Delete document content from database.

        Args:
            document_id (str): Document content ID to delete
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            bool: True if document content was deleted, False otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return False

            result = await self.collection.delete_one(
                {"_id": ObjectId(document_id)}, session=session
            )
            return result.deleted_count > 0
        except PyMongoError:
            return False

    async def list_document_contents(
        self,
        skip: int = 0,
        limit: int = 100,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> List[DocumentContent]:
        """
        List document contents with pagination.

        Args:
            skip (int): Number of document contents to skip (default: 0)
            limit (int): Maximum number of document contents to return (default: 100)
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            List[DocumentContent]: List of document contents
        """
        try:
            cursor = (
                self.collection.find({}, session=session)
                .skip(skip)
                .limit(limit)
                .sort("_id", -1)
            )
            document_docs = await cursor.to_list(length=limit)
            return [DocumentContent(**document_doc) for document_doc in document_docs]
        except PyMongoError:
            return []

    async def count_document_contents(
        self,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> int:
        """
        Count total number of document contents.

        Args:
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            int: Total number of document contents in database
        """
        try:
            return await self.collection.count_documents({}, session=session)
        except PyMongoError:
            return 0
|
||||
@@ -8,11 +8,9 @@ in MongoDB with proper error handling and type safety.
|
||||
from typing import Optional, List
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from difflib import SequenceMatcher
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
|
||||
from app.models.document import FileDocument
|
||||
from app.database.connection import get_database
|
||||
from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
|
||||
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
||||
|
||||
|
||||
class MatchMethodBase:
|
||||
@@ -36,9 +34,9 @@ class FileDocumentRepository:
|
||||
with proper error handling and data validation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, database: AsyncIOMotorDatabase):
|
||||
"""Initialize file repository with database connection."""
|
||||
self.db = get_database()
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = self.db.files
|
||||
self._ensure_indexes()
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ class DocumentContent(BaseModel):
|
||||
"""Model for document content."""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_hash: Optional[str] = Field(..., description="SHA256 hash of file content")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
content: str = Field(..., description="File content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
|
||||
0
src/file-processor/app/models/job.py
Normal file
0
src/file-processor/app/models/job.py
Normal file
380
src/file-processor/app/services/document_service.py
Normal file
380
src/file-processor/app/services/document_service.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
Document service for orchestrated file and content management.
|
||||
|
||||
This service coordinates between FileDocument and DocumentContent repositories
|
||||
while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import magic
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClientSession
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.database.connection import get_database
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
DocumentContent,
|
||||
FileType,
|
||||
ProcessingStatus
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
class DocumentService:
    """
    Service for orchestrated document and content management.

    Provides high-level operations that coordinate between file documents
    and their content while ensuring data consistency through transactions.
    """

    def __init__(self):
        """Initialize the document service with repository dependencies."""
        # Both repositories share the one database handle so their operations
        # can run inside the same client session / transaction.
        self.db = get_database()
        self.file_repository = FileDocumentRepository(self.db)
        self.content_repository = DocumentContentRepository(self.db)

    def _calculate_file_hash(self, file_bytes: bytes) -> str:
        """
        Calculate SHA256 hash of file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            Hexadecimal SHA256 hash string
        """
        return hashlib.sha256(file_bytes).hexdigest()

    def _detect_file_type(self, file_path: str) -> FileType:
        """
        Detect file type from file extension.

        Args:
            file_path: Path to the file

        Returns:
            FileType enum value

        Raises:
            ValueError: If file type is not supported
        """
        # Normalize the extension (lowercase, leading dot stripped) so it can
        # be matched directly against the FileType enum values.
        extension = Path(file_path).suffix.lower().lstrip('.')

        try:
            return FileType(extension)
        except ValueError:
            raise ValueError(f"Unsupported file type: {extension}")

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """
        Detect MIME type from file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            MIME type string
        """
        # Content-based detection via libmagic (python-magic), independent of
        # the filename extension used by _detect_file_type.
        return magic.from_buffer(file_bytes, mime=True)

    async def create_document(
        self,
        file_path: str,
        file_bytes: bytes,
        encoding: str = "utf-8"
    ) -> FileDocument:
        """
        Create a new document with automatic deduplication.

        This method handles the creation of both FileDocument and DocumentContent
        with proper deduplication based on file hash. If content with the same
        hash already exists, only a new FileDocument is created.

        Args:
            file_path: Full path to the file
            file_bytes: Raw file content as bytes
            encoding: Character encoding for text content

        Returns:
            Created FileDocument instance

        Raises:
            ValueError: If file type is not supported
            PyMongoError: If database operation fails
        """
        # Calculate automatic attributes
        file_hash = self._calculate_file_hash(file_bytes)
        file_type = self._detect_file_type(file_path)
        mime_type = self._detect_mime_type(file_bytes)
        file_size = len(file_bytes)
        filename = Path(file_path).name
        # NOTE(review): datetime.utcnow() is naive and deprecated since
        # Python 3.12 — consider datetime.now(timezone.utc).
        detected_at = datetime.utcnow()

        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Check if content already exists
                    # NOTE(review): assumes the repository methods accept a
                    # ``session`` keyword argument — confirm their signatures.
                    existing_content = await self.content_repository.find_document_content_by_file_hash(
                        file_hash, session=session
                    )

                    # Create DocumentContent if it doesn't exist
                    if not existing_content:
                        # NOTE(review): confirm the DocumentContent model
                        # declares a ``mime_type`` field.
                        content_data = DocumentContent(
                            file_hash=file_hash,
                            content="", # Will be populated by processing workers
                            encoding=encoding,
                            file_size=file_size,
                            mime_type=mime_type
                        )
                        await self.content_repository.create_document_content(
                            content_data, session=session
                        )

                    # Create FileDocument
                    file_data = FileDocument(
                        filename=filename,
                        filepath=file_path,
                        file_type=file_type,
                        extraction_method=None, # Will be set by processing workers
                        metadata={}, # Empty for now
                        detected_at=detected_at,
                        file_hash=file_hash
                    )

                    created_file = await self.file_repository.create_document(
                        file_data, session=session
                    )

                    return created_file

                except Exception as e:
                    # Transaction will automatically rollback
                    # NOTE(review): this masks non-database errors (e.g.
                    # validation) as PyMongoError and drops the original
                    # traceback — consider ``raise ... from e``.
                    raise PyMongoError(f"Failed to create document: {str(e)}")

    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
        """
        Retrieve a document by its ID.

        Args:
            document_id: Document ObjectId

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_id(document_id)

    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_hash(file_hash)

    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file path.

        Args:
            filepath: Full path to the file

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_filepath(filepath)

    async def get_document_with_content(
        self,
        document_id: PyObjectId
    ) -> Optional[Tuple[FileDocument, DocumentContent]]:
        """
        Retrieve a document with its associated content.

        Args:
            document_id: Document ObjectId

        Returns:
            Tuple of (FileDocument, DocumentContent) if found, None otherwise
        """
        document = await self.get_document_by_id(document_id)
        if not document:
            return None

        # Content is joined on file_hash rather than a direct foreign key,
        # matching the deduplicated storage model used by create_document.
        content = await self.content_repository.find_document_content_by_file_hash(
            document.file_hash
        )
        if not content:
            return None

        return (document, content)

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
    ) -> List[FileDocument]:
        """
        List documents with pagination.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return

        Returns:
            List of FileDocument instances
        """
        return await self.file_repository.list_documents(skip=skip, limit=limit)

    async def count_documents(self) -> int:
        """
        Get total number of documents.

        Returns:
            Total document count
        """
        return await self.file_repository.count_documents()

    async def update_document(
        self,
        document_id: PyObjectId,
        update_data: Dict[str, Any]
    ) -> Optional[FileDocument]:
        """
        Update document metadata.

        Args:
            document_id: Document ObjectId
            update_data: Dictionary with fields to update

        Returns:
            Updated FileDocument if found, None otherwise
        """
        return await self.file_repository.update_document(document_id, update_data)

    async def delete_document(self, document_id: PyObjectId) -> bool:
        """
        Delete a document and its orphaned content.

        This method removes the FileDocument and checks if the associated
        DocumentContent is orphaned (no other files reference it). If orphaned,
        the content is also deleted.

        Args:
            document_id: Document ObjectId

        Returns:
            True if document was deleted, False otherwise

        Raises:
            PyMongoError: If database operation fails
        """
        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Get document to find its hash
                    document = await self.file_repository.find_document_by_id(
                        document_id, session=session
                    )
                    if not document:
                        return False

                    # Delete the document
                    deleted = await self.file_repository.delete_document(
                        document_id, session=session
                    )
                    if not deleted:
                        return False

                    # Check if content is orphaned
                    # NOTE(review): find_document_by_hash appears to return a
                    # single document; any remaining match means the content
                    # is still referenced — confirm against its signature.
                    remaining_files = await self.file_repository.find_document_by_hash(
                        document.file_hash, session=session
                    )

                    # If no other files reference this content, delete it
                    if not remaining_files:
                        content = await self.content_repository.find_document_content_by_file_hash(
                            document.file_hash, session=session
                        )
                        if content:
                            await self.content_repository.delete_document_content(
                                content.id, session=session
                            )

                    return True

                except Exception as e:
                    # Transaction will automatically rollback
                    # NOTE(review): same masking concern as create_document —
                    # consider ``raise ... from e`` to keep the cause.
                    raise PyMongoError(f"Failed to delete document: {str(e)}")

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if content with given hash exists.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            True if content exists, False otherwise
        """
        return await self.content_repository.content_exists(file_hash)

    async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Retrieve content by file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            DocumentContent if found, None otherwise
        """
        return await self.content_repository.find_document_content_by_file_hash(file_hash)

    async def update_document_content(
        self,
        file_hash: str,
        content: str,
        encoding: str = "utf-8"
    ) -> Optional[DocumentContent]:
        """
        Update the extracted content for a document.

        This method is typically called by processing workers to store
        the extracted text content.

        Args:
            file_hash: SHA256 hash of file content
            content: Extracted text content
            encoding: Character encoding

        Returns:
            Updated DocumentContent if found, None otherwise
        """
        existing_content = await self.content_repository.find_document_content_by_file_hash(
            file_hash
        )
        if not existing_content:
            return None

        update_data = {
            "content": content,
            "encoding": encoding
        }

        return await self.content_repository.update_document_content(
            existing_content.id, update_data
        )
|
||||
@@ -8,3 +8,4 @@ pymongo==4.15.0
|
||||
pydantic==2.11.9
|
||||
redis==6.4.0
|
||||
uvicorn==0.35.0
|
||||
python-magic==0.4.27
|
||||
Reference in New Issue
Block a user