Adding document service
This commit is contained in:
@@ -0,0 +1,214 @@
|
||||
from datetime import datetime
from typing import List, Optional

from bson import ObjectId
from motor.motor_asyncio import (
    AsyncIOMotorClientSession,
    AsyncIOMotorCollection,
    AsyncIOMotorDatabase,
)
from pymongo.errors import DuplicateKeyError, PyMongoError

from app.models.document import DocumentContent
|
||||
|
||||
|
||||
class DocumentContentRepository:
    """
    Repository class for document content CRUD operations in MongoDB.

    This class handles all database operations related to document content,
    following the repository pattern with dependency injection and async/await.

    Every data-access method accepts an optional ``session`` argument so the
    operation can participate in a multi-document MongoDB transaction driven
    by a higher-level service (e.g. ``DocumentService``).
    """

    def __init__(self, database: AsyncIOMotorDatabase):
        """
        Initialize repository with database dependency.

        Args:
            database (AsyncIOMotorDatabase): MongoDB database instance

        Note:
            Index creation is asynchronous and cannot run in ``__init__``;
            callers must ``await initialize()`` after construction. (The
            previous implementation invoked the async ``_ensure_indexes``
            here without awaiting it, so the coroutine never executed.)
        """
        self.db = database
        self.collection: AsyncIOMotorCollection = database.document_contents

    async def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
        await self._ensure_indexes()

    async def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on file_hash field to prevent duplicates.
        """
        try:
            await self.collection.create_index("file_hash", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

    async def create_document_content(
        self,
        document_content: DocumentContent,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> DocumentContent:
        """
        Create a new document content in the database.

        Args:
            document_content (DocumentContent): Document content data
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            DocumentContent: Created document content with database ID

        Raises:
            DuplicateKeyError: If file_hash already exists
            ValueError: If document content creation fails due to validation
        """
        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

        # Remove _id if it's None to let MongoDB generate it
        if document_dict.get("_id") is None:
            document_dict.pop("_id", None)

        try:
            result = await self.collection.insert_one(document_dict, session=session)
            document_dict["_id"] = result.inserted_id
            return DocumentContent(**document_dict)
        except DuplicateKeyError as e:
            raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
        except PyMongoError as e:
            raise ValueError(f"Failed to create document content: {e}")

    async def find_document_content_by_id(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by ID.

        Args:
            document_id (str): Document content ID to search for
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            document_doc = await self.collection.find_one(
                {"_id": ObjectId(document_id)}, session=session
            )
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def find_document_content_by_file_hash(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Find document content by file hash.

        Args:
            file_hash (str): File hash to search for
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            document_doc = await self.collection.find_one(
                {"file_hash": file_hash}, session=session
            )
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def content_exists(
        self,
        file_hash: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Check if document content exists by file hash.

        Args:
            file_hash (str): File hash to check
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            bool: True if document content exists, False otherwise
        """
        try:
            count = await self.collection.count_documents(
                {"file_hash": file_hash}, session=session
            )
            return count > 0
        except PyMongoError:
            return False

    async def update_document_content(
        self,
        document_id: str,
        update_data: dict,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> Optional[DocumentContent]:
        """
        Update document content information.

        Args:
            document_id (str): Document content ID to update
            update_data (dict): Updated document content data
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            DocumentContent or None: Updated document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            # Remove None values and _id from update data
            clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}

            if not clean_update_data:
                return await self.find_document_content_by_id(document_id, session=session)

            result = await self.collection.find_one_and_update(
                {"_id": ObjectId(document_id)},
                {"$set": clean_update_data},
                return_document=True,
                session=session,
            )

            if result:
                return DocumentContent(**result)
            return None

        except PyMongoError:
            return None

    async def delete_document_content(
        self,
        document_id: str,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> bool:
        """
        Delete document content from database.

        Args:
            document_id (str): Document content ID to delete
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional writes

        Returns:
            bool: True if document content was deleted, False otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return False

            result = await self.collection.delete_one(
                {"_id": ObjectId(document_id)}, session=session
            )
            return result.deleted_count > 0
        except PyMongoError:
            return False

    async def list_document_contents(
        self,
        skip: int = 0,
        limit: int = 100,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> List[DocumentContent]:
        """
        List document contents with pagination.

        Args:
            skip (int): Number of document contents to skip (default: 0)
            limit (int): Maximum number of document contents to return (default: 100)
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            List[DocumentContent]: List of document contents
        """
        try:
            cursor = (
                self.collection.find({}, session=session)
                .skip(skip)
                .limit(limit)
                .sort("_id", -1)
            )
            document_docs = await cursor.to_list(length=limit)
            return [DocumentContent(**document_doc) for document_doc in document_docs]
        except PyMongoError:
            return []

    async def count_document_contents(
        self,
        session: Optional[AsyncIOMotorClientSession] = None,
    ) -> int:
        """
        Count total number of document contents.

        Args:
            session (AsyncIOMotorClientSession, optional): Active session for
                transactional reads

        Returns:
            int: Total number of document contents in database
        """
        try:
            return await self.collection.count_documents({}, session=session)
        except PyMongoError:
            return 0
|
||||
@@ -8,11 +8,9 @@ in MongoDB with proper error handling and type safety.
|
||||
from typing import Optional, List
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from difflib import SequenceMatcher
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
|
||||
from app.models.document import FileDocument
|
||||
from app.database.connection import get_database
|
||||
from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
|
||||
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
||||
|
||||
|
||||
class MatchMethodBase:
|
||||
@@ -36,9 +34,9 @@ class FileDocumentRepository:
|
||||
with proper error handling and data validation.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, database: AsyncIOMotorDatabase):
|
||||
"""Initialize file repository with database connection."""
|
||||
self.db = get_database()
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = self.db.files
|
||||
self._ensure_indexes()
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ class DocumentContent(BaseModel):
|
||||
"""Model for document content."""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_hash: Optional[str] = Field(..., description="SHA256 hash of file content")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
content: str = Field(..., description="File content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
|
||||
0
src/file-processor/app/models/job.py
Normal file
0
src/file-processor/app/models/job.py
Normal file
380
src/file-processor/app/services/document_service.py
Normal file
380
src/file-processor/app/services/document_service.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
Document service for orchestrated file and content management.
|
||||
|
||||
This service coordinates between FileDocument and DocumentContent repositories
|
||||
while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import magic
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClientSession
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.database.connection import get_database
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
DocumentContent,
|
||||
FileType,
|
||||
ProcessingStatus
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
class DocumentService:
    """
    Service for orchestrated document and content management.

    Provides high-level operations that coordinate between file documents
    and their content while ensuring data consistency through transactions.
    """

    def __init__(self):
        """Initialize the document service with repository dependencies."""
        # Both repositories share the one database handle so their operations
        # can run inside the same client session / transaction.
        self.db = get_database()
        self.file_repository = FileDocumentRepository(self.db)
        self.content_repository = DocumentContentRepository(self.db)

    def _calculate_file_hash(self, file_bytes: bytes) -> str:
        """
        Calculate SHA256 hash of file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            Hexadecimal SHA256 hash string
        """
        return hashlib.sha256(file_bytes).hexdigest()

    def _detect_file_type(self, file_path: str) -> FileType:
        """
        Detect file type from file extension.

        Args:
            file_path: Path to the file

        Returns:
            FileType enum value

        Raises:
            ValueError: If file type is not supported
        """
        # Normalize the extension (lowercase, leading dot stripped) so it can
        # be matched directly against the FileType enum values.
        extension = Path(file_path).suffix.lower().lstrip('.')

        try:
            return FileType(extension)
        except ValueError:
            raise ValueError(f"Unsupported file type: {extension}")

    def _detect_mime_type(self, file_bytes: bytes) -> str:
        """
        Detect MIME type from file content.

        Args:
            file_bytes: Raw file content as bytes

        Returns:
            MIME type string
        """
        # Content-based detection via libmagic (python-magic), independent of
        # the filename extension used by _detect_file_type.
        return magic.from_buffer(file_bytes, mime=True)

    async def create_document(
        self,
        file_path: str,
        file_bytes: bytes,
        encoding: str = "utf-8"
    ) -> FileDocument:
        """
        Create a new document with automatic deduplication.

        This method handles the creation of both FileDocument and DocumentContent
        with proper deduplication based on file hash. If content with the same
        hash already exists, only a new FileDocument is created.

        Args:
            file_path: Full path to the file
            file_bytes: Raw file content as bytes
            encoding: Character encoding for text content

        Returns:
            Created FileDocument instance

        Raises:
            ValueError: If file type is not supported
            PyMongoError: If database operation fails
        """
        # Calculate automatic attributes
        file_hash = self._calculate_file_hash(file_bytes)
        file_type = self._detect_file_type(file_path)
        mime_type = self._detect_mime_type(file_bytes)
        file_size = len(file_bytes)
        filename = Path(file_path).name
        # NOTE(review): datetime.utcnow() is naive and deprecated since
        # Python 3.12 — consider datetime.now(timezone.utc).
        detected_at = datetime.utcnow()

        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Check if content already exists
                    # NOTE(review): assumes the repository methods accept a
                    # ``session`` keyword argument — confirm their signatures.
                    existing_content = await self.content_repository.find_document_content_by_file_hash(
                        file_hash, session=session
                    )

                    # Create DocumentContent if it doesn't exist
                    if not existing_content:
                        # NOTE(review): confirm the DocumentContent model
                        # declares a ``mime_type`` field.
                        content_data = DocumentContent(
                            file_hash=file_hash,
                            content="", # Will be populated by processing workers
                            encoding=encoding,
                            file_size=file_size,
                            mime_type=mime_type
                        )
                        await self.content_repository.create_document_content(
                            content_data, session=session
                        )

                    # Create FileDocument
                    file_data = FileDocument(
                        filename=filename,
                        filepath=file_path,
                        file_type=file_type,
                        extraction_method=None, # Will be set by processing workers
                        metadata={}, # Empty for now
                        detected_at=detected_at,
                        file_hash=file_hash
                    )

                    created_file = await self.file_repository.create_document(
                        file_data, session=session
                    )

                    return created_file

                except Exception as e:
                    # Transaction will automatically rollback
                    # NOTE(review): this masks non-database errors (e.g.
                    # validation) as PyMongoError and drops the original
                    # traceback — consider ``raise ... from e``.
                    raise PyMongoError(f"Failed to create document: {str(e)}")

    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
        """
        Retrieve a document by its ID.

        Args:
            document_id: Document ObjectId

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_id(document_id)

    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_hash(file_hash)

    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file path.

        Args:
            filepath: Full path to the file

        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_filepath(filepath)

    async def get_document_with_content(
        self,
        document_id: PyObjectId
    ) -> Optional[Tuple[FileDocument, DocumentContent]]:
        """
        Retrieve a document with its associated content.

        Args:
            document_id: Document ObjectId

        Returns:
            Tuple of (FileDocument, DocumentContent) if found, None otherwise
        """
        document = await self.get_document_by_id(document_id)
        if not document:
            return None

        # Content is joined on file_hash rather than a direct foreign key,
        # matching the deduplicated storage model used by create_document.
        content = await self.content_repository.find_document_content_by_file_hash(
            document.file_hash
        )
        if not content:
            return None

        return (document, content)

    async def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
    ) -> List[FileDocument]:
        """
        List documents with pagination.

        Args:
            skip: Number of documents to skip
            limit: Maximum number of documents to return

        Returns:
            List of FileDocument instances
        """
        return await self.file_repository.list_documents(skip=skip, limit=limit)

    async def count_documents(self) -> int:
        """
        Get total number of documents.

        Returns:
            Total document count
        """
        return await self.file_repository.count_documents()

    async def update_document(
        self,
        document_id: PyObjectId,
        update_data: Dict[str, Any]
    ) -> Optional[FileDocument]:
        """
        Update document metadata.

        Args:
            document_id: Document ObjectId
            update_data: Dictionary with fields to update

        Returns:
            Updated FileDocument if found, None otherwise
        """
        return await self.file_repository.update_document(document_id, update_data)

    async def delete_document(self, document_id: PyObjectId) -> bool:
        """
        Delete a document and its orphaned content.

        This method removes the FileDocument and checks if the associated
        DocumentContent is orphaned (no other files reference it). If orphaned,
        the content is also deleted.

        Args:
            document_id: Document ObjectId

        Returns:
            True if document was deleted, False otherwise

        Raises:
            PyMongoError: If database operation fails
        """
        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Get document to find its hash
                    document = await self.file_repository.find_document_by_id(
                        document_id, session=session
                    )
                    if not document:
                        return False

                    # Delete the document
                    deleted = await self.file_repository.delete_document(
                        document_id, session=session
                    )
                    if not deleted:
                        return False

                    # Check if content is orphaned
                    # NOTE(review): find_document_by_hash appears to return a
                    # single document; any remaining match means the content
                    # is still referenced — confirm against its signature.
                    remaining_files = await self.file_repository.find_document_by_hash(
                        document.file_hash, session=session
                    )

                    # If no other files reference this content, delete it
                    if not remaining_files:
                        content = await self.content_repository.find_document_content_by_file_hash(
                            document.file_hash, session=session
                        )
                        if content:
                            await self.content_repository.delete_document_content(
                                content.id, session=session
                            )

                    return True

                except Exception as e:
                    # Transaction will automatically rollback
                    # NOTE(review): same masking concern as create_document —
                    # consider ``raise ... from e`` to keep the cause.
                    raise PyMongoError(f"Failed to delete document: {str(e)}")

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if content with given hash exists.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            True if content exists, False otherwise
        """
        return await self.content_repository.content_exists(file_hash)

    async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Retrieve content by file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            DocumentContent if found, None otherwise
        """
        return await self.content_repository.find_document_content_by_file_hash(file_hash)

    async def update_document_content(
        self,
        file_hash: str,
        content: str,
        encoding: str = "utf-8"
    ) -> Optional[DocumentContent]:
        """
        Update the extracted content for a document.

        This method is typically called by processing workers to store
        the extracted text content.

        Args:
            file_hash: SHA256 hash of file content
            content: Extracted text content
            encoding: Character encoding

        Returns:
            Updated DocumentContent if found, None otherwise
        """
        existing_content = await self.content_repository.find_document_content_by_file_hash(
            file_hash
        )
        if not existing_content:
            return None

        update_data = {
            "content": content,
            "encoding": encoding
        }

        return await self.content_repository.update_document_content(
            existing_content.id, update_data
        )
|
||||
@@ -8,3 +8,4 @@ pymongo==4.15.0
|
||||
pydantic==2.11.9
|
||||
redis==6.4.0
|
||||
uvicorn==0.35.0
|
||||
python-magic==0.4.27
|
||||
Reference in New Issue
Block a user