# MyDocManager/src/file-processor/app/services/document_service.py

"""
Document service for orchestrated file and content management.

This service stores file metadata through the FileDocument repository and
keeps the raw file content on disk in an objects folder, where each file is
named after the SHA256 hash of its content so identical content is stored
only once.
"""

import hashlib
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any

import magic
from pymongo.errors import PyMongoError

from app.config.settings import get_objects_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import (
  FileDocument,
  FileType,
)
from app.models.types import PyObjectId


class DocumentService:
  """
  Service for orchestrated document and content management.

  File metadata is persisted through ``FileDocumentRepository`` while the raw
  file bytes are kept on disk in an objects folder, one file per SHA256 hash
  of the content. Identical content is therefore stored only once, however
  many documents reference it.

  NOTE(review): the metadata write and the on-disk content write are not
  wrapped in a transaction in this class; a failure between the two can
  leave content on disk without a matching document.
  """

  def __init__(self, database, objects_folder: Optional[str] = None):
    """
    Initialize the document service with repository dependencies.

    Args:
        database: Database instance handed to the document repository
        objects_folder: Folder to store files by their hash; when omitted
            (or empty) the value returned by get_objects_folder() is used
    """

    self.db = database
    self.document_repository = FileDocumentRepository(self.db)
    self.objects_folder = objects_folder or get_objects_folder()

  async def initialize(self):
    """
    Initialize the underlying document repository.

    Returns:
        This service instance, so the call can be chained after construction
    """
    await self.document_repository.initialize()
    return self

  @staticmethod
  def _calculate_file_hash(file_bytes: bytes) -> str:
    """
    Calculate SHA256 hash of file content.

    Args:
        file_bytes: Raw file content as bytes

    Returns:
        Hexadecimal SHA256 hash string
    """
    return hashlib.sha256(file_bytes).hexdigest()

  @staticmethod
  def _detect_file_type(file_path: str) -> FileType:
    """
    Detect file type from file extension.

    The extension is lower-cased and stripped of its leading dot before
    being looked up in the FileType enum.

    Args:
        file_path: Path to the file

    Returns:
        FileType enum value

    Raises:
        ValueError: If file type is not supported
    """
    extension = Path(file_path).suffix.lower().lstrip('.')

    try:
      return FileType(extension)
    except ValueError:
      # The enum's own lookup error adds nothing for the caller; raise a
      # single, clear error instead of a chained pair.
      raise ValueError(f"Unsupported file type: {extension}") from None

  @staticmethod
  def _detect_mime_type(file_bytes: bytes) -> str:
    """
    Detect MIME type from file content.

    Args:
        file_bytes: Raw file content as bytes

    Returns:
        MIME type string
    """
    return magic.from_buffer(file_bytes, mime=True)

  def _get_document_path(self, file_hash: str) -> str:
    """
    Build the on-disk path of the content identified by a hash.

    The layout is ``<objects_folder>/<first 24 characters of hash>/<hash>``.

    Args:
        file_hash: SHA256 hash of file content

    Returns:
        Path of the content file inside the objects folder
    """
    return os.path.join(self.objects_folder, file_hash[:24], file_hash)

  def save_content_if_needed(self, file_hash, content: bytes):
    """
    Store content on disk under its hash unless it is already stored.

    The content is first written to a temporary file next to the target and
    then renamed into place, so an interrupted write can never leave a
    truncated file that would later be mistaken for valid content.

    Args:
        file_hash: SHA256 hash of the content, used to build the target path
        content: Raw file content as bytes

    Raises:
        OSError: If the folder or the file cannot be written
    """
    target_path = self._get_document_path(file_hash)
    if os.path.exists(target_path):
      return

    # exist_ok avoids failing when the folder appears between a check and
    # the creation (e.g. another process storing a file with the same prefix).
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    temp_path = f"{target_path}.{os.getpid()}.tmp"
    try:
      with open(temp_path, "wb") as f:
        f.write(content)
      os.replace(temp_path, target_path)
    finally:
      # After a successful replace the temporary file is gone; it only
      # remains when the write or the rename failed.
      if os.path.exists(temp_path):
        os.remove(temp_path)

  async def create_document(
      self,
      file_path: str,
      file_bytes: bytes,
      encoding: str = "utf-8"
  ) -> FileDocument:
    """
    Create a new document with automatic content deduplication.

    The file content is stored on disk under its SHA256 hash (skipped when
    content with the same hash is already stored) and a new FileDocument
    describing the file is then created through the repository.

    Args:
        file_path: Full path to the file
        file_bytes: Raw file content as bytes
        encoding: Character encoding for text content

    Returns:
        Created FileDocument instance

    Raises:
        ValueError: If file type is not supported
        PyMongoError: If storing the content or creating the document
            fails; the original exception is attached as the cause
    """
    # Calculate automatic attributes
    file_hash = self._calculate_file_hash(file_bytes)
    file_type = self._detect_file_type(file_path)
    mime_type = self._detect_mime_type(file_bytes)
    file_size = len(file_bytes)
    filename = Path(file_path).name
    detected_at = datetime.now()

    try:
      self.save_content_if_needed(file_hash, file_bytes)

      # Create FileDocument
      file_data = FileDocument(
        filename=filename,
        filepath=file_path,
        file_type=file_type,
        extraction_method=None,  # Will be set by processing workers
        metadata={},  # Empty for now
        detected_at=detected_at,
        file_hash=file_hash,
        encoding=encoding,
        file_size=file_size,
        mime_type=mime_type
      )

      created_file = await self.document_repository.create_document(file_data)

      return created_file

    except Exception as e:
      # Callers expect PyMongoError; keep the real cause attached for debugging.
      raise PyMongoError(f"Failed to create document: {e}") from e

  async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
    """
    Retrieve a document by its ID.

    Args:
        document_id: Document ObjectId (passed to the repository as a string)

    Returns:
        FileDocument if found, None otherwise
    """
    return await self.document_repository.find_document_by_id(str(document_id))

  async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
    """
    Retrieve a document by its file hash.

    Args:
        file_hash: SHA256 hash of file content

    Returns:
        FileDocument if found, None otherwise
    """
    return await self.document_repository.find_document_by_hash(file_hash)

  async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
    """
    Retrieve a document by its file path.

    Args:
        filepath: Full path to the file

    Returns:
        FileDocument if found, None otherwise
    """
    return await self.document_repository.find_document_by_filepath(filepath)

  async def get_document_content_by_hash(self, file_hash: str) -> Optional[bytes]:
    """
    Read the stored content identified by a hash.

    Args:
        file_hash: SHA256 hash of file content

    Returns:
        Raw file content as bytes, or None when no content is stored for
        this hash
    """
    target_path = self._get_document_path(file_hash)
    # Open directly instead of checking for existence first: the file may be
    # removed between a check and the read.
    try:
      with open(target_path, "rb") as f:
        return f.read()
    except FileNotFoundError:
      return None

  async def list_documents(
      self,
      skip: int = 0,
      limit: int = 100
  ) -> List[FileDocument]:
    """
    List documents with pagination.

    Args:
        skip: Number of documents to skip
        limit: Maximum number of documents to return

    Returns:
        List of FileDocument instances
    """
    return await self.document_repository.list_documents(skip=skip, limit=limit)

  async def count_documents(self) -> int:
    """
    Get total number of documents.

    Returns:
        Total document count
    """
    return await self.document_repository.count_documents()

  async def update_document(
      self,
      document_id: PyObjectId,
      update_data: Dict[str, Any]
  ) -> Optional[FileDocument]:
    """
    Update document metadata.

    When ``update_data`` contains ``file_bytes``, the new content is stored
    on disk and its SHA256 hash is added to the update as ``file_hash``.
    The dictionary passed by the caller is left untouched.

    Args:
        document_id: Document ObjectId
        update_data: Dictionary with fields to update

    Returns:
        Updated FileDocument if found, None otherwise
    """
    if "file_bytes" in update_data:
      # Work on a copy so the caller's dictionary is not modified.
      update_data = dict(update_data)
      file_bytes = update_data["file_bytes"]
      file_hash = self._calculate_file_hash(file_bytes)
      update_data["file_hash"] = file_hash
      self.save_content_if_needed(file_hash, file_bytes)
      # NOTE(review): "file_bytes" is still forwarded to the repository, as
      # before; confirm the repository ignores it rather than persisting it.

    return await self.document_repository.update_document(document_id, update_data)

  async def delete_document(self, document_id: PyObjectId) -> bool:
    """
    Delete a document and its orphaned content.

    This method removes the FileDocument and then checks whether any other
    document still references the same file hash. If none does, the content
    file stored on disk for that hash is removed as well. Removing the
    content is best-effort: a failure is logged and does not fail the
    deletion.

    Args:
        document_id: Document ObjectId

    Returns:
        True if document was deleted, False otherwise

    Raises:
        PyMongoError: If a repository operation fails; the original
            exception is attached as the cause
    """
    try:
      # Get document to find its hash
      document = await self.document_repository.find_document_by_id(document_id)
      if not document:
        return False

      # Delete the document
      deleted = await self.document_repository.delete_document(document_id)
      if not deleted:
        return False

      # Check if content is orphaned
      remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)

      # If no other files reference this content, delete it
      if not remaining_files:
        try:
          os.remove(self._get_document_path(document.file_hash))
        except FileNotFoundError:
          # Content is already absent; nothing to clean up.
          pass
        except Exception:
          # Best-effort cleanup: the document itself is already deleted, so
          # report the leftover content instead of failing the call.
          logging.getLogger(__name__).warning(
            "Failed to remove orphaned content for hash %s",
            document.file_hash,
            exc_info=True,
          )

      return True

    except Exception as e:
      # Callers expect PyMongoError; keep the real cause attached for debugging.
      raise PyMongoError(f"Failed to delete document: {e}") from e