Implemented default pipeline

2025-09-26 22:08:39 +02:00
parent f1b551d243
commit 4de732b0ae
56 changed files with 4534 additions and 2837 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 volumes
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
--- a/Readme.md
+++ b/Readme.md
@@ -13,7 +13,7 @@ architecture with Redis for task queuing and MongoDB for data persistence.
 - **Backend API**: FastAPI (Python 3.12)
 - **Task Processing**: Celery with Redis broker
 - **Document Processing**: EasyOCR, PyMuPDF, python-docx, pdfplumber
- **Database**: MongoDB
+- **Database**: MongoDB (pymongo)
 - **Frontend**: React
 - **Containerization**: Docker & Docker Compose
 - **File Monitoring**: Python watchdog library
@@ -95,25 +95,32 @@ MyDocManager/
 │   │   ├── requirements.txt
 │   │   ├── app/
 │   │   │   ├── main.py
-│   │   │   ├── file_watcher.py
+│   │   │   ├── file_watcher.py             # FileWatcher class with observer thread
-│   │   │   ├── celery_app.py
+│   │   │   ├── celery_app.py               # Celery configuration
 │   │   │   ├── config/
 │   │   │   │   ├── __init__.py
 │   │   │   │   └── settings.py              # JWT, MongoDB config
 │   │   │   ├── models/
 │   │   │   │   ├── __init__.py
 │   │   │   │   ├── user.py                  # User Pydantic models
-│   │   │   │   └── auth.py                  # Auth Pydantic models
+│   │   │   │   ├── auth.py                  # Auth Pydantic models
 │   │   │   │   ├── document.py              # Document Pydantic models
 │   │   │   │   ├── job.py                   # Job Processing Pydantic models
 │   │   │   │   └── types.py                 # PyObjectId and other useful types
 │   │   │   ├── database/
 │   │   │   │   ├── __init__.py
-│   │   │   │   ├── connection.py            # MongoDB connection
+│   │   │   │   ├── connection.py            # MongoDB connection (pymongo)
 │   │   │   │   └── repositories/
 │   │   │   │       ├── __init__.py
-│   │   │   │       └── user_repository.py   # User CRUD operations
+│   │   │   │       ├── user_repository.py      # User CRUD operations (synchronous)
 │   │   │   │       ├── document_repository.py  # Document CRUD operations (synchronous)
 │   │   │   │       └── job_repository.py       # Job CRUD operations (synchronous)
 │   │   │   ├── services/
 │   │   │   │   ├── __init__.py
-│   │   │   │   ├── auth_service.py          # JWT & password logic
+│   │   │   │   ├── auth_service.py          # JWT & password logic (synchronous)
-│   │   │   │   ├── user_service.py          # User business logic
+│   │   │   │   ├── user_service.py          # User business logic (synchronous)
 │   │   │   │   ├── document_service.py      # Document business logic (synchronous)
 │   │   │   │   ├── job_service.py           # Job processing logic (synchronous)
 │   │   │   │   └── init_service.py          # Admin creation at startup
 │   │   │   ├── api/
 │   │   │   │   ├── __init__.py
@@ -125,7 +132,7 @@ MyDocManager/
 │   │   │   └── utils/
 │   │   │       ├── __init__.py
 │   │   │       ├── security.py             # Password utilities
-│   │   │       └── exceptions.py           # Custom exceptions
+│   │   │       └── document_matching.py    # Fuzzy matching algorithms
 │   ├── worker/
 │   │   ├── Dockerfile
 │   │   ├── requirements.txt
@@ -133,7 +140,13 @@ MyDocManager/
 │   └── frontend/
 │       ├── Dockerfile
 │       ├── package.json
 │       ├── index.html
 │       └── src/
 │           ├── assets/
 │           ├── App.css
 │           ├── App.jsx
 │           ├── main.css
 │           └── main.jsx
 ├── tests/
 │   ├── file-processor/
 │   │   ├── test_auth/
@@ -224,78 +237,76 @@ On first startup, the application automatically creates a default admin user:
 #### Files Collection
-Stores file metadata and extracted content:
+Stores file metadata and extracted content using Pydantic models:
-```json
+```python
-{
+class FileDocument(BaseModel):
-  "_id": "ObjectId",
+  """
-  "filename": "document.pdf",
+  Model for file documents stored in the 'files' collection.
  "filepath": "/watched_files/document.pdf",
  "file_type": "pdf",
  "extraction_method": "direct_text", // direct_text, ocr, hybrid
  "metadata": {
    "page_count": 15,        // for PDFs
    "word_count": 250,       // for text files  
    "image_dimensions": {    // for images
      "width": 1920,
      "height": 1080
    }
  },
  "detected_at": "2024-01-15T10:29:00Z",
  "file_hash": "sha256_hash_value"
 }
 ```
 #### Document Contents Collection
-Stores actual file content and technical metadata:
+  Represents a file detected in the watched directory with its
-```json
+  metadata and extracted content.
-{
+  """
-  "_id": "ObjectId",
+  
-  "file_hash": "sha256_hash_value",
+  id: Optional[PyObjectId] = Field(default=None, alias="_id")
-  "content": "extracted text content...",
+  filename: str = Field(..., description="Original filename")
-  "encoding": "utf-8",
+  filepath: str = Field(..., description="Full path to the file")
-  "file_size": 2048576,
+  file_type: FileType = Field(..., description="Type of the file")
-  "mime_type": "application/pdf"
+  extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
-}
+  metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
  detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
  file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
  encoding: str = Field(default="utf-8", description="Character encoding for text files")
  file_size: int = Field(..., ge=0, description="File size in bytes")
  mime_type: str = Field(..., description="MIME type detected")
  @field_validator('filepath')
  @classmethod
  def validate_filepath(cls, v: str) -> str:
    """Validate filepath format."""
    if not v.strip():
      raise ValueError("Filepath cannot be empty")
    return v.strip()
  @field_validator('filename')
  @classmethod
  def validate_filename(cls, v: str) -> str:
    """Validate filename format."""
    if not v.strip():
      raise ValueError("Filename cannot be empty")
    return v.strip()
 ```
 #### Processing Jobs Collection
 Tracks processing status and lifecycle:
-```json
+```python
-{
+class ProcessingJob(BaseModel):
-  "_id": "ObjectId",
+  """
-  "file_id": "reference_to_files_collection",
+  Model for processing jobs stored in the 'processing_jobs' collection.
-  "status": "completed",
+
-  // pending, processing, completed, failed
+  Tracks the lifecycle and status of document processing tasks.
-  "task_id": "celery_task_uuid",
+  """
-  "created_at": "2024-01-15T10:29:00Z",
+  
-  "started_at": "2024-01-15T10:29:30Z",
+  id: Optional[PyObjectId] = Field(default=None, alias="_id")
-  "completed_at": "2024-01-15T10:30:00Z",
+  file_id: PyObjectId = Field(..., description="Reference to file document")
-  "error_message": null
+  status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
-}
+  task_id: Optional[str] = Field(default=None, description="Celery task UUID")
  created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
  started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
  completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
  error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
  @field_validator('error_message')
  @classmethod
  def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
    """Clean up error message."""
    if v is not None:
      return v.strip() if v.strip() else None
    return v
 ```
 ### Data Storage Strategy
 - **Choice**: Three separate collections for files, content, and processing status
 - **Rationale**: Normalization prevents content duplication when multiple files have identical content
 - **Benefits**:
    - Content deduplication via SHA256 hash
    - Better query performance for metadata vs content searches
    - Clear separation of concerns between file metadata, content, and processing lifecycle
    - Multiple files can reference the same content (e.g., identical copies in different locations)
 ### Content Storage Location
 - **Choice**: Store extracted content in separate `document_contents` collection
 - **Rationale**: Content normalization and deduplication
 - **Benefits**: 
    - Single content storage per unique file hash
    - Multiple file entries can reference same content
    - Efficient storage for duplicate files
 ### Supported File Types (Initial Implementation)
 - **Text Files** (`.txt`): Direct content reading
@@ -306,7 +317,7 @@ Tracks processing status and lifecycle:
 #### Watchdog Implementation
- **Choice**: Dedicated observer thread (Option A)
+- **Choice**: Dedicated observer thread
 - **Rationale**: Standard approach, clean separation of concerns
 - **Implementation**: Watchdog observer runs in separate thread from FastAPI
@@ -327,17 +338,94 @@ Tracks processing status and lifecycle:
 #### Content Storage Location
- **Choice**: Store extracted content in `files` collection
+- **Choice**: Store files in the file system, using the SHA256 hash as filename
- **Rationale**: Content is intrinsic property of the file
+- **Rationale**: MongoDB is not meant for storing large files, and the file system performs better. Files remain in the file system for easy
- **Benefits**: Single query to get file + content, simpler data model
+  access.
-### Implementation Order
+#### Repository and Services Implementation
 - **Choice**: Synchronous implementation using pymongo
 - **Rationale**: Full compatibility with Celery workers and simplified workflow
 - **Implementation**: All repositories and services operate synchronously for seamless integration
 ### Implementation Status
 1. ✅ Pydantic models for MongoDB collections
-2. ✅ Repository layer for data access (files + processing_jobs)
+2. ✅ Repository layer for data access (files + processing_jobs + users + documents) - synchronous
-3. ✅ Celery tasks for document processing
+3. ✅ Service layer for business logic (auth, user, document, job) - synchronous
-4. ✅ Watchdog file monitoring implementation
+4. ✅ Celery tasks for document processing
-5. ✅ FastAPI integration and startup coordination
+5. ✅ Watchdog file monitoring implementation
 6. ✅ FastAPI integration and startup coordination
 ## Job Management Layer
 ### Repository Pattern Implementation
 The job management system follows the repository pattern for clean separation between data access and business logic.
 #### JobRepository
 Handles direct MongoDB operations for processing jobs using synchronous pymongo:
 **CRUD Operations:**
 - `create_job()` - Create new processing job with automatic `created_at` timestamp
 - `get_job_by_id()` - Retrieve job by ObjectId
 - `update_job_status()` - Update job status with automatic timestamp management
 - `delete_job()` - Remove job from database
 - `get_jobs_by_file_id()` - Get all jobs for specific file
 - `get_jobs_by_status()` - Get jobs filtered by processing status
 **Automatic Timestamp Management:**
 - `created_at`: Set automatically during job creation
 - `started_at`: Set automatically when status changes to PROCESSING  
 - `completed_at`: Set automatically when status changes to COMPLETED or FAILED
 #### JobService
 Provides synchronous business logic layer with strict status transition validation:
 **Status Transition Methods:**
 - `mark_job_as_started()` - PENDING → PROCESSING
 - `mark_job_as_completed()` - PROCESSING → COMPLETED
 - `mark_job_as_failed()` - PROCESSING → FAILED
 **Validation Rules:**
 - Strict status transitions (invalid transitions raise exceptions)
 - Job existence verification before any operation
 - Automatic timestamp management through repository layer
 #### Custom Exceptions
 **InvalidStatusTransitionError**: Raised for invalid status transitions  
 **JobRepositoryError**: Raised for MongoDB operation failures
 #### Valid Status Transitions
 ```
 PENDING → PROCESSING    (via mark_job_as_started)
 PROCESSING → COMPLETED  (via mark_job_as_completed)
 PROCESSING → FAILED     (via mark_job_as_failed)
 ```
 All other transitions are forbidden and will raise `InvalidStatusTransitionError`.
 ### File Structure
 ```
 src/file-processor/app/
 ├── database/repositories/
 │   ├── job_repository.py           # JobRepository class (synchronous)
 │   ├── user_repository.py          # UserRepository class (synchronous)
 │   ├── document_repository.py      # DocumentRepository class (synchronous)
 │   └── file_repository.py          # FileRepository class (synchronous)
 ├── services/  
 │   ├── job_service.py              # JobService class (synchronous)
 │   ├── auth_service.py             # AuthService class (synchronous)
 │   ├── user_service.py             # UserService class (synchronous)
 │   └── document_service.py         # DocumentService class (synchronous)
 └── exceptions/
    └── job_exceptions.py           # Custom exceptions
 ```
 ### Processing Pipeline Features
@@ -346,87 +434,7 @@ Tracks processing status and lifecycle:
 - **Status Tracking**: Real-time processing status via `processing_jobs` collection
 - **Extensible Metadata**: Flexible metadata storage per file type
 - **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches
-
+- **Synchronous Operations**: All database operations use pymongo for Celery compatibility
 ## Document Service Architecture
 ### Service Overview
 The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories.
 ### Service Design
 - **Architecture Pattern**: Service orchestration with separate repositories
 - **Transaction Support**: MongoDB ACID transactions for data consistency
 - **Content Deduplication**: Multiple files can reference the same content via SHA256 hash
 - **Error Handling**: MongoDB standard exceptions with transaction rollback
 ### Document Service (`document_service.py`)
 Orchestrates operations between file and content repositories while maintaining data consistency.
 #### Core Functionality
 ##### `create_document(file_path: str, file_bytes: bytes, encoding: str)`
 Creates a new document with automatic attribute calculation and content deduplication.
 **Automatic Calculations:**
 - `file_hash`: SHA256 hash of file bytes
 - `file_type`: Detection based on file extension 
 - `mime_type`: Detection via `python-magic` library
 - `file_size`: Length of provided bytes
 - `detected_at`: Current timestamp
 - `metadata`: Empty dictionary (reserved for future extension)
 **Deduplication Logic:**
 1. Calculate SHA256 hash of file content
 2. Check if `DocumentContent` with this hash already exists
 3. If EXISTS: Create only `FileDocument` referencing existing content
 4. If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction
 **Transaction Flow:**
 ```
 BEGIN TRANSACTION
  IF content_exists(file_hash):
    CREATE FileDocument with content reference
  ELSE:
    CREATE DocumentContent
    CREATE FileDocument with content reference
 COMMIT TRANSACTION
 ```
 #### Available Methods
 - `create_document(file_path, file_bytes, encoding)`: Create with deduplication
 - `get_document_by_id(document_id)`: Retrieve by document ID
 - `get_document_by_hash(file_hash)`: Retrieve by file hash
 - `get_document_by_filepath(filepath)`: Retrieve by file path
 - `list_documents(skip, limit)`: Paginated document listing
 - `count_documents()`: Total document count
 - `update_document(document_id, update_data)`: Update document metadata
 - `delete_document(document_id)`: Remove document and orphaned content
 ### Repository Dependencies
 The document service coordinates two existing repositories:
 #### File Repository (`file_repository.py`)
 - `create_document()`, `find_document_by_id()`, `find_document_by_hash()`
 - `find_document_by_filepath()`, `find_document_by_name()`
 - `list_documents()`, `count_documents()`
 - `update_document()`, `delete_document()`
 #### Document Content Repository (`document_content_repository.py`)
 - `create_document_content()`, `find_document_content_by_id()`
 - `find_document_content_by_file_hash()`, `content_exists()`
 - `update_document_content()`, `delete_document_content()`
 - `list_document_contents()`, `count_document_contents()`
 ### Dependencies
 - `python-magic`: MIME type detection
 - `hashlib`: SHA256 hashing (standard library)
 - `pymongo`: MongoDB transactions support
 ## Key Implementation Notes
@@ -449,6 +457,7 @@ The document service coordinates two existing repositories:
 - **Package Manager**: pip (standard)
 - **External Dependencies**: Listed in each service's requirements.txt
 - **Standard Library First**: Prefer standard library when possible
 - **Database Driver**: pymongo for synchronous MongoDB operations
 ### Testing Strategy
@@ -473,6 +482,7 @@ The document service coordinates two existing repositories:
 12. **Content in Files Collection**: Extracted content stored with file metadata
 13. **Direct Task Dispatch**: File watcher directly creates Celery tasks
 14. **SHA256 Duplicate Detection**: Prevents reprocessing identical files
 15. **Synchronous Implementation**: All repositories and services use pymongo for Celery compatibility
 ### Development Process Requirements
@@ -483,21 +493,15 @@ The document service coordinates two existing repositories:
 ### Next Implementation Steps
-1. ✅ Create docker-compose.yml with all services => Done
+1. **TODO**: Complete file processing pipeline =>
-2. ✅ Define user management and authentication architecture => Done
+    1. ✅ Create Pydantic models for files and processing_jobs collections
-3. ✅ Implement user models and authentication services =>
+    2. ✅ Implement repository layer for file and processing job data access (synchronous)
-    1. models/user.py => Done
+    3. ✅ Implement service layer for business logic (synchronous)
-    2. models/auth.py => Done
+    4. ✅ Create Celery tasks for document processing (.txt, .pdf, .docx)
-    3. database/repositories/user_repository.py => Done
+    5. ✅ Implement Watchdog file monitoring with dedicated observer
-4. ✅ Add automatic admin user creation if it does not exists => Done
+    6. ✅ Integrate file watcher with FastAPI startup
-5. **IN PROGRESS**: Implement file processing pipeline =>
+2. Create protected API routes for user management
-    1. Create Pydantic models for files and processing_jobs collections
+3. Build React monitoring interface with authentication
    2. Implement repository layer for file and processing job data access
    3. Create Celery tasks for document processing (.txt, .pdf, .docx)
    4. Implement Watchdog file monitoring with dedicated observer
    5. Integrate file watcher with FastAPI startup
 6. Create protected API routes for user management
 7. Build React monitoring interface with authentication
 ## Annexes
@@ -586,4 +590,4 @@ docker-compose up --scale worker=3
 - **file-processor**: Hot-reload enabled via `--reload` flag
    - Code changes in `src/file-processor/app/` automatically restart FastAPI
 - **worker**: No hot-reload (manual restart required for stability)
-    - Code changes in `src/worker/tasks/` require: `docker-compose restart worker`
+    - Code changes in `src/worker/tasks/` require: `docker-compose restart worker`
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,7 +19,7 @@ services:
      MONGO_INITDB_ROOT_PASSWORD: password123
      MONGO_INITDB_DATABASE: mydocmanager
    volumes:
-      - mongodb-data:/data/db
+      - ./volumes/db:/data/db
    networks:
      - mydocmanager-network
@@ -34,10 +34,12 @@ services:
    environment:
      - REDIS_URL=redis://redis:6379/0
      - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
-      - PYTHONPATH=/app
+      - PYTHONPATH=/app:/tasks  # Added /tasks to Python path
    volumes:
      - ./src/file-processor:/app
      - ./src/worker/tasks:/app/tasks          # <- Added: shared access to worker tasks
      - ./volumes/watched_files:/watched_files
      - ./volumes/objects:/objects
    depends_on:
      - redis
      - mongodb
@@ -56,14 +58,29 @@ services:
      - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
      - PYTHONPATH=/app
    volumes:
-      - ./src/worker/tasks:/app
+      - ./src/worker:/app
      - ./src/file-processor/app:/app/app     # <- Added: shared access file-processor app
      - ./volumes/watched_files:/watched_files
    depends_on:
      - redis
      - mongodb
    networks:
      - mydocmanager-network
-    command: celery -A main worker --loglevel=info
+    command: celery -A tasks.main worker --loglevel=info
  # Frontend - React application with Vite
  frontend:
    build:
      context: ./src/frontend
      dockerfile: Dockerfile
    container_name: mydocmanager-frontend
    ports:
      - "5173:5173"
    volumes:
      - ./src/frontend:/app
      - /app/node_modules  # Anonymous volume to prevent node_modules override
    networks:
      - mydocmanager-network
 volumes:
  mongodb-data:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,30 @@
 amqp==5.3.1
 annotated-types==0.7.0
 anyio==4.10.0
 asgiref==3.9.1
 bcrypt==4.3.0
 billiard==4.2.1
 celery==5.5.3
 certifi==2025.8.3
 cffi==2.0.0
 click==8.2.1
 click-didyoumean==0.3.1
 click-plugins==1.1.1.2
 click-repl==0.3.0
 cryptography==46.0.1
 dnspython==2.8.0
 ecdsa==0.19.1
 email-validator==2.3.0
 fastapi==0.116.1
 h11==0.16.0
 hiredis==3.2.1
 httpcore==1.0.9
 httptools==0.6.4
 httpx==0.28.1
 idna==3.10
 importlib_metadata==8.7.0
 iniconfig==2.1.0
 izulu==0.50.0
 kombu==5.5.4
 mongomock==4.3.0
 mongomock-motor==0.0.36
@@ -23,9 +33,13 @@ packaging==25.0
 pipdeptree==2.28.0
 pluggy==1.6.0
 prompt_toolkit==3.0.52
 pyasn1==0.6.1
 pycparser==2.23
 pycron==3.2.0
 pydantic==2.11.9
 pydantic_core==2.33.2
 Pygments==2.19.2
 PyJWT==2.10.1
 pymongo==4.15.1
 pytest==8.4.2
 pytest-asyncio==1.2.0
@@ -35,6 +49,8 @@ python-dotenv==1.1.1
 python-magic==0.4.27
 pytz==2025.2
 PyYAML==6.0.2
 redis==6.4.0
 rsa==4.9.1
 sentinels==1.1.1
 six==1.17.0
 sniffio==1.3.1
@@ -45,6 +61,8 @@ tzdata==2025.2
 uvicorn==0.35.0
 uvloop==0.21.0
 vine==5.1.0
 watchdog==6.0.0
 watchfiles==1.1.0
 wcwidth==0.2.13
 websockets==15.0.1
 zipp==3.23.0
--- a/src/file-processor/Dockerfile
+++ b/src/file-processor/Dockerfile
@@ -3,6 +3,12 @@ FROM python:3.12-slim
 # Set working directory
 WORKDIR /app
 # Install libmagic
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libmagic1 \
    file \
 && rm -rf /var/lib/apt/lists/*
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
--- a/src/file-processor/app/api/init.py
+++ b/src/file-processor/app/api/init.py
--- a/src/file-processor/app/api/dependencies.py
+++ b/src/file-processor/app/api/dependencies.py
@@ -0,0 +1,100 @@
 import jwt
 from fastapi import Depends, HTTPException
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from jwt import InvalidTokenError
 from starlette import status
 from app.config import settings
 from app.database.connection import get_database
 from app.models.auth import UserRole
 from app.models.user import UserInDB
 from app.services.auth_service import AuthService
 from app.services.user_service import UserService
 security = HTTPBearer()
 def get_auth_service() -> AuthService:
     """Dependency provider that builds a new AuthService instance."""
     service = AuthService()
     return service
 def get_user_service() -> UserService:
     """Dependency provider that builds a UserService on top of get_database()."""
     return UserService(get_database())
 def get_current_user(
         credentials: HTTPAuthorizationCredentials = Depends(security),
         user_service: UserService = Depends(get_user_service)
 ) -> UserInDB:
     """
     Dependency resolving the authenticated user from the bearer JWT.

     Args:
         credentials: HTTP Bearer credentials carrying the encoded token
         user_service: User service used to look the user up by username

     Returns:
         UserInDB: The active user named by the token's "sub" claim

     Raises:
         HTTPException: 401 if the token cannot be decoded, has no "sub"
             claim, or names an unknown user; 400 if the user is inactive
     """
     # Every authentication failure is reported with the same 401 response.
     unauthorized = HTTPException(
         status_code=status.HTTP_401_UNAUTHORIZED,
         detail="Could not validate credentials",
         headers={"WWW-Authenticate": "Bearer"},
     )

     try:
         claims = jwt.decode(
             credentials.credentials,
             settings.get_jwt_secret_key(),
             algorithms=[settings.get_jwt_algorithm()]
         )
     except InvalidTokenError:
         raise unauthorized

     subject = claims.get("sub")
     if subject is None:
         raise unauthorized

     account = user_service.get_user_by_username(subject)
     if account is None:
         raise unauthorized

     if account.is_active:
         return account

     # A known but deactivated account is rejected with 400, not 401.
     raise HTTPException(
         status_code=status.HTTP_400_BAD_REQUEST,
         detail="Inactive user"
     )
 def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
     """
     Dependency that only lets users with the admin role through.

     Args:
         current_user: Current authenticated user

     Returns:
         UserInDB: The same user, when its role is UserRole.ADMIN

     Raises:
         HTTPException: 403 if the user's role is not admin
     """
     if current_user.role == UserRole.ADMIN:
         return current_user

     raise HTTPException(
         status_code=status.HTTP_403_FORBIDDEN,
         detail="Not enough permissions"
     )
--- a/src/file-processor/app/api/routes/init.py
+++ b/src/file-processor/app/api/routes/init.py
--- a/src/file-processor/app/api/routes/auth.py
+++ b/src/file-processor/app/api/routes/auth.py
@@ -0,0 +1,80 @@
 from fastapi import APIRouter, Depends, HTTPException, status
 from fastapi.security import OAuth2PasswordRequestForm
 from app.api.dependencies import get_auth_service, get_current_user, get_user_service
 from app.models.auth import LoginResponse, UserResponse
 from app.models.user import UserInDB
 from app.services.auth_service import AuthService
 from app.services.user_service import UserService
 router = APIRouter(tags=["authentication"])
@router.post("/login", response_model=LoginResponse)
def login(
    form_data: OAuth2PasswordRequestForm = Depends(),
    auth_service: AuthService = Depends(get_auth_service),
    user_service: UserService = Depends(get_user_service),
):
    """
    Check the submitted credentials and issue a JWT access token.

    Args:
        form_data: OAuth2 password form data (username and password)
        auth_service: Auth service used to verify the password and sign the token
        user_service: User service used to look the user up by username

    Returns:
        LoginResponse: Access token plus the user's public profile

    Raises:
        HTTPException: 401 if the user is unknown, inactive, or the password
            does not match
    """
    # One shared error for every failure, so the response does not reveal
    # which of the checks rejected the login.
    auth_failure = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Incorrect username or password",
        headers={"WWW-Authenticate": "Bearer"},
    )

    account = user_service.get_user_by_username(form_data.username)
    if not account or not account.is_active:
        raise auth_failure

    # The password is only verified for an existing, active account.
    if not auth_service.verify_user_password(form_data.password, account.hashed_password):
        raise auth_failure

    token = auth_service.create_access_token(data={"sub": account.username})
    profile = UserResponse(
        _id=account.id,
        username=account.username,
        email=account.email,
        role=account.role,
        is_active=account.is_active,
        created_at=account.created_at,
        updated_at=account.updated_at,
    )
    return LoginResponse(access_token=token, user=profile)
@router.get("/me", response_model=UserResponse)
def get_current_user_profile(current_user: UserInDB = Depends(get_current_user)):
    """
    Return the profile of the user making the request.

    Args:
        current_user: Current authenticated user

    Returns:
        UserResponse: Profile built from the user's id, username, email,
            role, active flag and timestamps
    """
    profile = UserResponse(
        _id=current_user.id,
        username=current_user.username,
        email=current_user.email,
        role=current_user.role,
        is_active=current_user.is_active,
        created_at=current_user.created_at,
        updated_at=current_user.updated_at,
    )
    return profile
--- a/src/file-processor/app/api/routes/users.py
+++ b/src/file-processor/app/api/routes/users.py
@@ -0,0 +1,172 @@
 from fastapi import APIRouter, Depends, HTTPException
 from starlette import status
 from app.api.dependencies import get_admin_user, get_user_service
 from app.models.auth import UserResponse, MessageResponse
 from app.models.types import PyObjectId
 from app.models.user import UserInDB, UserCreate, UserUpdate
 from app.services.user_service import UserService
 router = APIRouter(tags=["users"])
@router.get("", response_model=list[UserResponse])
def list_users(
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    List all users (admin only).

    Args:
        admin_user: Current admin user (only used to enforce the admin check)
        user_service: User service instance

    Returns:
        list[UserResponse]: All users, without sensitive data
    """
    # UserInDB carries the password hash, so it must never be the response
    # model; each stored user is converted to the public UserResponse shape.
    return [
        UserResponse(
            _id=user.id,
            username=user.username,
            email=user.email,
            role=user.role,
            is_active=user.is_active,
            created_at=user.created_at,
            updated_at=user.updated_at
        )
        for user in user_service.list_users()
    ]
@router.get("/{user_id}", response_model=UserResponse)
def get_user_by_id(
    user_id: PyObjectId,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    Get specific user by ID (admin only).

    Args:
        user_id: User ID to retrieve
        admin_user: Current admin user (only used to enforce the admin check)
        user_service: User service instance

    Returns:
        UserResponse: User information without sensitive data

    Raises:
        HTTPException: 404 if user not found
    """
    user = user_service.get_user_by_id(str(user_id))
    if not user:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    # Build the public model explicitly (as the create/update routes do)
    # instead of relying on implicit coercion of the storage model, which
    # depends on how the "_id" alias is configured on UserResponse.
    return UserResponse(
        _id=user.id,
        username=user.username,
        email=user.email,
        role=user.role,
        is_active=user.is_active,
        created_at=user.created_at,
        updated_at=user.updated_at
    )
@router.post("", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
def create_user(
    user_data: UserCreate,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service),
):
    """
    Create a new user (admin only).

    Args:
        user_data: User creation data
        admin_user: Current admin user (only used to enforce the admin check)
        user_service: User service instance

    Returns:
        UserResponse: Created user information without sensitive data

    Raises:
        HTTPException: 400 carrying the message of any ValueError raised
            while creating the user or building the response
    """
    try:
        created = user_service.create_user(user_data)
        # Kept inside the try block so a ValueError raised while building
        # the response is reported the same way as a creation failure.
        response = UserResponse(
            _id=created.id,
            username=created.username,
            email=created.email,
            role=created.role,
            is_active=created.is_active,
            created_at=created.created_at,
            updated_at=created.updated_at,
        )
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc)
        )
    return response
@router.put("/{user_id}", response_model=UserResponse)
def update_user(
    user_id: PyObjectId,
    user_data: UserUpdate,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service),
):
    """
    Update an existing user (admin only).

    Args:
        user_id: User ID to update
        user_data: User update data
        admin_user: Current admin user (only used to enforce the admin check)
        user_service: User service instance

    Returns:
        UserResponse: Updated user information without sensitive data

    Raises:
        HTTPException: 404 if the service returns no user; 400 carrying the
            message of any ValueError raised during the update
    """
    try:
        updated = user_service.update_user(str(user_id), user_data)
        response = None
        if updated:
            response = UserResponse(
                _id=updated.id,
                username=updated.username,
                email=updated.email,
                role=updated.role,
                is_active=updated.is_active,
                created_at=updated.created_at,
                updated_at=updated.updated_at,
            )
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc)
        )

    # No response was built: the service found nothing to update.
    if response is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )
    return response
@router.delete("/{user_id}", response_model=MessageResponse)
def delete_user(
  user_id: PyObjectId,
  admin_user: UserInDB = Depends(get_admin_user),
  user_service: UserService = Depends(get_user_service)
):
  """
  Delete a user by ID (admin only).

  Args:
      user_id: Identifier of the user to delete; passed to the service as a string
      admin_user: Resolved through the get_admin_user dependency; not read in the body
      user_service: Service that performs the deletion

  Returns:
      MessageResponse: Confirmation message

  Raises:
      HTTPException: 404 when the service reports that nothing was deleted
  """
  deleted = user_service.delete_user(str(user_id))
  if deleted:
    return MessageResponse(message="User successfully deleted")

  raise HTTPException(
    status_code=status.HTTP_404_NOT_FOUND,
    detail="User not found"
  )
--- a/src/file-processor/app/config/settings.py
+++ b/src/file-processor/app/config/settings.py
@@ -6,7 +6,6 @@ using simple os.getenv() approach without external validation libraries.
 """
 import os
 from typing import Optional
 def get_mongodb_url() -> str:
@@ -31,6 +30,26 @@ def get_mongodb_database_name() -> str:
  return os.getenv("MONGODB_DATABASE", "mydocmanager")
 def get_redis_url() -> str:
   """
   Get the Redis URL from environment variables.

   Returns:
       str: Value of REDIS_URL, or "redis://localhost:6379/0" when it is not set
   """
   default_url = "redis://localhost:6379/0"
   return os.environ.get("REDIS_URL", default_url)
 # def get_redis_host() -> str:
 #   redis_url = get_redis_url()
 #   if redis_url.startswith("redis://"):
 #     return redis_url.split("redis://")[1].split("/")[0]
 #   else:
 #     return redis_url
 #
 #
 # def get_redis_port() -> int:
 #   redis_url = get_redis_url()
 #   if redis_url.startswith("redis://"):
 #     return int(redis_url.split("redis://")[1].split("/")[0].split(":")[1])
 #   else:
 #     return int(redis_url.split(":")[1])
 def get_jwt_secret_key() -> str:
  """
  Get JWT secret key from environment variables.
@@ -82,4 +101,19 @@ def is_development_environment() -> bool:
  Returns:
      bool: True if development environment
  """
-  return os.getenv("ENVIRONMENT", "development").lower() == "development"
+  return os.getenv("ENVIRONMENT", "development").lower() == "development"
 def get_objects_folder() -> str:
   """
   Get the objects folder (vault) path from environment variables.

   Returns:
       str: Value of OBJECTS_FOLDER, or "/objects" when it is not set
   """
   default_folder = "/objects"
   return os.environ.get("OBJECTS_FOLDER", default_folder)
 def watch_directory() -> str:
   """
   Get the directory to monitor for new files.

   Returns:
       str: Value of WATCH_DIRECTORY, or "/watched_files" when it is not set
   """
   default_directory = "/watched_files"
   return os.environ.get("WATCH_DIRECTORY", default_directory)
--- a/src/file-processor/app/database/connection.py
+++ b/src/file-processor/app/database/connection.py
@@ -7,6 +7,7 @@ The application will terminate if MongoDB is not accessible at startup.
 import sys
 from typing import Optional
 from pymongo import MongoClient
 from pymongo.database import Database
 from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
@@ -107,6 +108,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
  return _client
 def get_extra_args(session):
   """
   Build the optional keyword arguments for a collection call.

   Args:
       session: Session object to forward, or None

   Returns:
       dict: {"session": session} when a session is given, otherwise an empty
       dict, so the result can be splatted with ** into a call unconditionally
   """
   # Only None means "no session"; any other value is forwarded as-is.
   return {} if session is None else {"session": session}
 def test_database_connection() -> bool:
  """
  Test if database connection is working.
@@ -122,4 +132,4 @@ def test_database_connection() -> bool:
    db.command('ping')
    return True
  except Exception:
-    return False
+    return False
--- a/src/file-processor/app/database/repositories/document_content_repository.py
+++ b/src/file-processor/app/database/repositories/document_content_repository.py
@@ -1,214 +0,0 @@
 from typing import List, Optional
 from datetime import datetime
 from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
 from pymongo.errors import DuplicateKeyError, PyMongoError
 from bson import ObjectId
 from app.models.document import DocumentContent
 class DocumentContentRepository:
   """
   Repository class for document content CRUD operations in MongoDB.

   All access goes through the ``document_contents`` collection of the
   injected database. Every data method is a coroutine and must be awaited.
   """

   def __init__(self, database: AsyncIOMotorDatabase):
     """
     Initialize repository with database dependency.

     Index creation is intentionally not triggered here: ``_ensure_indexes``
     is a coroutine, so calling it from this synchronous constructor would
     only create a coroutine object that is never awaited. Await
     ``initialize()`` after construction instead.

     Args:
         database (AsyncIOMotorDatabase): MongoDB database instance
     """
     self.db = database
     self.collection: AsyncIOMotorCollection = database.document_contents

   async def initialize(self):
     """
     Initialize repository by ensuring required indexes exist.

     Should be awaited after repository instantiation to set up indexes.

     Returns:
         DocumentContentRepository: This instance, to allow chained setup
     """
     await self._ensure_indexes()
     return self

   async def _ensure_indexes(self):
     """
     Ensure required database indexes exist.

     Creates a unique index on the file_hash field to prevent duplicates.
     """
     try:
       await self.collection.create_index("file_hash", unique=True)
     except PyMongoError:
       # Index might already exist, ignore error
       pass

   async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
     """
     Create a new document content in the database.

     Args:
         document_content (DocumentContent): Document content data

     Returns:
         DocumentContent: Created document content with database ID

     Raises:
         DuplicateKeyError: If file_hash already exists
         ValueError: If the insert fails with any other PyMongoError
     """
     document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

     # Remove _id if it's None to let MongoDB generate it
     if document_dict.get("_id") is None:
       document_dict.pop("_id", None)

     try:
       result = await self.collection.insert_one(document_dict)
       document_dict["_id"] = result.inserted_id
       return DocumentContent(**document_dict)
     except DuplicateKeyError as e:
       # Chain the driver error so the original traceback stays available.
       raise DuplicateKeyError(
         f"Document content with file_hash '{document_content.file_hash}' already exists: {e}"
       ) from e
     except PyMongoError as e:
       raise ValueError(f"Failed to create document content: {e}") from e

   async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
     """
     Find document content by ID.

     Args:
         document_id (str): Document content ID to search for

     Returns:
         DocumentContent or None: Document content if found; None when the ID
         is not a valid ObjectId, nothing matches, or a PyMongoError occurs
     """
     try:
       if not ObjectId.is_valid(document_id):
         return None

       document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
       if document_doc:
         return DocumentContent(**document_doc)
       return None
     except PyMongoError:
       return None

   async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
     """
     Find document content by file hash.

     Args:
         file_hash (str): File hash to search for

     Returns:
         DocumentContent or None: Document content if found; None when nothing
         matches or a PyMongoError occurs
     """
     try:
       document_doc = await self.collection.find_one({"file_hash": file_hash})
       if document_doc:
         return DocumentContent(**document_doc)
       return None
     except PyMongoError:
       return None

   async def content_exists(self, file_hash: str) -> bool:
     """
     Check if document content exists by file hash.

     Args:
         file_hash (str): File hash to check

     Returns:
         bool: True if at least one document matches; False otherwise or when
         a PyMongoError occurs
     """
     try:
       count = await self.collection.count_documents({"file_hash": file_hash})
       return count > 0
     except PyMongoError:
       return False

   async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
     """
     Update document content information.

     Args:
         document_id (str): Document content ID to update
         update_data (dict): Updated document content data; entries whose
             value is None and the "_id" key are ignored

     Returns:
         DocumentContent or None: Updated document content if found; None when
         the ID is invalid, nothing matches, or a PyMongoError occurs
     """
     try:
       if not ObjectId.is_valid(document_id):
         return None

       # Remove None values and _id from update data
       clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}

       # Nothing left to write: return the current state instead of issuing
       # an empty $set.
       if not clean_update_data:
         return await self.find_document_content_by_id(document_id)

       result = await self.collection.find_one_and_update(
         {"_id": ObjectId(document_id)},
         {"$set": clean_update_data},
         return_document=True
       )

       if result:
         return DocumentContent(**result)
       return None
     except PyMongoError:
       return None

   async def delete_document_content(self, document_id: str) -> bool:
     """
     Delete document content from database.

     Args:
         document_id (str): Document content ID to delete

     Returns:
         bool: True if a document was deleted; False when the ID is invalid,
         nothing matched, or a PyMongoError occurs
     """
     try:
       if not ObjectId.is_valid(document_id):
         return False

       result = await self.collection.delete_one({"_id": ObjectId(document_id)})
       return result.deleted_count > 0
     except PyMongoError:
       return False

   async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
     """
     List document contents with pagination, sorted by _id descending.

     Args:
         skip (int): Number of document contents to skip (default: 0)
         limit (int): Maximum number of document contents to return (default: 100)

     Returns:
         List[DocumentContent]: List of document contents; empty when a
         PyMongoError occurs
     """
     try:
       cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
       document_docs = await cursor.to_list(length=limit)
       return [DocumentContent(**document_doc) for document_doc in document_docs]
     except PyMongoError:
       return []

   async def count_document_contents(self) -> int:
     """
     Count total number of document contents.

     Returns:
         int: Total number of document contents in the collection; 0 when a
         PyMongoError occurs
     """
     try:
       return await self.collection.count_documents({})
     except PyMongoError:
       return 0
--- a/src/file-processor/app/database/repositories/document_repository.py
+++ b/src/file-processor/app/database/repositories/document_repository.py
@@ -6,9 +6,13 @@ in MongoDB with proper error handling and type safety.
 """
 from typing import Optional, List
 from bson import ObjectId
 from pymongo.collection import Collection
 from pymongo.database import Database
 from pymongo.errors import DuplicateKeyError, PyMongoError
-from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
+
 from app.database.connection import get_extra_args
 from app.models.document import FileDocument
 from app.utils.document_matching import fuzzy_matching, subsequence_matching
@@ -34,52 +38,49 @@ class FileDocumentRepository:
  with proper error handling and data validation.
  """
-  def __init__(self, database: AsyncIOMotorDatabase):
+  def __init__(self, database: Database):
    """Initialize file repository with database connection."""
    self.db = database
-    self.collection: AsyncIOMotorCollection = self.db.files
+    self.collection: Collection = self.db.documents
    self._ensure_indexes()
-  async def initialize(self):
+  def initialize(self):
    """
    Initialize repository by ensuring required indexes exist.
    Should be called after repository instantiation to setup database indexes.
    """
-    await self._ensure_indexes()
+    self._ensure_indexes()
    return self
-  async def _ensure_indexes(self):
+  def _ensure_indexes(self):
    """
    Ensure required database indexes exist.
    Creates unique index on username field to prevent duplicates.
    """
-    try:
+    pass
      await self.collection.create_index("filepath", unique=True)
    except PyMongoError:
      # Index might already exist, ignore error
      pass
-  async def create_document(self, file_data: FileDocument) -> FileDocument:
+  def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
    """
    Create a new file document in database.
    Args:
        file_data (FileDocument): File document data to create
        session (AsyncIOMotorClientSession, optional): MongoDB session
    Returns:
-        FileDocument: Created file document with database ID
+        FileDocument: Created document with database ID
    Raises:
        ValueError: If file creation fails due to validation
-        DuplicateKeyError: If file with same hash already exists
+        DuplicateKeyError: If a document with same hash already exists
    """
    try:
      file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
      if "_id" in file_dict and file_dict["_id"] is None:
        del file_dict["_id"]
-      result = await self.collection.insert_one(file_dict)
+      result = self.collection.insert_one(file_dict, **get_extra_args(session))
      file_data.id = result.inserted_id
      return file_data
@@ -88,7 +89,7 @@ class FileDocumentRepository:
    except PyMongoError as e:
      raise ValueError(f"Failed to create file document: {e}")
-  async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
+  def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
    """
    Find file document by ID.
@@ -102,7 +103,7 @@ class FileDocumentRepository:
      if not ObjectId.is_valid(file_id):
        return None
-      file_doc = await self.collection.find_one({"_id": ObjectId(file_id)})
+      file_doc = self.collection.find_one({"_id": ObjectId(file_id)})
      if file_doc:
        return FileDocument(**file_doc)
      return None
@@ -110,7 +111,7 @@ class FileDocumentRepository:
    except PyMongoError:
      return None
-  async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
+  def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
    """
    Find file document by file hash to detect duplicates.
@@ -121,7 +122,7 @@ class FileDocumentRepository:
        FileDocument or None: File document if found, None otherwise
    """
    try:
-      file_doc = await self.collection.find_one({"file_hash": file_hash})
+      file_doc = self.collection.find_one({"file_hash": file_hash})
      if file_doc:
        return FileDocument(**file_doc)
      return None
@@ -129,7 +130,7 @@ class FileDocumentRepository:
    except PyMongoError:
      return None
-  async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
+  def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
    """
    Find file document by exact filepath.
@@ -140,7 +141,7 @@ class FileDocumentRepository:
        FileDocument or None: File document if found, None otherwise
    """
    try:
-      file_doc = await self.collection.find_one({"filepath": filepath})
+      file_doc = self.collection.find_one({"filepath": filepath})
      if file_doc:
        return FileDocument(**file_doc)
      return None
@@ -148,7 +149,7 @@ class FileDocumentRepository:
    except PyMongoError:
      return None
-  async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
+  def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
    """
    Find file documents by filename using fuzzy matching.
@@ -162,8 +163,7 @@ class FileDocumentRepository:
    try:
      # Get all files from database
      cursor = self.collection.find({})
-      all_files = await cursor.to_list(length=None)
+      all_documents = [FileDocument(**file_doc) for file_doc in cursor]
      all_documents = [FileDocument(**file_doc) for file_doc in all_files]
      if isinstance(matching_method, FuzzyMatching):
        return fuzzy_matching(filename, all_documents, matching_method.threshold)
@@ -173,7 +173,7 @@ class FileDocumentRepository:
    except PyMongoError:
      return []
-  async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
+  def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
    """
    List file documents with pagination.
@@ -186,13 +186,12 @@ class FileDocumentRepository:
    """
    try:
      cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1)
-      file_docs = await cursor.to_list(length=limit)
+      return [FileDocument(**doc) for doc in cursor]
      return [FileDocument(**doc) for doc in file_docs]
    except PyMongoError:
      return []
-  async def count_documents(self) -> int:
+  def count_documents(self) -> int:
    """
    Count total number of file documents.
@@ -200,17 +199,18 @@ class FileDocumentRepository:
        int: Total number of file documents in collection
    """
    try:
-      return await self.collection.count_documents({})
+      return self.collection.count_documents({})
    except PyMongoError:
      return 0
-  async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
+  def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
    """
    Update file document with new data.
    Args:
        file_id (str): File document ID to update
        update_data (dict): Fields to update
        session (AsyncIOMotorClientSession, optional): MongoDB session
    Returns:
        FileDocument or None: Updated file document if successful, None otherwise
@@ -223,12 +223,13 @@ class FileDocumentRepository:
      clean_update_data = {k: v for k, v in update_data.items() if v is not None}
      if not clean_update_data:
-        return await self.find_document_by_id(file_id)
+        return self.find_document_by_id(file_id)
-      result = await self.collection.find_one_and_update(
+      result = self.collection.find_one_and_update(
        {"_id": ObjectId(file_id)},
        {"$set": clean_update_data},
-        return_document=True
+        return_document=True,
        **get_extra_args(session)
      )
      if result:
@@ -238,12 +239,13 @@ class FileDocumentRepository:
    except PyMongoError:
      return None
-  async def delete_document(self, file_id: str) -> bool:
+  def delete_document(self, file_id: str, session=None) -> bool:
    """
    Delete file document from database.
    Args:
        file_id (str): File document ID to delete
        session (AsyncIOMotorClientSession, optional): MongoDB session
    Returns:
        bool: True if file was deleted, False otherwise
@@ -252,7 +254,7 @@ class FileDocumentRepository:
      if not ObjectId.is_valid(file_id):
        return False
-      result = await self.collection.delete_one({"_id": ObjectId(file_id)})
+      result = self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
      return result.deleted_count > 0
    except PyMongoError:
--- a/src/file-processor/app/database/repositories/job_repository.py
+++ b/src/file-processor/app/database/repositories/job_repository.py
@@ -0,0 +1,230 @@
 """
 Repository for managing processing jobs in MongoDB.
 This module provides data access layer for ProcessingJob operations
 with automatic timestamp management and error handling.
 """
 from datetime import datetime
 from typing import List, Optional
 from pymongo.collection import Collection
 from pymongo.database import Database
 from pymongo.errors import PyMongoError
 from app.exceptions.job_exceptions import JobRepositoryError
 from app.models.job import ProcessingJob, ProcessingStatus
 from app.models.types import PyObjectId
 class JobRepository:
   """
   Repository for processing job data access operations.

   Provides CRUD operations for ProcessingJob documents stored in the
   ``processing_jobs`` collection, with automatic timestamp management.
   Driver failures are wrapped in JobRepositoryError with the original
   exception chained as the cause.
   """

   def __init__(self, database: Database):
     """
     Initialize repository with MongoDB collection reference.

     Indexes are not created here; call ``initialize()`` for that.
     """
     self.db = database
     self.collection: Collection = self.db.processing_jobs

   def _ensure_indexes(self):
     """
     Ensure required database indexes exist.

     Creates a unique index on the document_id field.
     """
     # NOTE(review): a unique index on document_id allows a single job per
     # document, while find_jobs_by_document_id returns a list - confirm intent.
     try:
       self.collection.create_index("document_id", unique=True)
     except PyMongoError:
       # Index might already exist, ignore error
       pass

   def initialize(self):
     """
     Initialize repository by ensuring required indexes exist.

     Should be called after repository instantiation to set up indexes.

     Returns:
         JobRepository: This instance, to allow chained setup
     """
     self._ensure_indexes()
     return self

   def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
     """
     Create a new processing job in PENDING status.

     Args:
         document_id: Reference to the document the job belongs to
         task_id: Optional Celery task UUID

     Returns:
         The created ProcessingJob

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       job_data = {
         "document_id": document_id,
         "status": ProcessingStatus.PENDING,
         "task_id": task_id,
         "created_at": datetime.now(),
         "started_at": None,
         "completed_at": None,
         "error_message": None
       }

       result = self.collection.insert_one(job_data)
       job_data["_id"] = result.inserted_id

       return ProcessingJob(**job_data)
     except PyMongoError as e:
       raise JobRepositoryError("create_job", e) from e

   def find_job_by_id(self, job_id: PyObjectId) -> Optional[ProcessingJob]:
     """
     Retrieve a job by its ID.

     Args:
         job_id: The job ObjectId

     Returns:
         The ProcessingJob document, or None if no job has this ID

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       job_data = self.collection.find_one({"_id": job_id})
       if job_data:
         return ProcessingJob(**job_data)
       return None
     except PyMongoError as e:
       # Operation label kept as-is; callers may inspect error.operation.
       raise JobRepositoryError("get_job_by_id", e) from e

   def update_job_status(
       self,
       job_id: PyObjectId,
       status: ProcessingStatus,
       error_message: Optional[str] = None
   ) -> Optional[ProcessingJob]:
     """
     Update job status with automatic timestamp management.

     started_at is set when moving to PROCESSING; completed_at is set when
     moving to COMPLETED or FAILED.

     Args:
         job_id: The job ObjectId
         status: New processing status
         error_message: Optional error message for failed jobs

     Returns:
         The updated ProcessingJob, or None if no job has this ID

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       # Prepare update data
       update_data = {"status": status}

       # Set appropriate timestamp based on status
       current_time = datetime.now()
       if status == ProcessingStatus.PROCESSING:
         update_data["started_at"] = current_time
       elif status in (ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
         update_data["completed_at"] = current_time

       # Add error message if provided
       if error_message is not None:
         update_data["error_message"] = error_message

       result = self.collection.find_one_and_update(
         {"_id": job_id},
         {"$set": update_data},
         return_document=True
       )

       if result:
         return ProcessingJob(**result)
       return None
     except PyMongoError as e:
       raise JobRepositoryError("update_job_status", e) from e

   def delete_job(self, job_id: PyObjectId) -> bool:
     """
     Delete a job from the database.

     Args:
         job_id: The job ObjectId

     Returns:
         True if job was deleted, False if not found

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       result = self.collection.delete_one({"_id": job_id})
       return result.deleted_count > 0
     except PyMongoError as e:
       raise JobRepositoryError("delete_job", e) from e

   def find_jobs_by_document_id(self, document_id: PyObjectId) -> List[ProcessingJob]:
     """
     Retrieve all jobs for a specific document.

     Args:
         document_id: The document ObjectId

     Returns:
         List of ProcessingJob documents

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       cursor = self.collection.find({"document_id": document_id})
       # Iterating the cursor happens inside the try so that driver errors
       # raised while fetching are wrapped as well.
       return [ProcessingJob(**job_data) for job_data in cursor]
     except PyMongoError as e:
       # Operation label kept as-is; callers may inspect error.operation.
       raise JobRepositoryError("get_jobs_by_file_id", e) from e

   def get_jobs_by_status(self, status: ProcessingStatus) -> List[ProcessingJob]:
     """
     Retrieve all jobs with a specific status.

     Args:
         status: The processing status to filter by

     Returns:
         List of ProcessingJob documents

     Raises:
         JobRepositoryError: If database operation fails
     """
     try:
       cursor = self.collection.find({"status": status})
       return [ProcessingJob(**job_data) for job_data in cursor]
     except PyMongoError as e:
       raise JobRepositoryError("get_jobs_by_status", e) from e
--- a/src/file-processor/app/database/repositories/user_repository.py
+++ b/src/file-processor/app/database/repositories/user_repository.py
@@ -5,10 +5,12 @@ This module implements the repository pattern for user CRUD operations
 with dependency injection of the database connection using async/await.
 """
 from typing import Optional, List
 from datetime import datetime
 from typing import Optional, List
 from bson import ObjectId
-from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
+from pymongo.collection import Collection
 from pymongo.database import Database
 from pymongo.errors import DuplicateKeyError, PyMongoError
 from app.models.user import UserCreate, UserInDB, UserUpdate
@@ -23,7 +25,7 @@ class UserRepository:
  following the repository pattern with dependency injection and async/await.
  """
-  def __init__(self, database: AsyncIOMotorDatabase):
+  def __init__(self, database: Database):
    """
    Initialize repository with database dependency.
@@ -31,30 +33,30 @@ class UserRepository:
        database (AsyncIOMotorDatabase): MongoDB database instance
    """
    self.db = database
-    self.collection: AsyncIOMotorCollection = database.users
+    self.collection: Collection = database.users
    self._ensure_indexes()
-  async def initialize(self):
+  def initialize(self):
    """
    Initialize repository by ensuring required indexes exist.
    Should be called after repository instantiation to setup database indexes.
    """
-    await self._ensure_indexes()
+    self._ensure_indexes()
    return self
-  async def _ensure_indexes(self):
+  def _ensure_indexes(self):
    """
    Ensure required database indexes exist.
    Creates unique index on username field to prevent duplicates.
    """
    try:
-      await self.collection.create_index("username", unique=True)
+      self.collection.create_index("username", unique=True)
    except PyMongoError:
      # Index might already exist, ignore error
      pass
-  async def create_user(self, user_data: UserCreate) -> UserInDB:
+  def create_user(self, user_data: UserCreate) -> UserInDB:
    """
    Create a new user in the database.
@@ -79,7 +81,7 @@ class UserRepository:
    }
    try:
-      result = await self.collection.insert_one(user_dict)
+      result = self.collection.insert_one(user_dict)
      user_dict["_id"] = result.inserted_id
      return UserInDB(**user_dict)
    except DuplicateKeyError as e:
@@ -87,7 +89,7 @@ class UserRepository:
    except PyMongoError as e:
      raise ValueError(f"Failed to create user: {e}")
-  async def find_user_by_username(self, username: str) -> Optional[UserInDB]:
+  def find_user_by_username(self, username: str) -> Optional[UserInDB]:
    """
    Find user by username.
@@ -98,14 +100,14 @@ class UserRepository:
        UserInDB or None: User if found, None otherwise
    """
    try:
-      user_doc = await self.collection.find_one({"username": username})
+      user_doc = self.collection.find_one({"username": username})
      if user_doc:
        return UserInDB(**user_doc)
      return None
    except PyMongoError:
      return None
-  async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
+  def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
    """
    Find user by ID.
@@ -119,14 +121,14 @@ class UserRepository:
      if not ObjectId.is_valid(user_id):
        return None
-      user_doc = await self.collection.find_one({"_id": ObjectId(user_id)})
+      user_doc = self.collection.find_one({"_id": ObjectId(user_id)})
      if user_doc:
        return UserInDB(**user_doc)
      return None
    except PyMongoError:
      return None
-  async def find_user_by_email(self, email: str) -> Optional[UserInDB]:
+  def find_user_by_email(self, email: str) -> Optional[UserInDB]:
    """
    Find user by email address.
@@ -137,14 +139,14 @@ class UserRepository:
        UserInDB or None: User if found, None otherwise
    """
    try:
-      user_doc = await self.collection.find_one({"email": email})
+      user_doc = self.collection.find_one({"email": email})
      if user_doc:
        return UserInDB(**user_doc)
      return None
    except PyMongoError:
      return None
-  async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
+  def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
    """
    Update user information.
@@ -177,9 +179,9 @@ class UserRepository:
      clean_update_data = {k: v for k, v in update_data.items() if v is not None}
      if not clean_update_data:
-        return await self.find_user_by_id(user_id)
+        return self.find_user_by_id(user_id)
-      result = await self.collection.find_one_and_update(
+      result = self.collection.find_one_and_update(
        {"_id": ObjectId(user_id)},
        {"$set": clean_update_data},
        return_document=True
@@ -192,7 +194,7 @@ class UserRepository:
    except PyMongoError:
      return None
-  async def delete_user(self, user_id: str) -> bool:
+  def delete_user(self, user_id: str) -> bool:
    """
    Delete user from database.
@@ -206,12 +208,12 @@ class UserRepository:
      if not ObjectId.is_valid(user_id):
        return False
-      result = await self.collection.delete_one({"_id": ObjectId(user_id)})
+      result = self.collection.delete_one({"_id": ObjectId(user_id)})
      return result.deleted_count > 0
    except PyMongoError:
      return False
-  async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
+  def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
    """
    List users with pagination.
@@ -224,12 +226,12 @@ class UserRepository:
    """
    try:
      cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
-      user_docs = await cursor.to_list(length=limit)
+      user_docs = cursor.to_list(length=limit)
      return [UserInDB(**user_doc) for user_doc in user_docs]
    except PyMongoError:
      return []
-  async def count_users(self) -> int:
+  def count_users(self) -> int:
    """
    Count total number of users.
@@ -237,11 +239,11 @@ class UserRepository:
        int: Total number of users in database
    """
    try:
-      return await self.collection.count_documents({})
+      return self.collection.count_documents({})
    except PyMongoError:
      return 0
-  async def user_exists(self, username: str) -> bool:
+  def user_exists(self, username: str) -> bool:
    """
    Check if user exists by username.
@@ -252,7 +254,7 @@ class UserRepository:
        bool: True if user exists, False otherwise
    """
    try:
-      count = await self.collection.count_documents({"username": username})
+      count = self.collection.count_documents({"username": username})
      return count > 0
    except PyMongoError:
      return False
--- a/src/file-processor/app/exceptions/init.py
+++ b/src/file-processor/app/exceptions/init.py
--- a/src/file-processor/app/exceptions/job_exceptions.py
+++ b/src/file-processor/app/exceptions/job_exceptions.py
@@ -0,0 +1,38 @@
 """
 Custom exceptions for job management operations.
 This module defines specific exceptions for job processing lifecycle
 and repository operations to provide clear error handling.
 """
 from app.models.job import ProcessingStatus
 class InvalidStatusTransitionError(Exception):
  """
  Signal that a job was asked to move to a status it cannot reach.

  The status the job currently holds and the rejected target status are
  both kept on the exception so callers can inspect them.
  """

  def __init__(self, current_status: ProcessingStatus, target_status: ProcessingStatus):
    """
    Build the error and its human-readable message.

    Args:
        current_status: Status the job holds when the transition is attempted
        target_status: Status that was requested and rejected
    """
    message = f"Invalid status transition from '{current_status}' to '{target_status}'"
    super().__init__(message)
    self.current_status = current_status
    self.target_status = target_status
 class JobRepositoryError(Exception):
  """
  Wrap a failure that happened inside a job repository operation.

  The name of the failed operation and the underlying exception are kept
  on the instance; the message combines both.
  """

  def __init__(self, operation: str, original_error: Exception):
    """
    Build the error from the failed operation and its cause.

    Args:
        operation: Name of the repository operation that failed
        original_error: Exception raised by the underlying call
    """
    detail = str(original_error)
    super().__init__(f"Repository operation '{operation}' failed: {detail}")
    self.operation = operation
    self.original_error = original_error
--- a/src/file-processor/app/file_watcher.py
+++ b/src/file-processor/app/file_watcher.py
@@ -0,0 +1,243 @@
 """
 File watcher implementation with Watchdog observer and ProcessingJob management.
 This module provides real-time file monitoring for document processing.
 When a file is created in the watched directory, it:
 1. Creates a document record via DocumentService
 2. Dispatches a Celery task for processing
 3. Creates a ProcessingJob to track the task lifecycle
 """
 import logging
 import threading
 from pathlib import Path
 from typing import Optional
 from watchdog.events import FileSystemEventHandler, FileCreatedEvent
 from watchdog.observers import Observer
 from app.services.document_service import DocumentService
 from app.services.job_service import JobService
 logger = logging.getLogger(__name__)
 class DocumentFileEventHandler(FileSystemEventHandler):
  """
  Watchdog event handler that dispatches a Celery task for each new document.

  Files whose extension is listed in SUPPORTED_EXTENSIONS are handed to the
  `process_document` Celery task; every other file is ignored. The document
  and job services are stored on the handler but `on_created` does not use
  them yet.
  """
  SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
  def __init__(self, document_service: DocumentService, job_service: JobService):
    """
    Initialize the event handler.
    Args:
        document_service: Service for document management
        job_service: Service for processing job management
    """
    super().__init__()
    self.document_service = document_service
    self.job_service = job_service
  def on_created(self, event: FileCreatedEvent) -> None:
    """
    Handle file creation events.

    Directories and unsupported extensions are skipped. For a supported
    file the `process_document` task is queued with the file path. Any
    error raised while importing or dispatching the task is logged and
    swallowed so that the watcher keeps running.
    Args:
        event: File system event containing file path information
    """
    if event.is_directory:
      return
    filepath = event.src_path
    # Extension matching is case-insensitive ('.PDF' is accepted).
    file_extension = Path(filepath).suffix.lower()
    if file_extension not in self.SUPPORTED_EXTENSIONS:
      logger.info(f"Ignoring unsupported file type: {filepath}")
      return
    logger.info(f"Processing new file: {filepath}")
    try:
      # Kept as a function-scope import, exactly as before.
      from tasks.document_processing import process_document
      task_result = process_document.delay(filepath)
      logger.info(f"Dispatched Celery task with ID: {task_result.id}")
    except Exception:
      # Deliberately not re-raised: an exception escaping the handler would
      # propagate into the thread that dispatches watchdog events.
      logger.exception(f"Failed to process file {filepath}")
 class FileWatcher:
  """
  File system watcher for automatic document processing.

  Owns a watchdog Observer plus a dedicated thread ("FileWatcher-Observer")
  that starts the observer and keeps it alive until `stop()` is called.
  """
  def __init__(
      self,
      watch_directory: str,
      document_service: DocumentService,
      job_service: JobService,
      recursive: bool = True
  ):
    """
    Initialize the file watcher.
    Args:
        watch_directory: Directory path to monitor
        document_service: Service for document management
        job_service: Service for processing job management
        recursive: Whether to watch subdirectories recursively
    Raises:
        ValueError: If the path does not exist or is not a directory
    """
    self.watch_directory = Path(watch_directory)
    self.recursive = recursive
    self.observer: Optional[Observer] = None
    self._observer_thread: Optional[threading.Thread] = None
    self._stop_event = threading.Event()
    # Validate watch directory
    if not self.watch_directory.exists():
      raise ValueError(f"Watch directory does not exist: {watch_directory}")
    if not self.watch_directory.is_dir():
      raise ValueError(f"Watch path is not a directory: {watch_directory}")
    # Create event handler
    self.event_handler = DocumentFileEventHandler(
      document_service=document_service,
      job_service=job_service
    )
    logger.info(f"FileWatcher initialized for directory: {self.watch_directory}")
  def start(self) -> None:
    """
    Start the file watcher in a separate thread.
    Raises:
        RuntimeError: If the watcher is already running
    """
    if self.is_running():
      raise RuntimeError("FileWatcher is already running")
    self.observer = Observer()
    self.observer.schedule(
      self.event_handler,
      str(self.watch_directory),
      recursive=self.recursive
    )
    # Start observer in separate thread
    self._observer_thread = threading.Thread(
      target=self._run_observer,
      name="FileWatcher-Observer"
    )
    # Reset the flag so a watcher that was stopped earlier can be restarted.
    self._stop_event.clear()
    self._observer_thread.start()
    logger.info("FileWatcher started successfully")
  def stop(self, timeout: float = 5.0) -> None:
    """
    Stop the file watcher gracefully.

    If the observer thread is still alive after `timeout`, the references
    to the observer and its thread are kept so that `is_running()` keeps
    reporting the truth and `start()` cannot spawn a second observer.
    Args:
        timeout: Maximum time to wait for graceful shutdown
    """
    if not self.is_running():
      logger.warning("FileWatcher is not running")
      return
    logger.info("Stopping FileWatcher...")
    # Signal stop and wait for observer thread
    self._stop_event.set()
    if self.observer:
      self.observer.stop()
    worker = self._observer_thread
    if worker and worker.is_alive():
      worker.join(timeout=timeout)
      if worker.is_alive():
        logger.warning("FileWatcher thread did not stop gracefully within timeout")
        # Still running: do not pretend the watcher is stopped.
        return
      logger.info("FileWatcher stopped gracefully")
    # Clean up
    self.observer = None
    self._observer_thread = None
  def is_running(self) -> bool:
    """
    Check if the file watcher is currently running.
    Returns:
        True if the watcher is running, False otherwise
    """
    return (
        self.observer is not None
        and self._observer_thread is not None
        and self._observer_thread.is_alive()
    )
  def _run_observer(self) -> None:
    """
    Internal method to run the observer in a separate thread.
    This method should not be called directly.

    Starts the observer, blocks until the stop event is set, then waits
    for the observer to finish.
    """
    # Work on a local reference: stop() may reset self.observer to None
    # while this thread is still shutting down.
    observer = self.observer
    if not observer:
      logger.error("Observer not initialized")
      return
    try:
      observer.start()
      logger.info("Observer thread started")
      # Keep the observer running until stop is requested
      while not self._stop_event.is_set():
        self._stop_event.wait(timeout=1.0)
      logger.info("Observer thread stopping...")
    except Exception as e:
      logger.error(f"Observer thread error: {str(e)}")
      # Nobody else will stop the observer on this path; without this the
      # join below could block forever.
      if observer.is_alive():
        observer.stop()
    finally:
      # Joining an observer that never started raises RuntimeError, so only
      # join when its thread is actually alive.
      if observer.is_alive():
        observer.join()
      logger.info("Observer thread stopped")
 def create_file_watcher(
    watch_directory: str,
    document_service: DocumentService,
    job_service: JobService
 ) -> FileWatcher:
  """
  Build a FileWatcher for the given directory and services.

  The watcher is only constructed here, not started; `recursive` is left
  at the FileWatcher default.
  Args:
      watch_directory: Directory path to monitor
      document_service: Service for document management
      job_service: Service for processing job management
  Returns:
      Configured FileWatcher instance
  """
  watcher_options = {
    "watch_directory": watch_directory,
    "document_service": document_service,
    "job_service": job_service,
  }
  watcher = FileWatcher(**watcher_options)
  return watcher
--- a/src/file-processor/app/main.py
+++ b/src/file-processor/app/main.py
@@ -1,203 +1,169 @@
 """
-FastAPI application for MyDocManager file processor service.
+FastAPI application with integrated FileWatcher for document processing.
-This service provides API endpoints for health checks and task dispatching.
+This module provides the main FastAPI application with:
 - JWT authentication
 - User management APIs
 - Real-time file monitoring via FileWatcher
 - Document processing via Celery tasks
 """
 import logging
 import os
 from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException, Depends
+from typing import AsyncGenerator
 from pydantic import BaseModel
 import redis
 from celery import Celery
-from app.database.connection import test_database_connection, get_database
+from fastapi import FastAPI
-from app.database.repositories.user_repository import UserRepository
+from fastapi.middleware.cors import CORSMiddleware
-from app.models.user import UserCreate
+
 from app.api.routes.auth import router as auth_router
 from app.api.routes.users import router as users_router
 from app.config import settings
 from app.database.connection import get_database
 from app.file_watcher import create_file_watcher, FileWatcher
 from app.services.document_service import DocumentService
 from app.services.init_service import InitializationService
 from app.services.job_service import JobService
 from app.services.user_service import UserService
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Global file watcher instance
 file_watcher: FileWatcher = None
@asynccontextmanager
-async def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
  """
-  Application lifespan manager for startup and shutdown tasks.
+  FastAPI lifespan context manager.
-
+  
-  Handles initialization tasks that need to run when the application starts,
+  Handles application startup and shutdown events including:
-  including admin user creation and other setup procedures.
+  - Database connection
  - Default admin user creation
  - FileWatcher startup/shutdown
  """
-  # Startup tasks
+  global file_watcher
  # Startup
  logger.info("Starting MyDocManager application...")
  try:
    # Initialize database connection
    database = get_database()
    logger.info("Database connection established")
-    # Initialize repositories and services
+    document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
-    user_repository = UserRepository(database)
+    job_service = JobService(database=database)
-    user_service = UserService(user_repository)
+    user_service = UserService(database=database)
    logger.info("Service created")
    # Create default admin user
    init_service = InitializationService(user_service)
    init_service.initialize_application()
    logger.info("Default admin user initialization completed")
-    # Run initialization tasks
+    # Create and start file watcher
-    initialization_result = init_service.initialize_application()
+    file_watcher = create_file_watcher(
      watch_directory=settings.watch_directory(),
      document_service=document_service,
      job_service=job_service
    )
    file_watcher.start()
    logger.info(f"FileWatcher started for directory: {settings.watch_directory()}")
-    if initialization_result["initialization_success"]:
+    logger.info("Application startup completed successfully")
-      logger.info("Application startup completed successfully")
+    
-      if initialization_result["admin_user_created"]:
+    yield
        logger.info("Default admin user was created during startup")
    else:
      logger.error("Application startup completed with errors:")
      for error in initialization_result["errors"]:
        logger.error(f"  - {error}")
  except Exception as e:
-    logger.error(f"Critical error during application startup: {str(e)}")
+    logger.error(f"Application startup failed: {str(e)}")
-    # You might want to decide if the app should continue or exit here
+    raise
    # For now, we log the error but continue
-  yield  # Application is running
+  finally:
-  
+    # Shutdown
-  # Shutdown tasks (if needed)
+    logger.info("Shutting down MyDocManager application...")
-  logger.info("Shutting down MyDocManager application...")
+    
    if file_watcher and file_watcher.is_running():
      file_watcher.stop()
      logger.info("FileWatcher stopped")
    logger.info("Application shutdown completed")
-# Initialize FastAPI app
+# Create FastAPI application
 app = FastAPI(
-  title="MyDocManager File Processor",
+  title="MyDocManager",
-  description="File processing and task dispatch service",
+  description="Real-time document processing application with authentication",
-  version="1.0.0",
+  version="0.1.0",
  lifespan=lifespan
 )
-# Environment variables
+# Configure CORS
-REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
+app.add_middleware(
-MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
+  CORSMiddleware,
-
+  allow_origins=["http://localhost:5173"],  # React frontend
-# Initialize Redis client
+  allow_credentials=True,
-try:
+  allow_methods=["*"],
-  redis_client = redis.from_url(REDIS_URL)
+  allow_headers=["*"],
 except Exception as e:
  redis_client = None
  print(f"Warning: Could not connect to Redis: {e}")
 # Initialize Celery
 celery_app = Celery(
  "file_processor",
  broker=REDIS_URL,
  backend=REDIS_URL
 )
 # Include routers
 app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
 app.include_router(users_router, prefix="/users", tags=["User Management"])
 # app.include_router(documents_router, prefix="/documents", tags=["Documents"])
 # app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])
 # Pydantic models
 class TestTaskRequest(BaseModel):
  """Request model for test task."""
  message: str
 def get_user_service() -> UserService:
  """
  Dependency to get user service instance.
  This should be properly implemented with database connection management
  in your actual application.
  """
  database = get_database()
  user_repository = UserRepository(database)
  return UserService(user_repository)
 # Your API routes would use the service like this:
@app.post("/api/users")
 async def create_user(
    user_data: UserCreate,
    user_service: UserService = Depends(get_user_service)
 ):
  return user_service.create_user(user_data)
@app.get("/health")
 async def health_check():
  """
  Health check endpoint.
-
+  
  Returns:
-      dict: Service health status with dependencies
+      Dictionary containing application health status
  """
-  health_status = {
+  return {
      "status": "healthy",
-      "service": "file-processor",
+      "service": "MyDocManager",
-      "dependencies": {
+      "version": "1.0.0",
-          "redis": "unknown",
+      "file_watcher_running": file_watcher.is_running() if file_watcher else False
          "mongodb": "unknown"
      },
  }
  # Check Redis connection
  if redis_client:
    try:
      redis_client.ping()
      health_status["dependencies"]["redis"] = "connected"
    except Exception:
      health_status["dependencies"]["redis"] = "disconnected"
      health_status["status"] = "degraded"
  # check MongoDB connection
  if test_database_connection():
    health_status["dependencies"]["mongodb"] = "connected"
  else:
    health_status["dependencies"]["mongodb"] = "disconnected"
  return health_status
@app.post("/test-task")
 async def dispatch_test_task(request: TestTaskRequest):
  """
  Dispatch a test task to Celery worker.
  Args:
      request: Test task request containing message
  Returns:
      dict: Task dispatch information
  Raises:
      HTTPException: If task dispatch fails
  """
  try:
    # Send task to worker
    task = celery_app.send_task(
      "main.test_task",
      args=[request.message]
    )
    return {
        "status": "dispatched",
        "task_id": task.id,
        "message": f"Test task dispatched with message: {request.message}"
    }
  except Exception as e:
    raise HTTPException(
      status_code=500,
      detail=f"Failed to dispatch task: {str(e)}"
    )
@app.get("/")
 async def root():
  """
-  Root endpoint.
+  Root endpoint with basic application information.
-
+  
  Returns:
-      dict: Basic service information
+      Dictionary containing welcome message and available endpoints
  """
  return {
-      "service": "MyDocManager File Processor",
+      "message": "Welcome to MyDocManager",
-      "version": "1.0.0",
+      "description": "Real-time document processing application",
-      "status": "running"
+      "docs": "/docs",
      "health": "/health"
  }
@app.get("/watcher/status")
 async def watcher_status():
  """
  Get file watcher status.
  Returns:
      Dictionary containing file watcher status information
  """
  if not file_watcher:
    return {
        "status": "not_initialized",
        "running": False
    }
  return {
      "status": "initialized",
      "running": file_watcher.is_running(),
      "watch_directory": str(file_watcher.watch_directory),
      "recursive": file_watcher.recursive
  }
--- a/src/file-processor/app/models/auth.py
+++ b/src/file-processor/app/models/auth.py
@@ -3,12 +3,45 @@ Authentication models and enums for user management.
 Contains user roles enumeration and authentication-related Pydantic models.
 """
-
+from datetime import datetime
 from enum import Enum
 from pydantic import BaseModel, Field
 from app.models.types import PyObjectId
 class UserRole(str, Enum):
  """User roles enumeration with string values."""
  USER = "user"
-  ADMIN = "admin"
+  ADMIN = "admin"
 class UserResponse(BaseModel):
  """
  Model for user data in API responses (excludes password_hash).

  Declares only the public user fields; no password-related field is part
  of this model.
  """
  # Exposed as `id` on the model, read from / written as `_id` via the alias.
  id: PyObjectId = Field(alias="_id")
  username: str
  email: str
  role: UserRole
  is_active: bool
  created_at: datetime
  updated_at: datetime
  # populate_by_name: accept the field name `id` as well as the alias `_id`.
  # arbitrary_types_allowed: permit non-pydantic field types
  # (presumably needed for PyObjectId -- TODO confirm).
  model_config = {
      "populate_by_name": True,
      "arbitrary_types_allowed": True,
  }
 class LoginResponse(BaseModel):
  """
  Response model for successful login.

  Bundles the issued access token, its type and the user it belongs to.
  """
  access_token: str
  # Defaults to "bearer" when the caller does not set it.
  token_type: str = "bearer"
  user: UserResponse
 class MessageResponse(BaseModel):
  """Generic message response carrying a single `message` string."""
  message: str
--- a/src/file-processor/app/models/document.py
+++ b/src/file-processor/app/models/document.py
@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
  HYBRID = "hybrid"
 class ProcessingStatus(str, Enum):
  """Status values for processing jobs."""
  PENDING = "pending"
  PROCESSING = "processing"
  COMPLETED = "completed"
  FAILED = "failed"
 class FileDocument(BaseModel):
  """
  Model for file documents stored in the 'files' collection.
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
  metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
  detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
  file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
  encoding: str = Field(default="utf-8", description="Character encoding for text files")
  file_size: int = Field(..., ge=0, description="File size in bytes")
  mime_type: str = Field(..., description="MIME type detected")
  @field_validator('filepath')
  @classmethod
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
    if not v.strip():
      raise ValueError("Filename cannot be empty")
    return v.strip()
  class Config:
    """Pydantic configuration."""
    populate_by_name = True
    arbitrary_types_allowed = True
    json_encoders = {ObjectId: str}
 class DocumentContent(BaseModel):
  """Model for document content."""
  id: Optional[PyObjectId] = Field(default=None, alias="_id")
  file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
  content: str = Field(..., description="File content")
  encoding: str = Field(default="utf-8", description="Character encoding for text files")
  file_size: int = Field(..., ge=0, description="File size in bytes")
  mime_type: str = Field(..., description="MIME type detected")
 class ProcessingJob(BaseModel):
  """
  Model for processing jobs stored in the 'processing_jobs' collection.
  Tracks the lifecycle and status of document processing tasks.
  """
  id: Optional[PyObjectId] = Field(default=None, alias="_id")
  file_id: PyObjectId = Field(..., description="Reference to file document")
  status: ProcessingStatus = Field(
    default=ProcessingStatus.PENDING,
    description="Current processing status"
  )
  task_id: Optional[str] = Field(
    default=None,
    description="Celery task UUID"
  )
  created_at: Optional[datetime] = Field(
    default=None,
    description="Timestamp when job was created"
  )
  started_at: Optional[datetime] = Field(
    default=None,
    description="Timestamp when processing started"
  )
  completed_at: Optional[datetime] = Field(
    default=None,
    description="Timestamp when processing completed"
  )
  error_message: Optional[str] = Field(
    default=None,
    description="Error message if processing failed"
  )
  @field_validator('error_message')
  @classmethod
  def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
    """Clean up error message."""
    if v is not None:
      return v.strip() if v.strip() else None
    return v
  class Config:
    """Pydantic configuration."""
    populate_by_name = True
    arbitrary_types_allowed = True
    json_encoders = {ObjectId: str}
--- a/src/file-processor/app/models/job.py
+++ b/src/file-processor/app/models/job.py
@@ -0,0 +1,42 @@
 from datetime import datetime
 from enum import Enum
 from typing import Optional
 from bson import ObjectId
 from pydantic import BaseModel, Field, field_validator
 from app.models.types import PyObjectId
 class ProcessingStatus(str, Enum):
  """
  Status values for processing jobs.

  Mixes in `str`, so each member is also a string equal to its value.
  """
  PENDING = "pending"
  PROCESSING = "processing"
  COMPLETED = "completed"
  FAILED = "failed"
 class ProcessingJob(BaseModel):
  """
  Processing job record kept in the 'processing_jobs' collection.

  Follows one document processing task: which document it belongs to, its
  current status, the Celery task id, lifecycle timestamps and, on
  failure, an error message.
  """
  id: Optional[PyObjectId] = Field(default=None, alias="_id")
  document_id: PyObjectId = Field(..., description="Reference to file document")
  status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
  task_id: Optional[str] = Field(default=None, description="Celery task UUID")
  created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
  started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
  completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
  error_message: Optional[str] = Field(default=None, description="Error message if processing failed")

  @field_validator('error_message')
  @classmethod
  def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
    """Strip surrounding whitespace; a blank message becomes None."""
    if v is None:
      return None
    stripped = v.strip()
    return stripped or None
--- a/src/file-processor/app/models/user.py
+++ b/src/file-processor/app/models/user.py
@@ -7,10 +7,10 @@ and API responses with proper validation and type safety.
 import re
 from datetime import datetime
-from typing import Optional, Any
+from typing import Optional
 from bson import ObjectId
 from pydantic import BaseModel, Field, field_validator, EmailStr
 from pydantic_core import core_schema
 from app.models.auth import UserRole
 from app.models.types import PyObjectId
@@ -138,21 +138,3 @@ class UserInDB(BaseModel):
      "arbitrary_types_allowed": True,
      "json_encoders": {ObjectId: str}
  }
 class UserResponse(BaseModel):
  """Model for user data in API responses (excludes password_hash)."""
  id: PyObjectId = Field(alias="_id")
  username: str
  email: str
  role: UserRole
  is_active: bool
  created_at: datetime
  updated_at: datetime
  model_config = {
      "populate_by_name": True,
      "arbitrary_types_allowed": True,
      "json_encoders": {ObjectId: str}
  }
--- a/src/file-processor/app/services/auth_service.py
+++ b/src/file-processor/app/services/auth_service.py
@@ -4,7 +4,11 @@ Authentication service for password hashing and verification.
 This module provides authentication-related functionality including
 password hashing, verification, and JWT token management.
 """
 from datetime import datetime, timedelta
 import jwt
 from app.config import settings
 from app.utils.security import hash_password, verify_password
@@ -55,4 +59,26 @@ class AuthService:
        >>> auth.verify_user_password("wrongpassword", hashed)
        False
    """
-    return verify_password(password, hashed_password)
+    return verify_password(password, hashed_password)
  @staticmethod
  def create_access_token(data: dict) -> str:
    """
    Create a signed JWT access token.

    Args:
        data (dict): Payload data to include in the token. The mapping is
            copied, so the caller's dict is not modified.

    Returns:
        str: Encoded JWT token.
    """
    # Function-scope import: the module only imports datetime and timedelta.
    from datetime import timezone
    # Copy data to avoid modifying the original dict
    to_encode = data.copy()
    # The expiry must be an aware UTC datetime: a naive local time would be
    # interpreted as UTC when encoded, shifting the expiry by the server's
    # UTC offset.
    expire = datetime.now(timezone.utc) + timedelta(hours=settings.get_jwt_expire_hours())
    to_encode["exp"] = expire
    # Encode JWT
    return jwt.encode(to_encode, settings.get_jwt_secret_key(), algorithm=settings.get_jwt_algorithm())
--- a/src/file-processor/app/services/document_service.py
+++ b/src/file-processor/app/services/document_service.py
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
 """
 import hashlib
-import magic
+import os
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List, Optional, Dict, Any
-from motor.motor_asyncio import AsyncIOMotorClientSession
+import magic
 from pymongo.errors import PyMongoError
-from app.database.connection import get_database
+from app.config.settings import get_objects_folder
 from app.database.repositories.document_repository import FileDocumentRepository
 from app.database.repositories.document_content_repository import DocumentContentRepository
 from app.models.document import (
  FileDocument,
  DocumentContent,
  FileType,
  ProcessingStatus
 )
 from app.models.types import PyObjectId
@@ -34,13 +31,25 @@ class DocumentService:
  and their content while ensuring data consistency through transactions.
  """
-  def __init__(self):
+  def __init__(self, database, objects_folder: str = None):
-    """Initialize the document service with repository dependencies."""
+    """
-    self.db = get_database()
+    Initialize the document service with repository dependencies.
-    self.file_repository = FileDocumentRepository(self.db)
+    
-    self.content_repository = DocumentContentRepository(self.db)
+    Args:
        database: Database instance
        objects_folder: folder to store files by their hash
    """
    self.db = database
    self.document_repository = FileDocumentRepository(self.db)
    self.objects_folder = objects_folder or get_objects_folder()
-  def _calculate_file_hash(self, file_bytes: bytes) -> str:
+  def initialize(self):
    self.document_repository.initialize()
    return self
  @staticmethod
  def _calculate_file_hash(file_bytes: bytes) -> str:
    """
    Calculate SHA256 hash of file content.
@@ -52,7 +61,8 @@ class DocumentService:
    """
    return hashlib.sha256(file_bytes).hexdigest()
-  def _detect_file_type(self, file_path: str) -> FileType:
+  @staticmethod
  def _detect_file_type(file_path: str) -> FileType:
    """
    Detect file type from file extension.
@@ -72,7 +82,8 @@ class DocumentService:
    except ValueError:
      raise ValueError(f"Unsupported file type: {extension}")
-  def _detect_mime_type(self, file_bytes: bytes) -> str:
+  @staticmethod
  def _detect_mime_type(file_bytes: bytes) -> str:
    """
    Detect MIME type from file content.
@@ -84,10 +95,51 @@ class DocumentService:
    """
    return magic.from_buffer(file_bytes, mime=True)
-  async def create_document(
+  @staticmethod
  def _read_file_bytes(file_path: str | Path) -> bytes:
    """
    Read the whole file content as bytes (synchronous, blocking read).
    Args:
        file_path (str | Path): Path of the file to read
    Returns:
        bytes: Content of the file
    Raises:
        FileNotFoundError: If the file does not exist
        OSError: If any I/O error occurs
    """
    path = Path(file_path)
    # Explicit check so the error message carries the path exactly as given.
    if not path.exists():
      raise FileNotFoundError(f"File not found: {file_path}")
    return path.read_bytes()
  def _get_document_path(self, file_hash):
    """
    :param file_hash:
    :return:
    """
    return os.path.join(self.objects_folder, file_hash[:24], file_hash)
  def save_content_if_needed(self, file_hash, content: bytes):
    target_path = self._get_document_path(file_hash)
    if os.path.exists(target_path):
      return
    if not os.path.exists(os.path.dirname(target_path)):
      os.makedirs(os.path.dirname(target_path))
    with open(target_path, "wb") as f:
      f.write(content)
  def create_document(
      self,
      file_path: str,
-      file_bytes: bytes,
+      file_bytes: bytes | None = None,
      encoding: str = "utf-8"
  ) -> FileDocument:
    """
@@ -110,57 +162,40 @@ class DocumentService:
        PyMongoError: If database operation fails
    """
    # Calculate automatic attributes
    file_bytes = file_bytes if file_bytes is not None else self._read_file_bytes(file_path)
    file_hash = self._calculate_file_hash(file_bytes)
    file_type = self._detect_file_type(file_path)
    mime_type = self._detect_mime_type(file_bytes)
    file_size = len(file_bytes)
    filename = Path(file_path).name
-    detected_at = datetime.utcnow()
+    detected_at = datetime.now()
-    # Start MongoDB transaction
+    try:
-    async with await self.db.client.start_session() as session:
+      self.save_content_if_needed(file_hash, file_bytes)
-      async with session.start_transaction():
+      
-        try:
+      # Create FileDocument
-          # Check if content already exists
+      file_data = FileDocument(
-          existing_content = await self.content_repository.find_document_content_by_file_hash(
+        filename=filename,
-            file_hash, session=session
+        filepath=file_path,
-          )
+        file_type=file_type,
-          
+        extraction_method=None,  # Will be set by processing workers
-          # Create DocumentContent if it doesn't exist
+        metadata={},  # Empty for now
-          if not existing_content:
+        detected_at=detected_at,
-            content_data = DocumentContent(
+        file_hash=file_hash,
-              file_hash=file_hash,
+        encoding=encoding,
-              content="",  # Will be populated by processing workers
+        file_size=file_size,
-              encoding=encoding,
+        mime_type=mime_type
-              file_size=file_size,
+      )
-              mime_type=mime_type
+      
-            )
+      created_file = self.document_repository.create_document(file_data)
-            await self.content_repository.create_document_content(
+      
-              content_data, session=session
+      return created_file
-            )
+    
-          
+    except Exception as e:
-          # Create FileDocument
+      # Transaction will automatically rollback if supported
-          file_data = FileDocument(
+      raise PyMongoError(f"Failed to create document: {str(e)}")
            filename=filename,
            filepath=file_path,
            file_type=file_type,
            extraction_method=None,  # Will be set by processing workers
            metadata={},  # Empty for now
            detected_at=detected_at,
            file_hash=file_hash
          )
          created_file = await self.file_repository.create_document(
            file_data, session=session
          )
          return created_file
        except Exception as e:
          # Transaction will automatically rollback
          raise PyMongoError(f"Failed to create document: {str(e)}")
-  async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
+  def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
    """
    Retrieve a document by its ID.
@@ -170,9 +205,9 @@ class DocumentService:
    Returns:
        FileDocument if found, None otherwise
    """
-    return await self.file_repository.find_document_by_id(document_id)
+    return self.document_repository.find_document_by_id(str(document_id))
-  async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
+  def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
    """
    Retrieve a document by its file hash.
@@ -182,9 +217,9 @@ class DocumentService:
    Returns:
        FileDocument if found, None otherwise
    """
-    return await self.file_repository.find_document_by_hash(file_hash)
+    return self.document_repository.find_document_by_hash(file_hash)
-  async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
+  def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
    """
    Retrieve a document by its file path.
@@ -194,34 +229,17 @@ class DocumentService:
    Returns:
        FileDocument if found, None otherwise
    """
-    return await self.file_repository.find_document_by_filepath(filepath)
+    return self.document_repository.find_document_by_filepath(filepath)
-  async def get_document_with_content(
+  def get_document_content_by_hash(self, file_hash):
-      self,
+    target_path = self._get_document_path(file_hash)
-      document_id: PyObjectId
+    if not os.path.exists(target_path):
  ) -> Optional[Tuple[FileDocument, DocumentContent]]:
    """
    Retrieve a document with its associated content.
    Args:
        document_id: Document ObjectId
    Returns:
        Tuple of (FileDocument, DocumentContent) if found, None otherwise
    """
    document = await self.get_document_by_id(document_id)
    if not document:
      return None
-    content = await self.content_repository.find_document_content_by_file_hash(
+    with open(target_path, "rb") as f:
-      document.file_hash
+      return f.read()
    )
    if not content:
      return None
    return (document, content)
-  async def list_documents(
+  def list_documents(
      self,
      skip: int = 0,
      limit: int = 100
@@ -236,18 +254,18 @@ class DocumentService:
    Returns:
        List of FileDocument instances
    """
-    return await self.file_repository.list_documents(skip=skip, limit=limit)
+    return self.document_repository.list_documents(skip=skip, limit=limit)
-  async def count_documents(self) -> int:
+  def count_documents(self) -> int:
    """
    Get total number of documents.
    Returns:
        Total document count
    """
-    return await self.file_repository.count_documents()
+    return self.document_repository.count_documents()
-  async def update_document(
+  def update_document(
      self,
      document_id: PyObjectId,
      update_data: Dict[str, Any]
@@ -262,9 +280,14 @@ class DocumentService:
    Returns:
        Updated FileDocument if found, None otherwise
    """
-    return await self.file_repository.update_document(document_id, update_data)
+    if "file_bytes" in update_data:
      file_hash = self._calculate_file_hash(update_data["file_bytes"])
      update_data["file_hash"] = file_hash
      self.save_content_if_needed(file_hash, update_data["file_bytes"])
    return self.document_repository.update_document(document_id, update_data)
-  async def delete_document(self, document_id: PyObjectId) -> bool:
+  def delete_document(self, document_id: PyObjectId) -> bool:
    """
    Delete a document and its orphaned content.
@@ -281,100 +304,31 @@ class DocumentService:
    Raises:
        PyMongoError: If database operation fails
    """
-    # Start MongoDB transaction
+    # Start transaction
-    async with await self.db.client.start_session() as session:
+    
-      async with session.start_transaction():
+    try:
      # Get document to find its hash
      document = self.document_repository.find_document_by_id(document_id)
      if not document:
        return False
      # Delete the document
      deleted = self.document_repository.delete_document(document_id)
      if not deleted:
        return False
      # Check if content is orphaned
      remaining_files = self.document_repository.find_document_by_hash(document.file_hash)
      # If no other files reference this content, delete it
      if not remaining_files:
        try:
-          # Get document to find its hash
+          os.remove(self._get_document_path(document.file_hash))
-          document = await self.file_repository.find_document_by_id(
+        except Exception:
-            document_id, session=session
+          pass
-          )
+      
-          if not document:
+      return True
            return False
          # Delete the document
          deleted = await self.file_repository.delete_document(
            document_id, session=session
          )
          if not deleted:
            return False
          # Check if content is orphaned
          remaining_files = await self.file_repository.find_document_by_hash(
            document.file_hash, session=session
          )
          # If no other files reference this content, delete it
          if not remaining_files:
            content = await self.content_repository.find_document_content_by_file_hash(
              document.file_hash, session=session
            )
            if content:
              await self.content_repository.delete_document_content(
                content.id, session=session
              )
          return True
        except Exception as e:
          # Transaction will automatically rollback
          raise PyMongoError(f"Failed to delete document: {str(e)}")
  async def content_exists(self, file_hash: str) -> bool:
    """
    Check if content with given hash exists.
    Args:
        file_hash: SHA256 hash of file content
    Returns:
        True if content exists, False otherwise
    """
    return await self.content_repository.content_exists(file_hash)
  async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
    """
    Retrieve content by file hash.
    Args:
        file_hash: SHA256 hash of file content
    Returns:
        DocumentContent if found, None otherwise
    """
    return await self.content_repository.find_document_content_by_file_hash(file_hash)
  async def update_document_content(
      self,
      file_hash: str,
      content: str,
      encoding: str = "utf-8"
  ) -> Optional[DocumentContent]:
    """
    Update the extracted content for a document.
    This method is typically called by processing workers to store
    the extracted text content.
    Args:
        file_hash: SHA256 hash of file content
        content: Extracted text content
        encoding: Character encoding
    Returns:
        Updated DocumentContent if found, None otherwise
    """
    existing_content = await self.content_repository.find_document_content_by_file_hash(
      file_hash
    )
    if not existing_content:
      return None
-    update_data = {
+    except Exception as e:
-        "content": content,
+      # Transaction will automatically rollback if supported
-        "encoding": encoding
+      raise PyMongoError(f"Failed to delete document: {str(e)}")
    }
    return await self.content_repository.update_document_content(
      existing_content.id, update_data
    )
--- a/src/file-processor/app/services/init_service.py
+++ b/src/file-processor/app/services/init_service.py
@@ -8,8 +8,8 @@ creating default admin user if none exists.
 import logging
 from typing import Optional
 from app.models.user import UserCreate, UserInDB, UserCreateNoValidation
 from app.models.auth import UserRole
 from app.models.user import UserInDB, UserCreateNoValidation
 from app.services.user_service import UserService
 logger = logging.getLogger(__name__)
@@ -31,7 +31,6 @@ class InitializationService:
        user_service (UserService): Service for user operations
    """
    self.user_service = user_service
  def ensure_admin_user_exists(self) -> Optional[UserInDB]:
    """
@@ -131,4 +130,23 @@ class InitializationService:
      logger.error(error_msg)
      initialization_summary["errors"].append(error_msg)
-    return initialization_summary
+    self.log_initialization_result(initialization_summary)
    return initialization_summary
  @staticmethod
  def log_initialization_result(summary: dict) -> None:
    """
    Write the outcome of the initialization run to the module logger.
    Args:
        summary (dict): Initialization summary; the keys read here are
            "initialization_success", "admin_user_created" and "errors"
    """
    # Failure path first: report the header line, then one line per error
    if not summary["initialization_success"]:
      logger.error("Application startup completed with errors:")
      for error in summary["errors"]:
        logger.error(f"  - {error}")
      return
    logger.info("Application startup completed successfully")
    if summary["admin_user_created"]:
      logger.info("Default admin user was created during startup")
--- a/src/file-processor/app/services/job_service.py
+++ b/src/file-processor/app/services/job_service.py
@@ -0,0 +1,182 @@
 """
 Service layer for job processing business logic.
 This module provides high-level operations for managing processing jobs
 with strict status transition validation and business rules enforcement.
 """
 from typing import Optional
 from app.database.repositories.job_repository import JobRepository
 from app.exceptions.job_exceptions import InvalidStatusTransitionError
 from app.models.job import ProcessingJob, ProcessingStatus
 from app.models.types import PyObjectId
 class JobService:
  """
  Service for processing job business logic operations.
  Provides high-level job management with strict status transition
  validation and business rule enforcement.
  """
  def __init__(self, database):
    """
    Initialize the service and build its job repository.
    Args:
        database: Database handle passed to JobRepository
    """
    self.db = database
    self.repository = JobRepository(database)
  def initialize(self):
    """
    Initialize the underlying job repository.
    Returns:
        This service instance, so the call can be chained
    """
    self.repository.initialize()
    return self
  def _require_status(
      self,
      job_id: PyObjectId,
      expected: ProcessingStatus,
      target: ProcessingStatus
  ) -> None:
    """
    Check that a job is currently in the status a transition starts from.
    Args:
        job_id: The job ObjectId
        expected: Status the job must currently have
        target: Status the caller wants to move the job to
    Raises:
        InvalidStatusTransitionError: If the job is not in the expected status
    """
    # Read the stored job so the check uses its persisted status
    current_job = self.repository.find_job_by_id(job_id)
    if current_job.status != expected:
      raise InvalidStatusTransitionError(current_job.status, target)
  def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
    """
    Create a new processing job.
    Args:
        document_id: Reference to the file document
        task_id: Optional Celery task UUID
    Returns:
        The created ProcessingJob
    Raises:
        JobRepositoryError: If database operation fails
    """
    return self.repository.create_job(document_id, task_id)
  def get_job_by_id(self, job_id: PyObjectId) -> ProcessingJob:
    """
    Retrieve a job by its ID.
    Args:
        job_id: The job ObjectId
    Returns:
        The ProcessingJob document
    Raises:
        JobNotFoundError: If job doesn't exist
        JobRepositoryError: If database operation fails
    """
    return self.repository.find_job_by_id(job_id)
  def mark_job_as_started(self, job_id: PyObjectId) -> ProcessingJob:
    """
    Mark a job as started (PENDING → PROCESSING).
    Args:
        job_id: The job ObjectId
    Returns:
        The updated ProcessingJob
    Raises:
        JobNotFoundError: If job doesn't exist
        InvalidStatusTransitionError: If job is not in PENDING status
        JobRepositoryError: If database operation fails
    """
    self._require_status(job_id, ProcessingStatus.PENDING, ProcessingStatus.PROCESSING)
    return self.repository.update_job_status(job_id, ProcessingStatus.PROCESSING)
  def mark_job_as_completed(self, job_id: PyObjectId) -> ProcessingJob:
    """
    Mark a job as completed (PROCESSING → COMPLETED).
    Args:
        job_id: The job ObjectId
    Returns:
        The updated ProcessingJob
    Raises:
        JobNotFoundError: If job doesn't exist
        InvalidStatusTransitionError: If job is not in PROCESSING status
        JobRepositoryError: If database operation fails
    """
    self._require_status(job_id, ProcessingStatus.PROCESSING, ProcessingStatus.COMPLETED)
    return self.repository.update_job_status(job_id, ProcessingStatus.COMPLETED)
  def mark_job_as_failed(
      self,
      job_id: PyObjectId,
      error_message: Optional[str] = None
  ) -> ProcessingJob:
    """
    Mark a job as failed (PROCESSING → FAILED).
    Args:
        job_id: The job ObjectId
        error_message: Optional error description
    Returns:
        The updated ProcessingJob
    Raises:
        JobNotFoundError: If job doesn't exist
        InvalidStatusTransitionError: If job is not in PROCESSING status
        JobRepositoryError: If database operation fails
    """
    self._require_status(job_id, ProcessingStatus.PROCESSING, ProcessingStatus.FAILED)
    # The error message is stored together with the FAILED status
    return self.repository.update_job_status(
      job_id,
      ProcessingStatus.FAILED,
      error_message
    )
  def delete_job(self, job_id: PyObjectId) -> bool:
    """
    Delete a job from the database.
    Args:
        job_id: The job ObjectId
    Returns:
        True if job was deleted, False if not found
    Raises:
        JobRepositoryError: If database operation fails
    """
    return self.repository.delete_job(job_id)
  def get_jobs_by_status(self, status: ProcessingStatus) -> list[ProcessingJob]:
    """
    Retrieve all jobs with a specific status.
    Args:
        status: The processing status to filter by
    Returns:
        List of ProcessingJob documents
    Raises:
        JobRepositoryError: If database operation fails
    """
    return self.repository.get_jobs_by_status(status)
--- a/src/file-processor/app/services/user_service.py
+++ b/src/file-processor/app/services/user_service.py
@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
 """
 from typing import Optional, List
 from pymongo.errors import DuplicateKeyError
 from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
 from app.models.auth import UserRole
 from app.database.repositories.user_repository import UserRepository
 from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
 from app.services.auth_service import AuthService
@@ -22,16 +22,21 @@ class UserService:
  authentication, and data management with proper validation.
  """
-  def __init__(self, user_repository: UserRepository):
+  def __init__(self, database):
    """
    Initialize user service and build its repository from the database.
    Args:
        database: Database handle passed to UserRepository
    """
-    self.user_repository = user_repository
+    self.db = database
    self.user_repository = UserRepository(self.db)
    self.auth_service = AuthService()
  def initialize(self):
    """
    Initialize the underlying user repository.
    Returns:
        This service instance, so the call can be chained
    """
    self.user_repository.initialize()
    return self
  def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
    """
    Create a new user with business logic validation.
--- a/src/file-processor/requirements.txt
+++ b/src/file-processor/requirements.txt
@@ -1,11 +1,14 @@
 asgiref==3.9.1
 bcrypt==4.3.0
 celery==5.5.3
 email-validator==2.3.0
 fastapi==0.116.1
 httptools==0.6.4
 motor==3.7.1
 pymongo==4.15.0
 pydantic==2.11.9
 PyJWT==2.10.1
 pymongo==4.15.0
 redis==6.4.0
 uvicorn==0.35.0
-python-magic==0.4.27
+python-magic==0.4.27
 watchdog==6.0.0
--- a/src/frontend/.dockerignore
+++ b/src/frontend/.dockerignore
@@ -0,0 +1,41 @@
 # Dependencies
 node_modules
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
 # Build outputs
 dist
 build
 # Environment files
 .env.local
 .env.development.local
 .env.test.local
 .env.production.local
 # IDE files
 .vscode
 .idea
 *.swp
 *.swo
 # OS generated files
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 # Git
 .git
 .gitignore
 # Docker
 Dockerfile
 .dockerignore
 # Logs
 *.log
--- a/src/frontend/Dockerfile
+++ b/src/frontend/Dockerfile
@@ -0,0 +1,20 @@
 # Use Node.js 20 Alpine for lightweight container
 FROM node:20-alpine
 # Set working directory
 WORKDIR /app
 # Copy package.json and package-lock.json (if available)
 COPY package*.json ./
 # Install dependencies
 RUN npm install
 # Copy source code
 COPY . .
 # Expose Vite default port
 EXPOSE 5173
 # Start development server with host 0.0.0.0 to accept external connections
 CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"]
--- a/src/worker/Dockerfile
+++ b/src/worker/Dockerfile
@@ -3,12 +3,18 @@ FROM python:3.12-slim
 # Set working directory
 WORKDIR /app
 # Install libmagic
 RUN apt-get update && apt-get install -y --no-install-recommends \
    libmagic1 \
    file \
 && rm -rf /var/lib/apt/lists/*
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application code
-COPY tasks/ .
+COPY . .
 # Command will be overridden by docker-compose
-CMD ["celery", "-A", "main", "worker", "--loglevel=info"]
+CMD ["celery", "-A", "main", "worker", "--loglevel=info"]
--- a/src/worker/requirements.txt
+++ b/src/worker/requirements.txt
@@ -1,4 +1,13 @@
-
+asgiref==3.9.1
 bcrypt==4.3.0
 celery==5.5.3
 email-validator==2.3.0
 fastapi==0.116.1
 httptools==0.6.4
 motor==3.7.1
 pymongo==4.15.0
 pydantic==2.11.9
 redis==6.4.0
-pymongo==4.15.0
+uvicorn==0.35.0
 python-magic==0.4.27
 watchdog==6.0.0
--- a/src/worker/tasks/document_processing.py
+++ b/src/worker/tasks/document_processing.py
@@ -0,0 +1,85 @@
 """
 Celery tasks for document processing with ProcessingJob status management.
 This module contains Celery tasks that handle document content extraction
 and update processing job statuses throughout the task lifecycle.
 """
 import logging
 from typing import Any, Dict
 from app.config import settings
 from app.database.connection import get_database
 from app.services.document_service import DocumentService
 from tasks.main import celery_app
 logger = logging.getLogger(__name__)
@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
def process_document(self, filepath: str) -> Dict[str, Any]:
  """
  Register a document file and track the run with a processing job.
  This task:
  1. Inserts the document in the database
  2. Creates a processing job linked to the document and to this task
  3. Marks the job as PROCESSING, then as COMPLETED
  4. On any error, marks the job as FAILED when one was created, then re-raises
  Args:
      self : Celery task instance
      filepath: Full path to the document file to process
  Returns:
      Dictionary with the task id, the file path and the "completed" status
  Raises:
      Exception: Any processing error (will trigger retry)
  """
  task_id = self.request.id
  logger.info(f"Starting document processing task {task_id} for file: {filepath}")
  database = get_database()
  document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
  from app.services.job_service import JobService
  job_service = JobService(database=database)
  # Stays None until the job record exists, so the error path knows whether
  # there is a job to mark as failed
  job = None
  try:
    # Step 1: Insert the document in DB
    # NOTE(review): every automatic retry runs create_document again for the
    # same path - confirm the service tolerates repeated creation
    document = document_service.create_document(filepath)
    logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
    # Step 2: Create a new job record for the document
    job = job_service.create_job(task_id=task_id, document_id=document.id)
    # Step 3: Mark job as started
    job_service.mark_job_as_started(job_id=job.id)
    logger.info(f"Job {task_id} marked as PROCESSING")
    # Step 4: Mark job as completed
    job_service.mark_job_as_completed(job_id=job.id)
    logger.info(f"Job {task_id} marked as COMPLETED")
    return {
        "task_id": task_id,
        "filepath": filepath,
        "status": "completed",
    }
  except Exception as e:
    error_message = f"Document processing failed: {str(e)}"
    # logger.exception keeps the traceback that logger.error would drop
    logger.exception(f"Task {task_id} failed: {error_message}")
    try:
      # Mark job as failed
      if job is not None:
        job_service.mark_job_as_failed(job_id=job.id, error_message=error_message)
        logger.info(f"Job {task_id} marked as FAILED")
      else:
        logger.error(f"Failed to process {filepath}. error = {str(e)}")
    except Exception as job_error:
      logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
    # Re-raise the exception to trigger Celery retry mechanism
    raise
--- a/src/worker/tasks/main.py
+++ b/src/worker/tasks/main.py
@@ -3,9 +3,8 @@ Celery worker for MyDocManager document processing tasks.
 This module contains all Celery tasks for processing documents.
 """
 import os
-import time
+
 from celery import Celery
 # Environment variables
@@ -13,101 +12,25 @@ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
 MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
 # Initialize Celery app
-app = Celery(
+celery_app = Celery(
  "mydocmanager_worker",
  broker=REDIS_URL,
-  backend=REDIS_URL
+  backend=REDIS_URL,
 )
 celery_app.autodiscover_tasks(["tasks.document_processing"])
 # Celery configuration
-app.conf.update(
+celery_app.conf.update(
  task_serializer="json",
  accept_content=["json"],
  result_serializer="json",
  timezone="UTC",
  enable_utc=True,
  task_track_started=True,
-  task_time_limit=300,  # 5 minutes
+  task_time_limit=300,        # 5 minutes
-  task_soft_time_limit=240,  # 4 minutes
+  task_soft_time_limit=240,   # 4 minutes
 )
@app.task(bind=True)
 def test_task(self, message: str):
  """
  Test task for validating worker functionality.
  Args:
      message: Test message to process
  Returns:
      dict: Task result with processing information
  """
  try:
    print(f"[WORKER] Starting test task with message: {message}")
    # Simulate some work
    for i in range(5):
      print(f"[WORKER] Processing step {i + 1}/5...")
      time.sleep(1)
      # Update task progress
      self.update_state(
        state="PROGRESS",
        meta={
            "current": i + 1,
            "total": 5,
            "message": f"Processing step {i + 1}"
        }
      )
    result = {
        "status": "completed",
        "message": f"Successfully processed: {message}",
        "processed_at": time.time(),
        "worker_id": self.request.id
    }
    print(f"[WORKER] Test task completed successfully: {result}")
    return result
  except Exception as exc:
    print(f"[WORKER] Test task failed: {str(exc)}")
    raise self.retry(exc=exc, countdown=60, max_retries=3)
@app.task(bind=True)
 def process_document_task(self, file_path: str):
  """
  Placeholder task for document processing.
  Args:
      file_path: Path to the document to process
  Returns:
      dict: Processing result
  """
  try:
    print(f"[WORKER] Starting document processing for: {file_path}")
    # Placeholder for document processing logic
    time.sleep(2)  # Simulate processing time
    result = {
        "status": "completed",
        "file_path": file_path,
        "processed_at": time.time(),
        "content": f"Placeholder content for {file_path}",
        "worker_id": self.request.id
    }
    print(f"[WORKER] Document processing completed: {file_path}")
    return result
  except Exception as exc:
    print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}")
    raise self.retry(exc=exc, countdown=60, max_retries=3)
 if __name__ == "__main__":
-  app.start()
+  celery_app.start()
--- a/tests/api/init.py
+++ b/tests/api/init.py
--- a/tests/api/test_auth_routes.py
+++ b/tests/api/test_auth_routes.py
@@ -0,0 +1,149 @@
 from datetime import datetime
 from unittest.mock import MagicMock
 import pytest
 from fastapi import status, HTTPException
 from fastapi.testclient import TestClient
 from mongomock.mongo_client import MongoClient
 from app.api.dependencies import get_auth_service, get_user_service, get_current_user
 from app.main import app  # Assuming you have FastAPI app defined in app/main.py
 from app.models.auth import UserRole
 from app.models.types import PyObjectId
 from app.models.user import UserInDB
 from app.services.auth_service import AuthService
 from app.services.user_service import UserService
@pytest.fixture
def client():
  """
  Provide a TestClient bound to the application.
  Tests install dependency overrides on the shared app object, so the
  overrides are removed after each test to keep tests independent.
  """
  test_client = TestClient(app)
  yield test_client
  # Teardown: drop whatever the test registered on the shared app
  test_client.app.dependency_overrides.clear()
@pytest.fixture
def fake_user():
  """Build an active user with the USER role for the auth route tests."""
  user_fields = dict(
    _id=PyObjectId(),
    username="testuser",
    email="test@example.com",
    role=UserRole.USER,
    is_active=True,
    hashed_password="hashed-secret",
    created_at=datetime(2025, 1, 1),
    updated_at=datetime(2025, 1, 2),
  )
  return UserInDB(**user_fields)
 def override_auth_service():
  """Return an AuthService mock that accepts any password and issues a fixed token."""
  auth_mock = MagicMock(spec=AuthService)
  auth_mock.configure_mock(**{
    "verify_user_password.return_value": True,
    "create_access_token.return_value": "fake-jwt-token",
  })
  return auth_mock
 def override_user_service(fake_user):
  """Return a UserService mock whose username lookup always yields fake_user."""
  users_mock = MagicMock(spec=UserService)
  users_mock.configure_mock(**{"get_user_by_username.return_value": fake_user})
  return users_mock
 def override_get_current_user(fake_user):
  """Return a zero-argument dependency that always yields fake_user."""
  return lambda: fake_user
 def override_get_database():
  """Return a dependency that hands out the test database of a fresh mongomock client."""
  return lambda: MongoClient().test_database
 # ---------------------- TESTS FOR /auth/login ----------------------
 class TestLogin:
  """Tests for POST /auth/login with mocked auth and user services."""
  @staticmethod
  def _post_login(client, auth_service, user_service, username, password):
    """Install both service overrides, then submit the login form."""
    client.app.dependency_overrides[get_auth_service] = lambda: auth_service
    client.app.dependency_overrides[get_user_service] = lambda: user_service
    return client.post(
      "/auth/login",
      data={"username": username, "password": password},
    )
  def test_i_can_login_with_valid_credentials(self, client, fake_user):
    """Valid credentials return a token and the user's profile."""
    auth_service = override_auth_service()
    user_service = override_user_service(fake_user)
    response = self._post_login(client, auth_service, user_service, "testuser", "secret")
    assert response.status_code == status.HTTP_200_OK
    payload = response.json()
    assert "access_token" in payload
    assert payload["user"]["username"] == "testuser"
  def test_i_cannot_login_with_invalid_username(self, client):
    """An unknown username is rejected with 401."""
    auth_service = override_auth_service()
    user_service = MagicMock(spec=UserService)
    user_service.get_user_by_username.return_value = None
    response = self._post_login(client, auth_service, user_service, "unknown", "secret")
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
  def test_i_cannot_login_with_inactive_user(self, client, fake_user):
    """A deactivated account is rejected with 401."""
    fake_user.is_active = False
    auth_service = override_auth_service()
    user_service = override_user_service(fake_user)
    response = self._post_login(client, auth_service, user_service, "testuser", "secret")
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
  def test_i_cannot_login_with_wrong_password(self, client, fake_user):
    """A failed password verification is rejected with 401."""
    auth_service = override_auth_service()
    auth_service.verify_user_password.return_value = False
    user_service = override_user_service(fake_user)
    response = self._post_login(client, auth_service, user_service, "testuser", "wrong")
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
 # ---------------------- TESTS FOR /auth/me ----------------------
 class TesteMe:
  """Tests for GET /auth/me."""
  def test_i_can_get_current_user_profile(self, client, fake_user):
    """The profile endpoint returns the username and email of the current user."""
    client.app.dependency_overrides[get_current_user] = override_get_current_user(fake_user)
    response = client.get("/auth/me")
    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["username"] == fake_user.username
    assert data["email"] == fake_user.email
  def test_i_cannot_get_profile_without_authentication(self, client):
    """A current-user dependency that raises 401 produces a 401 response."""
    def raise_http_exception():
      raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)
    client.app.dependency_overrides[get_current_user] = raise_http_exception
    response = client.get("/auth/me")
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
--- a/tests/api/test_users.py
+++ b/tests/api/test_users.py
@@ -0,0 +1,167 @@
 # File: tests/api/test_users.py
 from datetime import datetime
 from unittest.mock import MagicMock
 import pytest
 from fastapi import status
 from fastapi.testclient import TestClient
 from app.api.dependencies import get_admin_user, get_user_service
 from app.main import app
 from app.models.auth import UserRole
 from app.models.types import PyObjectId
 from app.models.user import UserInDB, UserCreate
 from app.services.user_service import UserService
 # -----------------------
 # Fixtures
 # -----------------------
@pytest.fixture
def fake_user_admin():
  """Build an active user with the ADMIN role, used as the caller of the routes."""
  admin_fields = dict(
    _id=PyObjectId(),
    username="admin",
    email="admin@example.com",
    role=UserRole.ADMIN,
    is_active=True,
    hashed_password="hashed-secret",
    created_at=datetime(2025, 1, 1),
    updated_at=datetime(2025, 1, 2),
  )
  return UserInDB(**admin_fields)
@pytest.fixture
def fake_user_response():
  """Build an active user with the USER role, used as the service return value."""
  user_fields = dict(
    _id=PyObjectId(),
    username="other",
    email="other@example.com",
    role=UserRole.USER,
    is_active=True,
    hashed_password="hashed-secret-2",
    created_at=datetime(2025, 1, 1),
    updated_at=datetime(2025, 1, 2),
  )
  return UserInDB(**user_fields)
@pytest.fixture
def client(fake_user_admin):
  """
  Provide a TestClient whose admin and user-service dependencies are faked.
  The UserService mock is exposed as client.user_service_mock so each test
  can program its return values. Overrides are removed after the test so
  they do not leak into other tests that share the application object.
  """
  # Fake user service
  user_service_mock = MagicMock(spec=UserService)
  test_client = TestClient(app)
  test_client.app.dependency_overrides = {
      # Fake admin dependency
      get_admin_user: lambda: fake_user_admin,
      get_user_service: lambda: user_service_mock,
  }
  test_client.user_service_mock = user_service_mock
  yield test_client
  # Teardown: restore the shared app to a state without overrides
  test_client.app.dependency_overrides.clear()
 # -----------------------
 # Tests
 # -----------------------
 class TestListUsers:
  """Tests for GET /users with a mocked user service."""
  def test_i_can_list_users(self, client, fake_user_admin, fake_user_response):
    """Both users returned by the service appear in the response, in order."""
    client.user_service_mock.list_users.return_value = [fake_user_admin, fake_user_response]
    response = client.get("/users")
    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert len(data) == 2
    assert data[0]["username"] == "admin"
  def test_i_can_list_users_when_empty(self, client):
    """An empty service result is returned as an empty JSON list."""
    client.user_service_mock.list_users.return_value = []
    response = client.get("/users")
    assert response.status_code == status.HTTP_200_OK
    assert response.json() == []
 class TestGetUserById:
  """Tests for GET /users/{id} with a mocked user service."""
  def test_i_can_get_user_by_id(self, client, fake_user_response):
    """The user returned by the service is serialized in the response."""
    client.user_service_mock.get_user_by_id.return_value = fake_user_response
    response = client.get(f"/users/{fake_user_response.id}")
    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["username"] == fake_user_response.username
  def test_i_cannot_get_user_by_id_not_found(self, client):
    """A None result from the service is reported as 404 with a detail message."""
    client.user_service_mock.get_user_by_id.return_value = None
    response = client.get("/users/64f0c9f4b0d1c8b7b8e1f0a2")
    assert response.status_code == status.HTTP_404_NOT_FOUND
    assert response.json()["detail"] == "User not found"
 class TestCreateUser:
  """Tests for POST /users with a mocked user service."""
  def test_i_can_create_user(self, client, fake_user_response):
    """A valid payload yields 201 and the user returned by the service."""
    user_data = UserCreate(username="newuser",
                           email="new@example.com",
                           password="#Passw0rd!",
                           role=UserRole.USER)
    client.user_service_mock.create_user.return_value = fake_user_response
    response = client.post("/users", json=user_data.model_dump(mode="json"))
    assert response.status_code == status.HTTP_201_CREATED
    data = response.json()
    assert data["username"] == fake_user_response.username
  def test_i_cannot_create_user_when_service_raises_value_error(self, client):
    """The request is rejected with 422 when the service is set to raise ValueError."""
    user_data = {"username": "baduser", "email": "bad@example.com", "role": "user", "password": "password"}
    client.user_service_mock.create_user.side_effect = ValueError("Invalid data")
    response = client.post("/users", json=user_data)
    # NOTE(review): the update route maps the same ValueError to 400, so this
    # 422 may come from request validation instead - confirm the payload
    # actually reaches the mocked create_user
    assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
 class TestUpdateUser:
  """Tests for PUT /users/{id} with a mocked user service."""
  def test_i_can_update_user(self, client, fake_user_response):
    """A successful update returns 200 and the user returned by the service."""
    user_data = {"username": "updateduser", "email": "updated@example.com"}
    client.user_service_mock.update_user.return_value = fake_user_response
    response = client.put(f"/users/{fake_user_response.id}", json=user_data)
    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["username"] == fake_user_response.username
  def test_i_cannot_update_user_not_found(self, client):
    """A None result from the service is reported as 404 with a detail message."""
    client.user_service_mock.update_user.return_value = None
    user_data = {"username": "updateduser"}
    response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
    assert response.status_code == status.HTTP_404_NOT_FOUND
    assert response.json()["detail"] == "User not found"
  def test_i_cannot_update_user_when_service_raises_value_error(self, client):
    """A ValueError from the service becomes a 400 carrying the error text."""
    client.user_service_mock.update_user.side_effect = ValueError("Invalid update")
    user_data = {"username": "badupdate"}
    response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert response.json()["detail"] == "Invalid update"
 class TestDeleteUser:
  """Tests for DELETE /users/{id} with a mocked user service."""
  def test_i_can_delete_user(self, client):
    """A True result from the service yields 200 and a confirmation message."""
    client.user_service_mock.delete_user.return_value = True
    response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a1")
    assert response.status_code == status.HTTP_200_OK
    data = response.json()
    assert data["message"] == "User successfully deleted"
  def test_i_cannot_delete_user_not_found(self, client):
    """A False result from the service is reported as 404 with a detail message."""
    client.user_service_mock.delete_user.return_value = False
    response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a2")
    assert response.status_code == status.HTTP_404_NOT_FOUND
    assert response.json()["detail"] == "User not found"
--- a/tests/database/init.py
+++ b/tests/database/init.py
--- a/tests/models/init.py
+++ b/tests/models/init.py
--- a/tests/models/test_user_models.py
+++ b/tests/models/test_user_models.py
@@ -10,8 +10,8 @@ from pydantic import ValidationError
 from datetime import datetime
 from bson import ObjectId
-from app.models.user import UserCreate, UserUpdate, UserInDB, UserResponse
+from app.models.user import UserCreate, UserUpdate, UserInDB
-from app.models.auth import UserRole
+from app.models.auth import UserRole, UserResponse
 class TestUserCreateModel:
@@ -349,7 +349,7 @@ class TestUserResponseModel:
    # Convert to response model (excluding password_hash)
    user_response = UserResponse(
-      id=user_in_db.id,
+      _id=user_in_db.id,
      username=user_in_db.username,
      email=user_in_db.email,
      role=user_in_db.role,
--- a/tests/repositories/__init__.py
+++ b/tests/repositories/__init__.py
--- a/tests/repositories/test_document_repository.py
+++ b/tests/repositories/test_document_repository.py
@@ -0,0 +1,611 @@
 """
 Test suite for the synchronous FileDocumentRepository.
 This module contains comprehensive tests for all FileDocumentRepository methods
 using mongomock for in-memory MongoDB testing.
 """
 from datetime import datetime
 import pytest
 from bson import ObjectId
 from mongomock.mongo_client import MongoClient
 from pymongo.errors import PyMongoError
 from app.database.repositories.document_repository import (
  FileDocumentRepository,
  MatchMethodBase,
  SubsequenceMatching,
  FuzzyMatching
 )
 from app.models.document import FileDocument, FileType, ExtractionMethod
@pytest.fixture
 def in_memory_repository():
  """Create an in-memory FileDocumentRepository for testing."""
  client = MongoClient()
  db = client.test_database
  repo = FileDocumentRepository(db)
  repo.initialize()
  return repo
@pytest.fixture
 def sample_file_document():
  """Sample FileDocument data for testing."""
  return FileDocument(
    filename="sample_document.pdf",
    filepath="/home/user/documents/sample_document.pdf",
    file_type=FileType.PDF,
    extraction_method=ExtractionMethod.OCR,
    metadata={"pages": 5, "language": "en", "author": "John Doe"},
    detected_at=datetime.now(),
    file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
    encoding="utf-8",
    file_size=1024000,
    mime_type="application/pdf"
  )
@pytest.fixture
 def sample_update_data():
  """Sample update data for testing."""
  return {
      "extraction_method": ExtractionMethod.HYBRID,
      "metadata": {"pages": 10, "language": "fr", "updated": True},
      "file_size": 2048000
  }
@pytest.fixture
 def multiple_sample_files():
  """Multiple FileDocument objects for list/search testing."""
  base_time = datetime.now()
  return [
      FileDocument(
        filename="first_doc.txt",
        filepath="/docs/first_doc.txt",
        file_type=FileType.TXT,
        extraction_method=ExtractionMethod.DIRECT_TEXT,
        metadata={"words": 500},
        detected_at=base_time,
        file_hash="hash1" + "0" * 58,
        encoding="utf-8",
        file_size=5000,
        mime_type="text/plain"
      ),
      FileDocument(
        filename="second_document.pdf",
        filepath="/docs/second_document.pdf",
        file_type=FileType.PDF,
        extraction_method=ExtractionMethod.OCR,
        metadata={"pages": 8},
        detected_at=base_time,
        file_hash="hash2" + "0" * 58,
        encoding="utf-8",
        file_size=10000,
        mime_type="application/pdf"
      ),
      FileDocument(
        filename="third_file.docx",
        filepath="/docs/third_file.docx",
        file_type=FileType.DOCX,
        extraction_method=ExtractionMethod.HYBRID,
        metadata={"paragraphs": 15},
        detected_at=base_time,
        file_hash="hash3" + "0" * 58,
        encoding="utf-8",
        file_size=15000,
        mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
      )
  ]
 class TestFileDocumentRepositoryInitialization:
  """Tests for repository initialization."""
  def test_i_can_initialize_repository(self):
    """Test that a repository built on a mongomock database initializes without raising."""
    # Arrange
    client = MongoClient()
    db = client.test_database
    # Act (initialize() must not raise any exception)
    repo = FileDocumentRepository(db)
    repo.initialize()
    # Assert
    assert repo.db is not None
    assert repo.collection is not None
    # TODO: check that the indexes are created
 class TestFileDocumentRepositoryCreation:
  """Tests for file document creation functionality."""
  def test_i_can_create_file_document(self, in_memory_repository, sample_file_document):
    """Test successful file document creation."""
    # Act
    created_file = in_memory_repository.create_document(sample_file_document)
    # Assert
    assert created_file is not None
    assert created_file.filename == sample_file_document.filename
    assert created_file.filepath == sample_file_document.filepath
    assert created_file.file_type == sample_file_document.file_type
    assert created_file.extraction_method == sample_file_document.extraction_method
    assert created_file.metadata == sample_file_document.metadata
    assert created_file.file_hash == sample_file_document.file_hash
    assert created_file.file_size == sample_file_document.file_size
    assert created_file.mime_type == sample_file_document.mime_type
    assert created_file.id is not None
    assert isinstance(created_file.id, ObjectId)
  def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document):
    """Test creating file document with _id set to None (should be removed)."""
    # Arrange
    sample_file_document.id = None
    # Act
    created_file = in_memory_repository.create_document(sample_file_document)
    # Assert
    assert created_file is not None
    assert created_file.id is not None
    assert isinstance(created_file.id, ObjectId)
  def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository,
                                                            sample_file_document, mocker):
    """Test handling of PyMongo errors during file document creation."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))
    # Act & Assert
    with pytest.raises(ValueError) as exc_info:
      in_memory_repository.create_document(sample_file_document)
    assert "Failed to create file document" in str(exc_info.value)
 class TestFileDocumentRepositoryFinding:
  """Tests for file document finding functionality."""
  def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
    """Test finding file document by valid ObjectId."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    # Act
    found_file = in_memory_repository.find_document_by_id(str(created_file.id))
    # Assert
    assert found_file is not None
    assert found_file.id == created_file.id
    assert found_file.filename == created_file.filename
    assert found_file.filepath == created_file.filepath
  def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
    """Test that invalid ObjectId returns None."""
    # Act
    found_file = in_memory_repository.find_document_by_id("invalid_id")
    # Assert
    assert found_file is None
  def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
    """Test that nonexistent but valid ObjectId returns None."""
    # Arrange
    nonexistent_id = str(ObjectId())
    # Act
    found_file = in_memory_repository.find_document_by_id(nonexistent_id)
    # Assert
    assert found_file is None
  def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document):
    """Test finding file document by file hash."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    # Act
    found_file = in_memory_repository.find_document_by_hash(sample_file_document.file_hash)
    # Assert
    assert found_file is not None
    assert found_file.file_hash == created_file.file_hash
    assert found_file.id == created_file.id
  def test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository):
    """Test that nonexistent file hash returns None."""
    # Act
    found_file = in_memory_repository.find_document_by_hash("nonexistent_hash")
    # Assert
    assert found_file is None
  def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
    """Test finding file document by filepath."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    # Act
    found_file = in_memory_repository.find_document_by_filepath(sample_file_document.filepath)
    # Assert
    assert found_file is not None
    assert found_file.filepath == created_file.filepath
    assert found_file.id == created_file.id
  def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
    """Test that nonexistent filepath returns None."""
    # Act
    found_file = in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf")
    # Assert
    assert found_file is None
  def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during file document finding."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))
    # Act
    found_file = in_memory_repository.find_document_by_hash("test_hash")
    # Assert
    assert found_file is None
 class TestFileDocumentRepositoryNameMatching:
  """Tests for file document name matching functionality."""
  def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files):
    """Test finding file documents by filename using fuzzy matching."""
    # Arrange
    for file_doc in multiple_sample_files:
      in_memory_repository.create_document(file_doc)
    # Act
    fuzzy_method = FuzzyMatching(threshold=0.5)
    found_files = in_memory_repository.find_document_by_name("document", fuzzy_method)
    # Assert
    assert len(found_files) >= 1
    assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
    # Should find files with "document" in the name
    found_filenames = [f.filename for f in found_files]
    assert any("document" in fname.lower() for fname in found_filenames)
  def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository,
                                                                  multiple_sample_files):
    """Test finding file documents by filename using subsequence matching."""
    # Arrange
    for file_doc in multiple_sample_files:
      in_memory_repository.create_document(file_doc)
    # Act
    subsequence_method = SubsequenceMatching()
    found_files = in_memory_repository.find_document_by_name("doc", subsequence_method)
    # Assert
    assert len(found_files) >= 1
    assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
  def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files):
    """Test finding file documents by filename with the default matching method.

    No matching method is passed to find_document_by_name, so the
    repository's default is exercised.
    """
    # Arrange
    for file_doc in multiple_sample_files:
      in_memory_repository.create_document(file_doc)
    # Act
    found_files = in_memory_repository.find_document_by_name("first")
    # Assert
    # A `len(...) >= 0` check can never fail, so assert the return type instead.
    # NOTE(review): which files the default matcher returns for "first" is not
    # visible from this module - tighten to an expected filename once confirmed.
    assert isinstance(found_files, list)
    assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
  def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during document name matching."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
    # Act
    found_files = in_memory_repository.find_document_by_name("test")
    # Assert
    assert found_files == []
 class TestFileDocumentRepositoryListing:
  """Tests for file document listing functionality."""
  def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files):
    """Test listing file documents with default pagination."""
    # Arrange
    for file_doc in multiple_sample_files:
      in_memory_repository.create_document(file_doc)
    # Act
    files = in_memory_repository.list_documents()
    # Assert
    assert len(files) == len(multiple_sample_files)
    assert all(isinstance(file_doc, FileDocument) for file_doc in files)
  def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files):
    """Test listing file documents with custom pagination."""
    # Arrange
    for file_doc in multiple_sample_files:
      in_memory_repository.create_document(file_doc)
    # Act
    files_page1 = in_memory_repository.list_documents(skip=0, limit=2)
    files_page2 = in_memory_repository.list_documents(skip=2, limit=2)
    # Assert
    assert len(files_page1) == 2
    assert len(files_page2) == 1  # Only 3 total files
    # Ensure no overlap between pages
    page1_ids = [file_doc.id for file_doc in files_page1]
    page2_ids = [file_doc.id for file_doc in files_page2]
    assert len(set(page1_ids).intersection(set(page2_ids))) == 0
  def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document):
    """Test that file documents are sorted by detected_at in descending order."""
    # Arrange
    file1 = sample_file_document.model_copy()
    file1.filepath = "/docs/file1.pdf"
    file1.filename = "file1.pdf"
    file1.file_hash = "hash1" + "0" * 58
    file1.detected_at = datetime(2024, 1, 1, 10, 0, 0)
    file2 = sample_file_document.model_copy()
    file2.filepath = "/docs/file2.pdf"
    file2.filename = "file2.pdf"
    file2.file_hash = "hash2" + "0" * 58
    file2.detected_at = datetime(2024, 1, 2, 10, 0, 0)  # Later date
    created_file1 = in_memory_repository.create_document(file1)
    created_file2 = in_memory_repository.create_document(file2)
    # Act
    files = in_memory_repository.list_documents()
    # Assert
    assert len(files) == 2
    # Most recent (latest detected_at) should be first
    assert files[0].id == created_file2.id
    assert files[1].id == created_file1.id
  def test_i_can_list_empty_documents(self, in_memory_repository):
    """Test listing file documents from empty collection."""
    # Act
    files = in_memory_repository.list_documents()
    # Assert
    assert files == []
  def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during file document listing."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
    # Act
    files = in_memory_repository.list_documents()
    # Assert
    assert files == []
 class TestFileDocumentRepositoryUpdate:
  """Tests for file document update functionality."""
  def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                              sample_update_data):
    """Test successful file document update."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    # Act
    updated_file = in_memory_repository.update_document(str(created_file.id), sample_update_data)
    # Assert
    assert updated_file is not None
    assert updated_file.extraction_method == sample_update_data["extraction_method"]
    assert updated_file.metadata == sample_update_data["metadata"]
    assert updated_file.file_size == sample_update_data["file_size"]
    assert updated_file.id == created_file.id
    assert updated_file.filename == created_file.filename  # Unchanged fields remain
    assert updated_file.filepath == created_file.filepath
  def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
    """Test updating file document with partial data."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    partial_update = {"file_size": 999999}
    # Act
    updated_file = in_memory_repository.update_document(str(created_file.id), partial_update)
    # Assert
    assert updated_file is not None
    assert updated_file.file_size == 999999
    assert updated_file.filename == created_file.filename  # Should remain unchanged
    assert updated_file.metadata == created_file.metadata  # Should remain unchanged
  def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
    """Test that None values are filtered out from update data."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    update_with_none = {"file_size": 777777, "metadata": None}
    # Act
    updated_file = in_memory_repository.update_document(str(created_file.id), update_with_none)
    # Assert
    assert updated_file is not None
    assert updated_file.file_size == 777777
    assert updated_file.metadata == created_file.metadata  # Should remain unchanged (None filtered out)
  def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
    """Test updating file document with empty data returns current document."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    empty_update = {}
    # Act
    result = in_memory_repository.update_document(str(created_file.id), empty_update)
    # Assert
    assert result is not None
    assert result.filename == created_file.filename
    assert result.filepath == created_file.filepath
    assert result.metadata == created_file.metadata
  def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
    """Test that updating with invalid ID returns None."""
    # Act
    result = in_memory_repository.update_document("invalid_id", sample_update_data)
    # Assert
    assert result is None
  def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
    """Test that updating nonexistent file document returns None."""
    # Arrange
    nonexistent_id = str(ObjectId())
    # Act
    result = in_memory_repository.update_document(nonexistent_id, sample_update_data)
    # Assert
    assert result is None
  def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                       sample_update_data, mocker):
    """Test handling of PyMongo errors during file document update."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                        side_effect=PyMongoError("Database error"))
    # Act
    result = in_memory_repository.update_document(str(created_file.id), sample_update_data)
    # Assert
    assert result is None
 class TestFileDocumentRepositoryDeletion:
  """Tests for file document deletion functionality."""
  def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
    """Test successful file document deletion."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    # Act
    deletion_result = in_memory_repository.delete_document(str(created_file.id))
    # Assert
    assert deletion_result is True
    # Verify document is actually deleted
    found_file = in_memory_repository.find_document_by_id(str(created_file.id))
    assert found_file is None
  def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
    """Test that deleting with invalid ID returns False."""
    # Act
    result = in_memory_repository.delete_document("invalid_id")
    # Assert
    assert result is False
  def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
    """Test that deleting nonexistent file document returns False."""
    # Arrange
    nonexistent_id = str(ObjectId())
    # Act
    result = in_memory_repository.delete_document(nonexistent_id)
    # Assert
    assert result is False
  def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
    """Test handling of PyMongo errors during file document deletion."""
    # Arrange
    created_file = in_memory_repository.create_document(sample_file_document)
    mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))
    # Act
    result = in_memory_repository.delete_document(str(created_file.id))
    # Assert
    assert result is False
 class TestFileDocumentRepositoryUtilities:
  """Tests for utility methods."""
  def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
    """Test counting file documents."""
    # Arrange
    initial_count = in_memory_repository.count_documents()
    in_memory_repository.create_document(sample_file_document)
    # Act
    final_count = in_memory_repository.count_documents()
    # Assert
    assert final_count == initial_count + 1
  def test_i_can_count_zero_documents(self, in_memory_repository):
    """Test counting file documents in empty collection."""
    # Act
    count = in_memory_repository.count_documents()
    # Assert
    assert count == 0
  def test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during file document counting."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'count_documents', side_effect=PyMongoError("Database error"))
    # Act
    count = in_memory_repository.count_documents()
    # Assert
    assert count == 0
 class TestMatchingMethods:
  """Tests for matching method classes."""
  def test_i_can_create_fuzzy_matching_with_default_threshold(self):
    """Test creating FuzzyMatching with default threshold."""
    # Act
    fuzzy = FuzzyMatching()
    # Assert
    assert fuzzy.threshold == 0.6
  def test_i_can_create_fuzzy_matching_with_custom_threshold(self):
    """Test creating FuzzyMatching with custom threshold."""
    # Act
    fuzzy = FuzzyMatching(threshold=0.8)
    # Assert
    assert fuzzy.threshold == 0.8
  def test_i_can_create_subsequence_matching(self):
    """Test creating SubsequenceMatching."""
    # Act
    subsequence = SubsequenceMatching()
    # Assert
    assert isinstance(subsequence, MatchMethodBase)
    assert isinstance(subsequence, SubsequenceMatching)
--- a/tests/repositories/test_job_repository.py
+++ b/tests/repositories/test_job_repository.py
@@ -0,0 +1,496 @@
 """
 Test suite for the synchronous JobRepository.
 This module contains comprehensive tests for all JobRepository methods
 using in-memory MongoDB mocks (mongomock) for testing.
 """
 from datetime import datetime
 import pytest
 from bson import ObjectId
 from mongomock.mongo_client import MongoClient
 from mongomock_motor import AsyncMongoMockClient
 from pymongo.errors import PyMongoError
 from app.database.repositories.job_repository import JobRepository
 from app.exceptions.job_exceptions import JobRepositoryError
 from app.models.job import ProcessingJob, ProcessingStatus
 from app.models.types import PyObjectId
@pytest.fixture
 def in_memory_repository():
  """Create an in-memory JobRepository for testing."""
  client = MongoClient()
  db = client.test_database
  repo = JobRepository(db)
  repo.initialize()
  return repo
@pytest.fixture
 def sample_document_id():
  """Sample document ObjectId for testing."""
  return PyObjectId()
@pytest.fixture
 def sample_task_id():
  """Sample Celery task ID for testing."""
  return "celery-task-12345-abcde"
@pytest.fixture
 def multiple_sample_jobs():
  """Multiple ProcessingJob objects for testing."""
  doc_id_1 = ObjectId()
  doc_id_2 = ObjectId()
  base_time = datetime.utcnow()
  return [
      ProcessingJob(
        document_id=doc_id_1,
        status=ProcessingStatus.PENDING,
        task_id="task-1",
        created_at=base_time,
        started_at=None,
        completed_at=None,
        error_message=None
      ),
      ProcessingJob(
        document_id=doc_id_2,
        status=ProcessingStatus.PROCESSING,
        task_id="task-2",
        created_at=base_time,
        started_at=base_time,
        completed_at=None,
        error_message=None
      ),
      ProcessingJob(
        document_id=doc_id_1,
        status=ProcessingStatus.COMPLETED,
        task_id="task-3",
        created_at=base_time,
        started_at=base_time,
        completed_at=base_time,
        error_message=None
      )
  ]
 class TestJobRepositoryInitialization:
  """Tests for repository initialization."""
  def test_i_can_initialize_repository(self):
    """Test repository initialization.

    initialize() is expected to return the repository instance itself and
    leave both the database and the collection handles set.
    """
    # Arrange
    # Use the synchronous mongomock client, exactly like the in_memory_repository
    # fixture: JobRepository is called without awaiting anywhere in this module,
    # so an async mock client would hand it un-awaited coroutines.
    client = MongoClient()
    db = client.test_database
    repo = JobRepository(db)
    # Act
    initialized_repo = repo.initialize()
    # Assert
    assert initialized_repo is repo
    assert repo.db is not None
    assert repo.collection is not None
 class TestJobRepositoryCreation:
  """Tests for job creation functionality."""
  def test_i_can_create_job_with_task_id(self, in_memory_repository, sample_document_id, sample_task_id):
    """Test successful job creation with task ID."""
    # Act
    created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)
    # Assert
    assert created_job is not None
    assert created_job.document_id == sample_document_id
    assert created_job.task_id == sample_task_id
    assert created_job.status == ProcessingStatus.PENDING
    assert created_job.created_at is not None
    assert created_job.started_at is None
    assert created_job.completed_at is None
    assert created_job.error_message is None
    assert created_job.id is not None
    assert isinstance(created_job.id, ObjectId)
  def test_i_can_create_job_without_task_id(self, in_memory_repository, sample_document_id):
    """Test successful job creation without task ID."""
    # Act
    created_job = in_memory_repository.create_job(sample_document_id)
    # Assert
    assert created_job is not None
    assert created_job.document_id == sample_document_id
    assert created_job.task_id is None
    assert created_job.status == ProcessingStatus.PENDING
    assert created_job.created_at is not None
    assert created_job.started_at is None
    assert created_job.completed_at is None
    assert created_job.error_message is None
    assert created_job.id is not None
    assert isinstance(created_job.id, ObjectId)
  def test_i_cannot_create_duplicate_job_for_document(self, in_memory_repository, sample_document_id,
                                                      sample_task_id):
    """Test that creating a second job for the same document_id raises JobRepositoryError."""
    # Arrange
    in_memory_repository.create_job(sample_document_id, sample_task_id)
    # Act & Assert
    # NOTE(review): presumably a unique index on document_id causes the failure,
    # which the repository wraps in JobRepositoryError - confirm in JobRepository.
    with pytest.raises(JobRepositoryError) as exc_info:
      in_memory_repository.create_job(sample_document_id, "different-task-id")
    # The error text is expected to name the failing operation.
    assert "create_job" in str(exc_info.value)
  def test_i_cannot_create_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
    """Test handling of PyMongo errors during job creation."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))
    # Act & Assert
    with pytest.raises(JobRepositoryError) as exc_info:
      in_memory_repository.create_job(sample_document_id)
    assert "create_job" in str(exc_info.value)
 class TestJobRepositoryFinding:
  """Tests for job finding functionality."""
  def test_i_can_find_job_by_valid_id(self, in_memory_repository, sample_document_id, sample_task_id):
    """Test finding job by valid ObjectId."""
    # Arrange
    created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)
    # Act
    found_job = in_memory_repository.find_job_by_id(created_job.id)
    # Assert
    assert found_job is not None
    assert found_job.id == created_job.id
    assert found_job.document_id == created_job.document_id
    assert found_job.task_id == created_job.task_id
    assert found_job.status == created_job.status
  def test_i_cannot_find_job_by_nonexistent_id(self, in_memory_repository):
    """Test that nonexistent ObjectId returns None."""
    # Arrange
    nonexistent_id = PyObjectId()
    # Act
    found_job = in_memory_repository.find_job_by_id(nonexistent_id)
    # Assert
    assert found_job is None
  def test_i_cannot_find_job_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during job finding."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))
    # Act & Assert
    with pytest.raises(JobRepositoryError) as exc_info:
      in_memory_repository.find_job_by_id(PyObjectId())
    assert "get_job_by_id" in str(exc_info.value)
  def test_i_can_find_jobs_by_document_id(self, in_memory_repository, sample_document_id, sample_task_id):
    """Test finding jobs by document ID."""
    # Arrange
    created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)
    # Act
    found_jobs = in_memory_repository.find_jobs_by_document_id(sample_document_id)
    # Assert
    assert len(found_jobs) == 1
    assert found_jobs[0].id == created_job.id
    assert found_jobs[0].document_id == sample_document_id
  def test_i_can_find_empty_jobs_list_for_nonexistent_document(self, in_memory_repository):
    """Test that nonexistent document ID returns empty list."""
    # Arrange
    nonexistent_id = ObjectId()
    # Act
    found_jobs = in_memory_repository.find_jobs_by_document_id(nonexistent_id)
    # Assert
    assert found_jobs == []
  def test_i_cannot_find_jobs_by_document_with_pymongo_error(self, in_memory_repository, mocker):
    """Test handling of PyMongo errors during finding jobs by document ID."""
    # Arrange
    mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
    # Act & Assert
    with pytest.raises(JobRepositoryError) as exc_info:
      in_memory_repository.find_jobs_by_document_id(PyObjectId())
    assert "get_jobs_by_file_id" in str(exc_info.value)
  @pytest.mark.parametrize("status", [
      ProcessingStatus.PENDING,
      ProcessingStatus.PROCESSING,
      ProcessingStatus.COMPLETED
  ])
  def test_i_can_find_jobs_by_pending_status(self, in_memory_repository, sample_document_id, status):
    """Test finding jobs by status, once per parametrized status.

    Despite the test name, this runs for PENDING, PROCESSING and COMPLETED.
    """
    # Arrange
    created_job = in_memory_repository.create_job(sample_document_id)
    # Move the freshly created job into the status under test.
    in_memory_repository.update_job_status(created_job.id, status)
    # Act
    found_jobs = in_memory_repository.get_jobs_by_status(status)
    # Assert
    assert len(found_jobs) == 1
    assert found_jobs[0].id == created_job.id
    assert found_jobs[0].status == status
  def test_i_can_find_jobs_by_failed_status(self, in_memory_repository, sample_document_id):
    """Test finding jobs by FAILED status."""
    # Arrange
    created_job = in_memory_repository.create_job(sample_document_id)
    in_memory_repository.update_job_status(created_job.id, ProcessingStatus.FAILED, "Test error")
    # Act
    found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED)
    # Assert
    assert len(found_jobs) == 1
    assert found_jobs[0].id == created_job.id
    assert found_jobs[0].status == ProcessingStatus.FAILED
    assert found_jobs[0].error_message == "Test error"
  def test_i_can_find_empty_jobs_list_for_unused_status(self, in_memory_repository):
    """Test that unused status returns empty list."""
    # Act
    found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED)
    # Assert
    assert found_jobs == []
  def test_i_cannot_find_jobs_by_status_with_pymongo_error(self, in_memory_repository, mocker):
    """Test that a PyMongoError from the collection's find surfaces as JobRepositoryError."""
    # Arrange: make the underlying collection lookup fail
    mocker.patch.object(
      in_memory_repository.collection,
      'find',
      side_effect=PyMongoError("Database error"),
    )
    # Act & Assert
    with pytest.raises(JobRepositoryError) as raised:
      in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING)
    # The error text must name the failing repository operation
    message = str(raised.value)
    assert "get_jobs_by_status" in message
 class TestJobRepositoryStatusUpdate:
  """Tests for update_job_status: resulting timestamps, error messages and failure paths."""
  def test_i_can_update_job_status_to_processing(self, in_memory_repository, sample_document_id):
    """Test that moving a new job to PROCESSING sets started_at and nothing else."""
    # Arrange
    job = in_memory_repository.create_job(sample_document_id)
    # Act
    result = in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING)
    # Assert
    assert result is not None
    assert result.id == job.id
    assert result.status == ProcessingStatus.PROCESSING
    assert result.started_at is not None
    assert result.completed_at is None
    assert result.error_message is None
  def test_i_can_update_job_status_to_completed(self, in_memory_repository, sample_document_id):
    """Test that completing a started job sets completed_at and keeps started_at."""
    # Arrange: the job goes through PROCESSING first
    job = in_memory_repository.create_job(sample_document_id)
    in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING)
    # Act
    result = in_memory_repository.update_job_status(job.id, ProcessingStatus.COMPLETED)
    # Assert
    assert result is not None
    assert result.id == job.id
    assert result.status == ProcessingStatus.COMPLETED
    assert result.started_at is not None
    assert result.completed_at is not None
    assert result.error_message is None
  def test_i_can_update_job_status_to_failed_with_error(self, in_memory_repository, sample_document_id):
    """Test that failing a job with a message stores the message and completed_at."""
    # Arrange
    job = in_memory_repository.create_job(sample_document_id)
    reason = "Processing failed due to invalid format"
    # Act
    result = in_memory_repository.update_job_status(job.id, ProcessingStatus.FAILED, reason)
    # Assert
    assert result is not None
    assert result.id == job.id
    assert result.status == ProcessingStatus.FAILED
    assert result.completed_at is not None
    assert result.error_message == reason
  def test_i_can_update_job_status_to_failed_without_error(self, in_memory_repository, sample_document_id):
    """Test that failing a job without a message leaves error_message as None."""
    # Arrange
    job = in_memory_repository.create_job(sample_document_id)
    # Act
    result = in_memory_repository.update_job_status(job.id, ProcessingStatus.FAILED)
    # Assert
    assert result is not None
    assert result.id == job.id
    assert result.status == ProcessingStatus.FAILED
    assert result.completed_at is not None
    assert result.error_message is None
  def test_i_cannot_update_nonexistent_job_status(self, in_memory_repository):
    """Test that updating a job id that was never created returns None."""
    # Arrange: a fresh ObjectId that no job was created with
    missing_id = ObjectId()
    # Act
    outcome = in_memory_repository.update_job_status(missing_id, ProcessingStatus.COMPLETED)
    # Assert
    assert outcome is None
  def test_i_cannot_update_job_status_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
    """Test that a PyMongoError from find_one_and_update surfaces as JobRepositoryError."""
    # Arrange: create the job first, then break the collection call used by the update
    job = in_memory_repository.create_job(sample_document_id)
    mocker.patch.object(
      in_memory_repository.collection,
      'find_one_and_update',
      side_effect=PyMongoError("Database error"),
    )
    # Act & Assert
    with pytest.raises(JobRepositoryError) as raised:
      in_memory_repository.update_job_status(job.id, ProcessingStatus.COMPLETED)
    message = str(raised.value)
    assert "update_job_status" in message
 class TestJobRepositoryDeletion:
  """Tests for delete_job: successful removal, missing job and database failure."""
  def test_i_can_delete_existing_job(self, in_memory_repository, sample_document_id):
    """Test that deleting an existing job returns True and removes it."""
    # Arrange
    job = in_memory_repository.create_job(sample_document_id)
    # Act
    was_deleted = in_memory_repository.delete_job(job.id)
    # Assert
    assert was_deleted is True
    # The job must no longer be retrievable by its id
    lookup = in_memory_repository.find_job_by_id(job.id)
    assert lookup is None
  def test_i_cannot_delete_nonexistent_job(self, in_memory_repository):
    """Test that deleting a job id that was never created returns False."""
    # Arrange: a fresh ObjectId that no job was created with
    missing_id = ObjectId()
    # Act
    was_deleted = in_memory_repository.delete_job(missing_id)
    # Assert
    assert was_deleted is False
  def test_i_cannot_delete_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
    """Test that a PyMongoError from delete_one surfaces as JobRepositoryError."""
    # Arrange: create the job first, then break the collection call used by the delete
    job = in_memory_repository.create_job(sample_document_id)
    mocker.patch.object(
      in_memory_repository.collection,
      'delete_one',
      side_effect=PyMongoError("Database error"),
    )
    # Act & Assert
    with pytest.raises(JobRepositoryError) as raised:
      in_memory_repository.delete_job(job.id)
    message = str(raised.value)
    assert "delete_job" in message
 class TestJobRepositoryComplexScenarios:
  """Tests that chain several repository calls into end-to-end job scenarios."""
  def test_i_can_handle_complete_job_lifecycle(self, in_memory_repository, sample_document_id, sample_task_id):
    """Test a job going PENDING -> PROCESSING -> COMPLETED, checking timestamps at each step."""
    # Step 1: a freshly created job is pending and has no timestamps
    current = in_memory_repository.create_job(sample_document_id, sample_task_id)
    assert current.status == ProcessingStatus.PENDING
    assert current.started_at is None
    assert current.completed_at is None
    # Step 2: starting the job stamps started_at only
    current = in_memory_repository.update_job_status(current.id, ProcessingStatus.PROCESSING)
    assert current.status == ProcessingStatus.PROCESSING
    assert current.started_at is not None
    assert current.completed_at is None
    # Step 3: completing the job stamps completed_at and records no error
    current = in_memory_repository.update_job_status(current.id, ProcessingStatus.COMPLETED)
    assert current.status == ProcessingStatus.COMPLETED
    assert current.started_at is not None
    assert current.completed_at is not None
    assert current.error_message is None
  def test_i_can_handle_job_failure_scenario(self, in_memory_repository, sample_document_id, sample_task_id):
    """Test a started job that fails: both timestamps and the error message are kept."""
    # Create the job and start processing it
    current = in_memory_repository.create_job(sample_document_id, sample_task_id)
    current = in_memory_repository.update_job_status(current.id, ProcessingStatus.PROCESSING)
    # Fail it with an explicit reason
    reason = "File format not supported"
    current = in_memory_repository.update_job_status(current.id, ProcessingStatus.FAILED, reason)
    # Assert the final failure state
    assert current.status == ProcessingStatus.FAILED
    assert current.started_at is not None
    assert current.completed_at is not None
    assert current.error_message == reason
  def test_i_can_handle_multiple_documents_with_different_statuses(self, in_memory_repository):
    """Test that status queries separate jobs of three documents left in different statuses."""
    # One job per document, each with its own task id
    document_ids = [PyObjectId() for _ in range(3)]
    processing_job, completed_job, failed_job = [
      in_memory_repository.create_job(document_id, task_id)
      for document_id, task_id in zip(document_ids, ("task-1", "task-2", "task-3"))
    ]
    # Move each job to a different status
    in_memory_repository.update_job_status(processing_job.id, ProcessingStatus.PROCESSING)
    in_memory_repository.update_job_status(completed_job.id, ProcessingStatus.COMPLETED)
    in_memory_repository.update_job_status(failed_job.id, ProcessingStatus.FAILED, "Error occurred")
    # Query every status
    in_pending = in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING)
    in_processing = in_memory_repository.get_jobs_by_status(ProcessingStatus.PROCESSING)
    in_completed = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED)
    in_failed = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED)
    # No job is left pending; every other status holds exactly one job
    assert len(in_pending) == 0
    assert len(in_processing) == 1
    assert len(in_completed) == 1
    assert len(in_failed) == 1
    # And each status holds the job that was moved there
    assert in_processing[0].id == processing_job.id
    assert in_completed[0].id == completed_job.id
    assert in_failed[0].id == failed_job.id
--- a/tests/repositories/test_user_repository.py
+++ b/tests/repositories/test_user_repository.py
@@ -1,29 +1,26 @@
 """
-Test suite for UserRepository with async/await support.
+Test suite for UserRepository (synchronous API).
 This module contains comprehensive tests for all UserRepository methods
 using mongomock-motor for in-memory MongoDB testing.
 """
 import pytest
 from datetime import datetime
 import pytest_asyncio
 from bson import ObjectId
 from mongomock.mongo_client import MongoClient
 from pymongo.errors import DuplicateKeyError
 from mongomock_motor import AsyncMongoMockClient
 from app.database.repositories.user_repository import UserRepository
-from app.models.user import UserCreate, UserUpdate, UserInDB
+from app.models.user import UserCreate, UserUpdate
-@pytest_asyncio.fixture
+@pytest.fixture
-async def in_memory_repository():
+def in_memory_repository():
  """Create an in-memory UserRepository for testing."""
-  client = AsyncMongoMockClient()
+  client = MongoClient()
  db = client.test_database
  repo = UserRepository(db)
-  await repo.initialize()
+  repo.initialize()
  return repo
@@ -51,11 +48,10 @@ def sample_user_update():
 class TestUserRepositoryCreation:
  """Tests for user creation functionality."""
-  @pytest.mark.asyncio
+  def test_i_can_create_user(self, in_memory_repository, sample_user_create):
-  async def test_i_can_create_user(self, in_memory_repository, sample_user_create):
    """Test successful user creation."""
    # Act
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    # Assert
    assert created_user is not None
@@ -68,15 +64,14 @@ class TestUserRepositoryCreation:
    assert created_user.updated_at is not None
    assert created_user.hashed_password != sample_user_create.password  # Should be hashed
-  @pytest.mark.asyncio
+  def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create):
-  async def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create):
    """Test that creating user with duplicate username raises DuplicateKeyError."""
    # Arrange
-    await in_memory_repository.create_user(sample_user_create)
+    in_memory_repository.create_user(sample_user_create)
    # Act & Assert
    with pytest.raises(DuplicateKeyError) as exc_info:
-      await in_memory_repository.create_user(sample_user_create)
+      in_memory_repository.create_user(sample_user_create)
    assert "already exists" in str(exc_info.value)
@@ -84,14 +79,13 @@ class TestUserRepositoryCreation:
 class TestUserRepositoryFinding:
  """Tests for user finding functionality."""
-  @pytest.mark.asyncio
+  def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create):
-  async def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create):
    """Test finding user by valid ID."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    # Act
-    found_user = await in_memory_repository.find_user_by_id(str(created_user.id))
+    found_user = in_memory_repository.find_user_by_id(str(created_user.id))
    # Assert
    assert found_user is not None
@@ -99,69 +93,63 @@ class TestUserRepositoryFinding:
    assert found_user.username == created_user.username
    assert found_user.email == created_user.email
-  @pytest.mark.asyncio
+  def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository):
-  async def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository):
    """Test that invalid ObjectId returns None."""
    # Act
-    found_user = await in_memory_repository.find_user_by_id("invalid_id")
+    found_user = in_memory_repository.find_user_by_id("invalid_id")
    # Assert
    assert found_user is None
-  @pytest.mark.asyncio
+  def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository):
-  async def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository):
    """Test that nonexistent but valid ObjectId returns None."""
    # Arrange
    nonexistent_id = str(ObjectId())
    # Act
-    found_user = await in_memory_repository.find_user_by_id(nonexistent_id)
+    found_user = in_memory_repository.find_user_by_id(nonexistent_id)
    # Assert
    assert found_user is None
-  @pytest.mark.asyncio
+  def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create):
-  async def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create):
    """Test finding user by username."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    # Act
-    found_user = await in_memory_repository.find_user_by_username(sample_user_create.username)
+    found_user = in_memory_repository.find_user_by_username(sample_user_create.username)
    # Assert
    assert found_user is not None
    assert found_user.username == created_user.username
    assert found_user.id == created_user.id
-  @pytest.mark.asyncio
+  def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository):
-  async def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository):
    """Test that nonexistent username returns None."""
    # Act
-    found_user = await in_memory_repository.find_user_by_username("nonexistent")
+    found_user = in_memory_repository.find_user_by_username("nonexistent")
    # Assert
    assert found_user is None
-  @pytest.mark.asyncio
+  def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create):
-  async def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create):
    """Test finding user by email."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    # Act
-    found_user = await in_memory_repository.find_user_by_email(str(sample_user_create.email))
+    found_user = in_memory_repository.find_user_by_email(str(sample_user_create.email))
    # Assert
    assert found_user is not None
    assert found_user.email == created_user.email
    assert found_user.id == created_user.id
-  @pytest.mark.asyncio
+  def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository):
-  async def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository):
    """Test that nonexistent email returns None."""
    # Act
-    found_user = await in_memory_repository.find_user_by_email("nonexistent@example.com")
+    found_user = in_memory_repository.find_user_by_email("nonexistent@example.com")
    # Assert
    assert found_user is None
@@ -170,15 +158,14 @@ class TestUserRepositoryFinding:
 class TestUserRepositoryUpdate:
  """Tests for user update functionality."""
-  @pytest.mark.asyncio
+  def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update):
-  async def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update):
    """Test successful user update."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    original_updated_at = created_user.updated_at
    # Act
-    updated_user = await in_memory_repository.update_user(str(created_user.id), sample_user_update)
+    updated_user = in_memory_repository.update_user(str(created_user.id), sample_user_update)
    # Assert
    assert updated_user is not None
@@ -187,24 +174,22 @@ class TestUserRepositoryUpdate:
    assert updated_user.role == sample_user_update.role
    assert updated_user.id == created_user.id
-  @pytest.mark.asyncio
+  def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update):
-  async def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update):
    """Test that updating with invalid ID returns None."""
    # Act
-    result = await in_memory_repository.update_user("invalid_id", sample_user_update)
+    result = in_memory_repository.update_user("invalid_id", sample_user_update)
    # Assert
    assert result is None
-  @pytest.mark.asyncio
+  def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create):
-  async def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create):
    """Test updating user with partial data."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    partial_update = UserUpdate(username="newusername")
    # Act
-    updated_user = await in_memory_repository.update_user(str(created_user.id), partial_update)
+    updated_user = in_memory_repository.update_user(str(created_user.id), partial_update)
    # Assert
    assert updated_user is not None
@@ -212,15 +197,14 @@ class TestUserRepositoryUpdate:
    assert updated_user.email == created_user.email  # Should remain unchanged
    assert updated_user.role == created_user.role  # Should remain unchanged
-  @pytest.mark.asyncio
+  def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create):
-  async def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create):
    """Test updating user with empty data returns current user."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    empty_update = UserUpdate()
    # Act
-    result = await in_memory_repository.update_user(str(created_user.id), empty_update)
+    result = in_memory_repository.update_user(str(created_user.id), empty_update)
    # Assert
    assert result is not None
@@ -231,39 +215,36 @@ class TestUserRepositoryUpdate:
 class TestUserRepositoryDeletion:
  """Tests for user deletion functionality."""
-  @pytest.mark.asyncio
+  def test_i_can_delete_user(self, in_memory_repository, sample_user_create):
-  async def test_i_can_delete_user(self, in_memory_repository, sample_user_create):
    """Test successful user deletion."""
    # Arrange
-    created_user = await in_memory_repository.create_user(sample_user_create)
+    created_user = in_memory_repository.create_user(sample_user_create)
    # Act
-    deletion_result = await in_memory_repository.delete_user(str(created_user.id))
+    deletion_result = in_memory_repository.delete_user(str(created_user.id))
    # Assert
    assert deletion_result is True
    # Verify user is actually deleted
-    found_user = await in_memory_repository.find_user_by_id(str(created_user.id))
+    found_user = in_memory_repository.find_user_by_id(str(created_user.id))
    assert found_user is None
-  @pytest.mark.asyncio
+  def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository):
-  async def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository):
    """Test that deleting with invalid ID returns False."""
    # Act
-    result = await in_memory_repository.delete_user("invalid_id")
+    result = in_memory_repository.delete_user("invalid_id")
    # Assert
    assert result is False
-  @pytest.mark.asyncio
+  def test_i_cannot_delete_nonexistent_user(self, in_memory_repository):
-  async def test_i_cannot_delete_nonexistent_user(self, in_memory_repository):
    """Test that deleting nonexistent user returns False."""
    # Arrange
    nonexistent_id = str(ObjectId())
    # Act
-    result = await in_memory_repository.delete_user(nonexistent_id)
+    result = in_memory_repository.delete_user(nonexistent_id)
    # Assert
    assert result is False
@@ -272,30 +253,27 @@ class TestUserRepositoryDeletion:
 class TestUserRepositoryUtilities:
  """Tests for utility methods."""
-  @pytest.mark.asyncio
+  def test_i_can_count_users(self, in_memory_repository, sample_user_create):
-  async def test_i_can_count_users(self, in_memory_repository, sample_user_create):
    """Test counting users."""
    # Arrange
-    initial_count = await in_memory_repository.count_users()
+    initial_count = in_memory_repository.count_users()
-    await in_memory_repository.create_user(sample_user_create)
+    in_memory_repository.create_user(sample_user_create)
    # Act
-    final_count = await in_memory_repository.count_users()
+    final_count = in_memory_repository.count_users()
    # Assert
    assert final_count == initial_count + 1
-  @pytest.mark.asyncio
+  def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create):
-  async def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create):
    """Test checking if user exists."""
    # Arrange
-    await in_memory_repository.create_user(sample_user_create)
+    in_memory_repository.create_user(sample_user_create)
    # Act
-    exists = await in_memory_repository.user_exists(sample_user_create.username)
+    exists = in_memory_repository.user_exists(sample_user_create.username)
-    not_exists = await in_memory_repository.user_exists("nonexistent")
+    not_exists = in_memory_repository.user_exists("nonexistent")
    # Assert
    assert exists is True
    assert not_exists is False
--- a/tests/services/__init__.py
+++ b/tests/services/__init__.py
--- a/tests/services/test_document_service.py
+++ b/tests/services/test_document_service.py
@@ -0,0 +1,570 @@
 """
 Unit tests for DocumentService using in-memory MongoDB.
 Tests the orchestration logic with real MongoDB operations
 using mongomock for better integration testing.
 """
 import os
 from datetime import datetime
 from unittest.mock import patch
 import pytest
 import pytest_asyncio
 from bson import ObjectId
 from mongomock.mongo_client import MongoClient
 from app.models.document import FileType
 from app.services.document_service import DocumentService
@pytest.fixture(autouse=True)
def cleanup_test_folder():
  """Remove the on-disk "test_folder" directory before and after every test.

  The directory is wiped before the test so each test starts without files
  written by an earlier one, and wiped again afterwards so the last test of
  a run does not leave files behind in the working directory.
  """
  import shutil
  # ignore_errors=True: the folder legitimately may not exist yet
  shutil.rmtree("test_folder", ignore_errors=True)
  yield
  # Teardown: pytest resumes here once the test has finished
  shutil.rmtree("test_folder", ignore_errors=True)
@pytest.fixture
def in_memory_database():
  """Provide the ``test_database`` database of a fresh mongomock client."""
  return MongoClient().test_database
@pytest.fixture
def document_service(in_memory_database):
  """Create a DocumentService backed by the in-memory database.

  File content is written under the "test_folder" directory. This is a plain
  synchronous fixture, so it is declared with ``pytest.fixture`` like the
  other fixtures in this module rather than ``pytest_asyncio.fixture``.
  """
  service = DocumentService(in_memory_database, objects_folder="test_folder")
  return service
@pytest.fixture
def sample_file_bytes():
  """Provide the raw bytes used as stand-in PDF content by the tests."""
  pdf_like_payload = b"This is a test PDF content"
  return pdf_like_payload
@pytest.fixture
def sample_text_bytes():
  """Provide the raw bytes used as stand-in text file content by the tests."""
  text_payload = b"This is a test text file content"
  return text_payload
@pytest.fixture
def sample_file_hash():
  """Provide the SHA256 hex digest of the sample PDF bytes."""
  import hashlib
  digest = hashlib.sha256(b"This is a test PDF content")
  return digest.hexdigest()
 def validate_file_saved(document_service, file_hash, file_bytes):
  """Assert that the content for ``file_hash`` is on disk and equals ``file_bytes``.

  The file is looked up at
  ``<objects_folder>/<first 24 characters of file_hash>/<file_hash>``.
  """
  root = document_service.objects_folder
  stored_path = os.path.join(root, file_hash[:24], file_hash)
  assert os.path.exists(stored_path)
  with open(stored_path, "rb") as stored:
    on_disk = stored.read()
  assert on_disk == file_bytes
 class TestCreateDocument:
  """Tests for DocumentService.create_document."""
  @patch('app.services.document_service.magic.from_buffer')
  @patch('app.services.document_service.datetime')
  def test_i_can_create_document_with_new_content(self, mock_datetime, mock_magic, document_service, sample_file_bytes):
    """Test creating a document whose content has not been stored before."""
    # Freeze the service's clock and make magic.from_buffer report a PDF
    frozen_now = datetime(2025, 1, 1, 10, 30, 0)
    mock_datetime.now.return_value = frozen_now
    mock_magic.return_value = "application/pdf"
    # Execute
    created = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")
    # The returned document describes the file
    assert created is not None
    assert created.filename == "test.pdf"
    assert created.filepath == "/test/test.pdf"
    assert created.file_type == FileType.PDF
    assert created.detected_at == frozen_now
    assert created.file_hash == document_service._calculate_file_hash(sample_file_bytes)
    # The same document can be read back from the database
    stored = document_service.document_repository.find_document_by_id(created.id)
    assert stored is not None
    assert stored.id == created.id
    assert stored.filename == created.filename
    assert stored.filepath == created.filepath
    assert stored.file_type == created.file_type
    assert stored.detected_at == frozen_now
    assert stored.file_hash == created.file_hash
    # The content itself was written to disk
    validate_file_saved(document_service, created.file_hash, sample_file_bytes)
  @patch('app.services.document_service.magic.from_buffer')
  @patch('app.services.document_service.datetime')
  def test_i_can_create_document_with_existing_content(self, mock_datetime, mock_magic, document_service, sample_file_bytes):
    """Test creating a second document with content that already exists (deduplication)."""
    # Freeze the service's clock and make magic.from_buffer report a PDF
    frozen_now = datetime(2025, 1, 1, 10, 30, 0)
    mock_datetime.now.return_value = frozen_now
    mock_magic.return_value = "application/pdf"
    # Two documents at different paths, created from identical bytes
    original = document_service.create_document("/test/first.pdf", sample_file_bytes, "utf-8")
    duplicate = document_service.create_document("/test/second.pdf", sample_file_bytes, "utf-8")
    # Both documents exist separately but share one content hash
    assert original.file_hash == duplicate.file_hash
    assert original.filename != duplicate.filename
    assert original.filepath != duplicate.filepath
  def test_i_cannot_create_document_with_unsupported_file_type(self, document_service, sample_file_bytes):
    """Test that a path with an unsupported extension raises ValueError."""
    unsupported_path = "/test/test.xyz"  # Unsupported extension
    with pytest.raises(ValueError, match="Unsupported file type"):
      document_service.create_document(unsupported_path, sample_file_bytes, "utf-8")
  def test_i_cannot_create_document_with_empty_file_path(self, document_service, sample_file_bytes):
    """Test that an empty file path raises ValueError."""
    empty_path = ""  # Empty path
    with pytest.raises(ValueError):
      document_service.create_document(empty_path, sample_file_bytes, "utf-8")
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_create_document_with_empty_bytes(self, mock_magic, document_service):
    """Test that a zero-byte file is still created and stored on disk."""
    # Setup
    mock_magic.return_value = "text/plain"
    no_content = b""  # Empty bytes
    # Execute
    created = document_service.create_document("/test/empty.txt", no_content, "utf-8")
    # The (empty) content was written to disk
    validate_file_saved(document_service, created.file_hash, b"")
 class TestGetMethods:
  """Tests for the DocumentService lookup methods (by id, hash, filepath, content)."""
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_get_document_by_id(self, mock_magic, document_service, sample_file_bytes):
    """Test that a created document can be fetched by its id."""
    # Setup: store one PDF document
    mock_magic.return_value = "application/pdf"
    stored = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")
    # Execute
    fetched = document_service.get_document_by_id(stored.id)
    # Verify
    assert fetched is not None
    assert fetched.id == stored.id
    assert fetched.filename == stored.filename
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_get_document_by_hash(self, mock_magic, document_service, sample_file_bytes):
    """Test that a created document can be fetched by its file hash."""
    # Setup: store one PDF document
    mock_magic.return_value = "application/pdf"
    stored = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")
    # Execute
    fetched = document_service.get_document_by_hash(stored.file_hash)
    # Verify
    assert fetched is not None
    assert fetched.file_hash == stored.file_hash
    assert fetched.filename == stored.filename
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_get_document_by_filepath(self, mock_magic, document_service, sample_file_bytes):
    """Test that a created document can be fetched by its file path."""
    # Setup: store one PDF document at a known path
    mock_magic.return_value = "application/pdf"
    known_path = "/test/unique_test.pdf"
    stored = document_service.create_document(known_path, sample_file_bytes, "utf-8")
    # Execute
    fetched = document_service.get_document_by_filepath(known_path)
    # Verify
    assert fetched is not None
    assert fetched.filepath == known_path
    assert fetched.id == stored.id
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_get_document_content(self, mock_magic, document_service, sample_file_bytes):
    """Test that the stored bytes can be read back through the file hash."""
    # Setup: store one PDF document
    mock_magic.return_value = "application/pdf"
    stored = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")
    # Execute
    content = document_service.get_document_content_by_hash(stored.file_hash)
    # Verify: the bytes come back unchanged
    assert content == sample_file_bytes
  def test_i_cannot_get_nonexistent_document_by_id(self, document_service):
    """Test that looking up an id no document was created with returns None."""
    # Execute with a fresh ObjectId
    unknown_id = ObjectId()
    fetched = document_service.get_document_by_id(unknown_id)
    # Verify
    assert fetched is None
  def test_i_cannot_get_nonexistent_document_by_hash(self, document_service):
    """Test that looking up a hash no document was created with returns None."""
    # Execute
    fetched = document_service.get_document_by_hash("nonexistent_hash")
    # Verify
    assert fetched is None
 class TestPaginationAndCounting:
  """Tests for DocumentService.list_documents and DocumentService.count_documents."""
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_list_documents_with_pagination(self, mock_magic, document_service, sample_file_bytes):
    """Test that skip/limit bound the listed page while the total count is unaffected."""
    # Setup
    mock_magic.return_value = "application/pdf"
    # Store five documents; the appended index makes every content unique
    for index in range(5):
      unique_bytes = sample_file_bytes + bytes(str(index), 'utf-8')
      document_service.create_document(f"/test/test{index}.pdf", unique_bytes, "utf-8")
    # Execute with pagination
    page = document_service.list_documents(skip=1, limit=2)
    # Verify: the page is capped at the limit
    assert len(page) == 2
    # The overall count still covers every stored document
    stored_total = document_service.count_documents()
    assert stored_total == 5
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_count_documents(self, mock_magic, document_service, sample_file_bytes):
    """Test that the count starts at zero and grows by one per created document."""
    # Setup
    mock_magic.return_value = "text/plain"
    # Nothing has been stored yet
    count_before = document_service.count_documents()
    assert count_before == 0
    # Store three documents; the appended index makes every content unique
    for index in range(3):
      unique_bytes = sample_file_bytes + bytes(str(index), 'utf-8')
      document_service.create_document(f"/test/test{index}.txt", unique_bytes, "utf-8")
    # Execute
    count_after = document_service.count_documents()
    # Verify
    assert count_after == 3
 class TestUpdateAndDelete:
  """Tests for document update and deletion operations."""
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_update_document_metadata(
      self,
      mock_magic,
      document_service,
      sample_file_bytes
  ):
    """Test updating document metadata leaves the other fields untouched."""
    # Setup
    mock_magic.return_value = "application/pdf"
    # Create a document first
    created_doc = document_service.create_document(
      "/test/test.pdf",
      sample_file_bytes,
      "utf-8"
    )
    # Execute update
    update_data = {"metadata": {"page_count": 5}}
    result = document_service.update_document(created_doc.id, update_data)
    # Verify
    assert result is not None
    assert result.metadata.get("page_count") == 5
    assert result.filename == created_doc.filename
    assert result.filepath == created_doc.filepath
    assert result.file_hash == created_doc.file_hash
    assert result.file_type == created_doc.file_type
    assert result.metadata == update_data['metadata']
  def test_i_can_update_document_content(
      self,
      document_service,
      sample_file_bytes
  ):
    """Test updating document content changes the hash and stores the new bytes."""
    # Create a document first
    created_doc = document_service.create_document(
      "/test/test.pdf",
      sample_file_bytes,
      "utf-8"
    )
    # Execute update
    new_content = b"this is an updated file content"
    update_data = {"file_bytes": new_content}
    result = document_service.update_document(created_doc.id, update_data)
    # Verify: fail with a clear message instead of an AttributeError on None
    assert result is not None
    assert result.filename == created_doc.filename
    assert result.filepath == created_doc.filepath
    # New bytes must produce a new content hash
    assert result.file_hash != created_doc.file_hash
    assert result.file_type == created_doc.file_type
    assert result.metadata == created_doc.metadata
    # Verify file is saved to disk
    validate_file_saved(document_service, result.file_hash, new_content)
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_delete_document_and_orphaned_content(
      self,
      mock_magic,
      document_service,
      sample_file_bytes
  ):
    """Test deleting document with orphaned content cleanup."""
    # Setup
    mock_magic.return_value = "application/pdf"
    # Create a document
    created_doc = document_service.create_document(
      "/test/test.pdf",
      sample_file_bytes,
      "utf-8"
    )
    # Verify content exists
    validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)
    # Execute deletion
    result = document_service.delete_document(created_doc.id)
    # Verify document and content are deleted
    assert result is True
    deleted_doc = document_service.get_document_by_id(created_doc.id)
    assert deleted_doc is None
    # Validate content is deleted. Use the FULL hash as the file name: the
    # previous code truncated the hash first, so the checked path had a
    # 24-character file name and the assertion could not fail.
    # NOTE(review): assumes the layout <objects_folder>/<hash[:24]>/<hash>
    # used by validate_file_saved — confirm against that helper.
    file_hash = created_doc.file_hash
    target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
    assert not os.path.exists(target_file_path)
  @patch('app.services.document_service.magic.from_buffer')
  def test_i_can_delete_document_without_affecting_shared_content(
      self,
      mock_magic,
      document_service,
      sample_file_bytes
  ):
    """Test deleting document without removing shared content."""
    # Setup
    mock_magic.return_value = "application/pdf"
    # Create two documents with same content
    doc1 = document_service.create_document(
      "/test/test1.pdf",
      sample_file_bytes,
      "utf-8"
    )
    doc2 = document_service.create_document(
      "/test/test2.pdf",
      sample_file_bytes,
      "utf-8"
    )
    # They should share the same hash
    assert doc1.file_hash == doc2.file_hash
    # Delete first document
    result = document_service.delete_document(doc1.id)
    assert result is True
    # Verify first document is deleted but content still exists
    deleted_doc = document_service.get_document_by_id(doc1.id)
    assert deleted_doc is None
    remaining_doc = document_service.get_document_by_id(doc2.id)
    assert remaining_doc is not None
    validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)
 class TestHashCalculation:
  """Checks on the service's file hash helper."""
  def test_i_can_calculate_consistent_file_hash(self, document_service):
    """Hashing the same bytes twice gives the same 64-character digest."""
    payload = b"Test content for hashing"
    first_digest = document_service._calculate_file_hash(payload)
    second_digest = document_service._calculate_file_hash(payload)
    assert first_digest == second_digest
    # A SHA256 hex digest is 64 characters long
    assert len(first_digest) == 64
  def test_i_get_different_hashes_for_different_content(self, document_service):
    """Two different payloads do not share a digest."""
    digest_a = document_service._calculate_file_hash(b"First content")
    digest_b = document_service._calculate_file_hash(b"Second content")
    assert digest_a != digest_b
 class TestFileTypeDetection:
  """Checks on file type detection from a file path."""
  def test_i_can_detect_pdf_file_type(self, document_service):
    """A .pdf path is detected as FileType.PDF."""
    detected = document_service._detect_file_type("/path/to/document.pdf")
    assert detected == FileType.PDF
  def test_i_can_detect_txt_file_type(self, document_service):
    """A .txt path is detected as FileType.TXT."""
    detected = document_service._detect_file_type("/path/to/document.txt")
    assert detected == FileType.TXT
  def test_i_can_detect_docx_file_type(self, document_service):
    """A .docx path is detected as FileType.DOCX."""
    detected = document_service._detect_file_type("/path/to/document.docx")
    assert detected == FileType.DOCX
  def test_i_cannot_detect_unsupported_file_type(self, document_service):
    """An unknown extension is rejected with a ValueError."""
    unsupported_path = "/path/to/document.xyz"
    with pytest.raises(ValueError, match="Unsupported file type"):
      document_service._detect_file_type(unsupported_path)
--- a/tests/services/test_job_service.py
+++ b/tests/services/test_job_service.py
@@ -0,0 +1,518 @@
 """
 Unit tests for JobService using in-memory MongoDB.
 Tests the business logic operations with real MongoDB operations
 using mongomock for better integration testing.
 """
 import pytest
 from bson import ObjectId
 from mongomock.mongo_client import MongoClient
 from app.exceptions.job_exceptions import InvalidStatusTransitionError
 from app.models.job import ProcessingStatus
 from app.models.types import PyObjectId
 from app.services.job_service import JobService
@pytest.fixture
 def in_memory_database():
  """Create an in-memory database for testing."""
  client = MongoClient()
  return client.test_database
@pytest.fixture
 def job_service(in_memory_database):
  """Create JobService with in-memory repositories."""
  service = JobService(in_memory_database).initialize()
  return service
@pytest.fixture
 def sample_document_id():
  """Sample file ObjectId."""
  return PyObjectId()
@pytest.fixture
 def sample_task_id():
  """Sample Celery task UUID."""
  return "550e8400-e29b-41d4-a716-446655440000"
 class TestCreateJob:
  """Checks on JobService.create_job."""
  def test_i_can_create_job_with_task_id(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """A job created with a task id is PENDING and can be read back."""
    new_job = job_service.create_job(sample_document_id, sample_task_id)
    # The returned job carries the inputs and the initial state
    assert new_job is not None
    assert new_job.document_id == sample_document_id
    assert new_job.task_id == sample_task_id
    assert new_job.status == ProcessingStatus.PENDING
    assert new_job.created_at is not None
    assert new_job.started_at is None
    assert new_job.error_message is None
    # The same job can be fetched again by its id
    stored_job = job_service.get_job_by_id(new_job.id)
    assert stored_job is not None
    assert stored_job.id == new_job.id
    assert stored_job.document_id == sample_document_id
    assert stored_job.task_id == sample_task_id
    assert stored_job.status == ProcessingStatus.PENDING
  def test_i_can_create_job_without_task_id(
      self,
      job_service,
      sample_document_id
  ):
    """A job created without a task id is PENDING with task_id left as None."""
    new_job = job_service.create_job(sample_document_id)
    assert new_job is not None
    assert new_job.document_id == sample_document_id
    assert new_job.task_id is None
    assert new_job.status == ProcessingStatus.PENDING
    assert new_job.created_at is not None
    assert new_job.started_at is None
    assert new_job.error_message is None
 class TestGetJobMethods:
  """Checks on the job lookup methods."""
  def test_i_can_get_job_by_id(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """A created job can be fetched by id with matching fields."""
    original = job_service.create_job(sample_document_id, sample_task_id)
    fetched = job_service.get_job_by_id(original.id)
    assert fetched is not None
    assert fetched.id == original.id
    assert fetched.document_id == original.document_id
    assert fetched.task_id == original.task_id
    assert fetched.status == original.status
  def test_i_can_get_jobs_by_status(
      self,
      job_service,
      sample_document_id
  ):
    """Filtering by status returns only the jobs currently in that status."""
    # One job is left PENDING
    job_left_pending = job_service.create_job(sample_document_id, "pending-task")
    # One job is moved to PROCESSING
    job_in_progress = job_service.create_job(ObjectId(), "processing-task")
    job_service.mark_job_as_started(job_in_progress.id)
    # One job is moved through PROCESSING to COMPLETED
    job_done = job_service.create_job(ObjectId(), "completed-task")
    job_service.mark_job_as_started(job_done.id)
    job_service.mark_job_as_completed(job_done.id)
    # PENDING filter
    pending_jobs = job_service.get_jobs_by_status(ProcessingStatus.PENDING)
    assert len(pending_jobs) == 1
    assert pending_jobs[0].id == job_left_pending.id
    assert pending_jobs[0].status == ProcessingStatus.PENDING
    # PROCESSING filter
    processing_jobs = job_service.get_jobs_by_status(ProcessingStatus.PROCESSING)
    assert len(processing_jobs) == 1
    assert processing_jobs[0].status == ProcessingStatus.PROCESSING
    # COMPLETED filter
    completed_jobs = job_service.get_jobs_by_status(ProcessingStatus.COMPLETED)
    assert len(completed_jobs) == 1
    assert completed_jobs[0].status == ProcessingStatus.COMPLETED
 class TestUpdateStatus:
  """Tests for the job status transition methods.
  Covers mark_job_as_started, mark_job_as_completed and mark_job_as_failed,
  both the accepted transitions and the ones rejected with
  InvalidStatusTransitionError.
  """
  def test_i_can_mark_pending_job_as_started(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test marking pending job as started (PENDING → PROCESSING)."""
    # Create a pending job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    assert created_job.status == ProcessingStatus.PENDING
    # Execute
    result = job_service.mark_job_as_started(created_job.id)
    # Verify status transition
    assert result is not None
    assert result.id == created_job.id
    assert result.status == ProcessingStatus.PROCESSING
    # Verify in database
    updated_job = job_service.get_job_by_id(created_job.id)
    assert updated_job.status == ProcessingStatus.PROCESSING
  def test_i_cannot_mark_processing_job_as_started(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that processing job cannot be marked as started."""
    # Create and start a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    # Try to start it again
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_started(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.PROCESSING
    assert exc_info.value.target_status == ProcessingStatus.PROCESSING
  def test_i_cannot_mark_completed_job_as_started(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that completed job cannot be marked as started."""
    # Create, start, and complete a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_completed(created_job.id)
    # Try to start it again
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_started(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.COMPLETED
    assert exc_info.value.target_status == ProcessingStatus.PROCESSING
  def test_i_cannot_mark_failed_job_as_started(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that failed job cannot be marked as started."""
    # Create, start, and fail a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_failed(created_job.id, "Test error")
    # Try to start it again
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_started(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.FAILED
    assert exc_info.value.target_status == ProcessingStatus.PROCESSING
  def test_i_can_mark_processing_job_as_completed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test marking processing job as completed (PROCESSING → COMPLETED)."""
    # Create and start a job (the started job itself is not needed here)
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    # Execute
    result = job_service.mark_job_as_completed(created_job.id)
    # Verify status transition
    assert result is not None
    assert result.id == created_job.id
    assert result.status == ProcessingStatus.COMPLETED
    # Verify in database
    updated_job = job_service.get_job_by_id(created_job.id)
    assert updated_job.status == ProcessingStatus.COMPLETED
  def test_i_cannot_mark_pending_job_as_completed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that pending job cannot be marked as completed."""
    # Create a pending job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    # Try to complete it directly
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_completed(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.PENDING
    assert exc_info.value.target_status == ProcessingStatus.COMPLETED
  def test_i_cannot_mark_completed_job_as_completed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that completed job cannot be marked as completed again."""
    # Create, start, and complete a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_completed(created_job.id)
    # Try to complete it again
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_completed(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.COMPLETED
    assert exc_info.value.target_status == ProcessingStatus.COMPLETED
  def test_i_cannot_mark_failed_job_as_completed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that failed job cannot be marked as completed."""
    # Create, start, and fail a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_failed(created_job.id, "Test error")
    # Try to complete it
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_completed(created_job.id)
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.FAILED
    assert exc_info.value.target_status == ProcessingStatus.COMPLETED
  def test_i_can_mark_processing_job_as_failed_with_error_message(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test marking processing job as failed with error message."""
    # Create and start a job (the started job itself is not needed here)
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    error_message = "Processing failed due to invalid file format"
    # Execute
    result = job_service.mark_job_as_failed(created_job.id, error_message)
    # Verify status transition
    assert result is not None
    assert result.id == created_job.id
    assert result.status == ProcessingStatus.FAILED
    assert result.error_message == error_message
    # Verify in database
    updated_job = job_service.get_job_by_id(created_job.id)
    assert updated_job.status == ProcessingStatus.FAILED
    assert updated_job.error_message == error_message
  def test_i_can_mark_processing_job_as_failed_without_error_message(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test marking processing job as failed without error message."""
    # Create and start a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    # Execute without error message
    result = job_service.mark_job_as_failed(created_job.id)
    # Verify status transition
    assert result is not None
    assert result.status == ProcessingStatus.FAILED
    assert result.error_message is None
  def test_i_cannot_mark_pending_job_as_failed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that pending job cannot be marked as failed."""
    # Create a pending job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    # Try to fail it directly
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_failed(created_job.id, "Test error")
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.PENDING
    assert exc_info.value.target_status == ProcessingStatus.FAILED
  def test_i_cannot_mark_completed_job_as_failed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that completed job cannot be marked as failed."""
    # Create, start, and complete a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_completed(created_job.id)
    # Try to fail it
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_failed(created_job.id, "Test error")
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.COMPLETED
    assert exc_info.value.target_status == ProcessingStatus.FAILED
  def test_i_cannot_mark_failed_job_as_failed(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Test that failed job cannot be marked as failed again."""
    # Create, start, and fail a job
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)
    job_service.mark_job_as_failed(created_job.id, "First error")
    # Try to fail it again
    with pytest.raises(InvalidStatusTransitionError) as exc_info:
      job_service.mark_job_as_failed(created_job.id, "Second error")
    # Verify exception details
    assert exc_info.value.current_status == ProcessingStatus.FAILED
    assert exc_info.value.target_status == ProcessingStatus.FAILED
 class TestDeleteJob:
  """Checks on JobService.delete_job."""
  def test_i_can_delete_existing_job(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Deleting a stored job returns True and the job can no longer be fetched."""
    job = job_service.create_job(sample_document_id, sample_task_id)
    # Precondition: the job can be fetched before deletion
    assert job_service.get_job_by_id(job.id) is not None
    was_deleted = job_service.delete_job(job.id)
    assert was_deleted is True
    # Postcondition: the lookup now comes back empty
    assert job_service.get_job_by_id(job.id) is None
  def test_i_cannot_delete_nonexistent_job(
      self,
      job_service
  ):
    """Deleting an id that was never stored returns False."""
    unknown_id = ObjectId()
    was_deleted = job_service.delete_job(unknown_id)
    assert was_deleted is False
 class TestStatusTransitionValidation:
  """End-to-end status flows plus behavior against an empty database."""
  def test_valid_job_lifecycle_flow(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Happy path: PENDING → PROCESSING → COMPLETED."""
    new_job = job_service.create_job(sample_document_id, sample_task_id)
    assert new_job.status == ProcessingStatus.PENDING
    # PENDING → PROCESSING
    after_start = job_service.mark_job_as_started(new_job.id)
    assert after_start.status == ProcessingStatus.PROCESSING
    # PROCESSING → COMPLETED
    after_completion = job_service.mark_job_as_completed(new_job.id)
    assert after_completion.status == ProcessingStatus.COMPLETED
  def test_valid_job_failure_flow(
      self,
      job_service,
      sample_document_id,
      sample_task_id
  ):
    """Failure path: PENDING → PROCESSING → FAILED, keeping the error message."""
    new_job = job_service.create_job(sample_document_id, sample_task_id)
    assert new_job.status == ProcessingStatus.PENDING
    # PENDING → PROCESSING
    after_start = job_service.mark_job_as_started(new_job.id)
    assert after_start.status == ProcessingStatus.PROCESSING
    # PROCESSING → FAILED
    after_failure = job_service.mark_job_as_failed(new_job.id, "Test failure")
    assert after_failure.status == ProcessingStatus.FAILED
    assert after_failure.error_message == "Test failure"
  def test_job_operations_with_empty_database(
      self,
      job_service
  ):
    """Lookups and deletion on an empty database return None, [] and False."""
    # Fetch by an id that was never stored
    assert job_service.get_job_by_id(ObjectId()) is None
    # Filter by status with no jobs present
    assert job_service.get_jobs_by_status(ProcessingStatus.PENDING) == []
    # Delete an id that was never stored
    assert job_service.delete_job(ObjectId()) is False
--- a/tests/test_connection.py
+++ b/tests/test_connection.py
@@ -1,187 +0,0 @@
 """
 Unit tests for MongoDB database connection module.
 Tests the database connection functionality with mocking
 to avoid requiring actual MongoDB instance during tests.
 """
 import pytest
 from unittest.mock import Mock, patch, MagicMock
 from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
 from app.database.connection import (
  create_mongodb_client,
  get_database,
  close_database_connection,
  get_mongodb_client,
  test_database_connection
 )
 def test_i_can_get_database_connection():
  """get_database pings the server and returns the database selected by name."""
  client = Mock()
  database = Mock()
  # A plain Mock needs __getitem__ set explicitly to support client["name"]
  client.__getitem__ = Mock(return_value=database)
  client_patch = patch('app.database.connection.MongoClient', return_value=client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017")
  name_patch = patch('app.database.connection.get_mongodb_database_name', return_value="testdb")
  with client_patch, url_patch, name_patch:
    # Reset the module-level globals before connecting
    import app.database.connection as connection_module
    connection_module._client = None
    connection_module._database = None
    obtained = get_database()
    assert obtained == database
    client.admin.command.assert_called_with('ping')
    # The database must have been selected by its configured name
    client.__getitem__.assert_called_with("testdb")
 def test_i_cannot_connect_to_invalid_mongodb_url():
  """A ConnectionFailure on ping makes create_mongodb_client exit with code 1."""
  failing_client = Mock()
  failing_client.admin.command.side_effect = ConnectionFailure("Connection failed")
  client_patch = patch('app.database.connection.MongoClient', return_value=failing_client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017")
  with client_patch, url_patch:
    with pytest.raises(SystemExit) as exit_info:
      create_mongodb_client()
    assert exit_info.value.code == 1
 def test_i_cannot_connect_with_server_selection_timeout():
  """A ServerSelectionTimeoutError on ping makes create_mongodb_client exit with code 1."""
  timing_out_client = Mock()
  timing_out_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout")
  client_patch = patch('app.database.connection.MongoClient', return_value=timing_out_client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017")
  with client_patch, url_patch:
    with pytest.raises(SystemExit) as exit_info:
      create_mongodb_client()
    assert exit_info.value.code == 1
 def test_i_cannot_connect_with_unexpected_error():
  """A generic Exception from MongoClient makes create_mongodb_client exit with code 1."""
  # Here the client constructor itself raises, not the ping
  client_patch = patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error"))
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017")
  with client_patch, url_patch:
    with pytest.raises(SystemExit) as exit_info:
      create_mongodb_client()
    assert exit_info.value.code == 1
 def test_i_can_get_database_singleton():
  """Two get_database calls return the same object and ping only once."""
  client = Mock()
  database = Mock()
  client.__getitem__ = Mock(return_value=database)
  client_patch = patch('app.database.connection.MongoClient', return_value=client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017")
  name_patch = patch('app.database.connection.get_mongodb_database_name', return_value="testdb")
  with client_patch, url_patch, name_patch:
    # Reset the module-level globals before connecting
    import app.database.connection as connection_module
    connection_module._client = None
    connection_module._database = None
    first_handle = get_database()
    second_handle = get_database()
    assert first_handle is second_handle
    # The ping command was issued exactly once across both calls
    assert client.admin.command.call_count == 1
 def test_i_can_close_database_connection():
  """Closing an open connection closes the client and clears both globals."""
  client = Mock()
  database = Mock()
  client.__getitem__ = Mock(return_value=database)
  client_patch = patch('app.database.connection.MongoClient', return_value=client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017")
  name_patch = patch('app.database.connection.get_mongodb_database_name', return_value="testdb")
  with client_patch, url_patch, name_patch:
    # Reset the module-level globals before connecting
    import app.database.connection as connection_module
    connection_module._client = None
    connection_module._database = None
    # Open a connection, then close it
    get_database()
    close_database_connection()
    client.close.assert_called_once()
    assert connection_module._client is None
    assert connection_module._database is None
 def test_i_can_get_mongodb_client():
  """After connecting, get_mongodb_client returns the client that was created."""
  client = Mock()
  database = Mock()
  client.__getitem__ = Mock(return_value=database)
  client_patch = patch('app.database.connection.MongoClient', return_value=client)
  url_patch = patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017")
  name_patch = patch('app.database.connection.get_mongodb_database_name', return_value="testdb")
  with client_patch, url_patch, name_patch:
    # Reset the module-level globals before connecting
    import app.database.connection as connection_module
    connection_module._client = None
    connection_module._database = None
    # Establish the connection first so a client exists
    get_database()
    obtained_client = get_mongodb_client()
    assert obtained_client == client
 def test_i_can_get_none_mongodb_client_when_not_connected():
  """With both module globals cleared, get_mongodb_client returns None."""
  import app.database.connection as connection_module
  # Put the module in its "not connected" state
  connection_module._client = None
  connection_module._database = None
  obtained_client = get_mongodb_client()
  assert obtained_client is None
 def test_i_can_test_database_connection_success():
  """The health check returns True and pings the database from get_database."""
  database = Mock()
  database.command.return_value = True
  with patch('app.database.connection.get_database', return_value=database):
    is_healthy = test_database_connection()
    assert is_healthy is True
    database.command.assert_called_with('ping')
 def test_i_can_close_connection_when_no_client():
  """Closing with no client set must not raise and leaves both globals as None."""
  import app.database.connection as connection_module
  # Put the module in its "not connected" state
  connection_module._client = None
  connection_module._database = None
  # The call itself is the check: any exception fails the test
  close_database_connection()
  assert connection_module._client is None
  assert connection_module._database is None
--- a/tests/test_document_content_repository.py
+++ b/tests/test_document_content_repository.py
@@ -1,311 +0,0 @@
 """
 Test suite for DocumentContentRepository with async/await support.
 This module contains comprehensive tests for all DocumentContentRepository methods
 using mongomock-motor for in-memory MongoDB testing.
 """
 import pytest
 import hashlib
 from datetime import datetime
 import pytest_asyncio
 from bson import ObjectId
 from pymongo.errors import DuplicateKeyError
 from mongomock_motor import AsyncMongoMockClient
 from app.database.repositories.document_content_repository import DocumentContentRepository
 from app.models.document import DocumentContent
@pytest_asyncio.fixture
 async def in_memory_repository():
  """Create an in-memory DocumentContentRepository for testing."""
  client = AsyncMongoMockClient()
  db = client.test_database
  repo = DocumentContentRepository(db)
  await repo.initialize()
  return repo
@pytest.fixture
 def sample_document_content():
  """Sample DocumentContent data for testing."""
  content = "This is sample document content for testing purposes."
  file_hash = hashlib.sha256(content.encode()).hexdigest()
  return DocumentContent(
    file_hash=file_hash,
    content=content,
    encoding="utf-8",
    file_size=len(content.encode()),
    mime_type="text/plain"
  )
@pytest.fixture
def another_document_content():
    """Build a second plain-text DocumentContent with different text (and thus a different hash)."""
    text = "This is another sample document with different content."
    raw = text.encode()
    return DocumentContent(
        file_hash=hashlib.sha256(raw).hexdigest(),
        content=text,
        encoding="utf-8",
        file_size=len(raw),
        mime_type="text/plain",
    )
 class TestDocumentContentRepositoryCreation:
     """Creation scenarios for DocumentContentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content):
         """A stored content comes back with the submitted fields and a generated id."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         assert stored is not None
         # Every submitted field must round-trip unchanged.
         for field_name in ("file_hash", "content", "encoding", "file_size", "mime_type"):
             assert getattr(stored, field_name) == getattr(sample_document_content, field_name)
         assert stored.id is not None

     @pytest.mark.asyncio
     async def test_i_cannot_create_document_content_with_duplicate_file_hash(
         self, in_memory_repository, sample_document_content
     ):
         """Inserting the same content twice raises DuplicateKeyError mentioning 'already exists'."""
         # First insert succeeds and occupies the file_hash.
         await in_memory_repository.create_document_content(sample_document_content)
         # Second insert of the very same content must be rejected.
         with pytest.raises(DuplicateKeyError) as raised:
             await in_memory_repository.create_document_content(sample_document_content)
         assert "already exists" in str(raised.value)
 class TestDocumentContentRepositoryFinding:
     """Lookup scenarios for DocumentContentRepository (by id and by file hash)."""

     @pytest.mark.asyncio
     async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content):
         """A stored content can be fetched again through the string form of its id."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         fetched = await in_memory_repository.find_document_content_by_id(str(stored.id))
         assert fetched is not None
         assert fetched.id == stored.id
         assert fetched.file_hash == stored.file_hash
         assert fetched.content == stored.content

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository):
         """A string that is not a valid ObjectId yields None."""
         fetched = await in_memory_repository.find_document_content_by_id("invalid_id")
         assert fetched is None

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository):
         """A well-formed ObjectId that was never stored yields None."""
         missing_id = str(ObjectId())
         fetched = await in_memory_repository.find_document_content_by_id(missing_id)
         assert fetched is None

     @pytest.mark.asyncio
     async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content):
         """A stored content can be fetched again through its file hash."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         fetched = await in_memory_repository.find_document_content_by_file_hash(
             sample_document_content.file_hash
         )
         assert fetched is not None
         assert fetched.file_hash == stored.file_hash
         assert fetched.id == stored.id

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository):
         """A file hash that was never stored yields None."""
         fetched = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash")
         assert fetched is None
 class TestDocumentContentRepositoryUpdate:
     """Update scenarios for DocumentContentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content):
         """Updating several fields changes exactly those fields; id and file_hash stay put."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         changes = {
             "content": "Updated content for testing",
             "encoding": "utf-16",
             "mime_type": "text/html"
         }
         refreshed = await in_memory_repository.update_document_content(str(stored.id), changes)
         assert refreshed is not None
         assert refreshed.content == changes["content"]
         assert refreshed.encoding == changes["encoding"]
         assert refreshed.mime_type == changes["mime_type"]
         assert refreshed.id == stored.id
         # file_hash was not part of the update, so it must be unchanged.
         assert refreshed.file_hash == stored.file_hash

     @pytest.mark.asyncio
     async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository):
         """Updating through a malformed id yields None."""
         outcome = await in_memory_repository.update_document_content("invalid_id", {"content": "test"})
         assert outcome is None

     @pytest.mark.asyncio
     async def test_i_can_update_document_content_with_partial_data(
         self, in_memory_repository, sample_document_content
     ):
         """Updating a single field leaves the remaining fields untouched."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         changes = {"encoding": "iso-8859-1"}
         refreshed = await in_memory_repository.update_document_content(str(stored.id), changes)
         assert refreshed is not None
         assert refreshed.encoding == "iso-8859-1"
         # Fields absent from the update keep their stored values.
         assert refreshed.content == stored.content
         assert refreshed.mime_type == stored.mime_type

     @pytest.mark.asyncio
     async def test_i_can_update_document_content_with_empty_data(
         self, in_memory_repository, sample_document_content
     ):
         """An empty update returns the content as currently stored."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         no_changes = {}
         outcome = await in_memory_repository.update_document_content(str(stored.id), no_changes)
         assert outcome is not None
         assert outcome.content == stored.content
         assert outcome.encoding == stored.encoding
         assert outcome.mime_type == stored.mime_type
 class TestDocumentContentRepositoryDeletion:
     """Deletion scenarios for DocumentContentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content):
         """Deleting a stored content returns True and the content can no longer be found."""
         stored = await in_memory_repository.create_document_content(sample_document_content)
         stored_id = str(stored.id)
         was_deleted = await in_memory_repository.delete_document_content(stored_id)
         assert was_deleted is True
         # A follow-up lookup confirms the record is really gone.
         leftover = await in_memory_repository.find_document_content_by_id(stored_id)
         assert leftover is None

     @pytest.mark.asyncio
     async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository):
         """Deleting through a malformed id returns False."""
         outcome = await in_memory_repository.delete_document_content("invalid_id")
         assert outcome is False

     @pytest.mark.asyncio
     async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository):
         """Deleting a well-formed but unknown id returns False."""
         missing_id = str(ObjectId())
         outcome = await in_memory_repository.delete_document_content(missing_id)
         assert outcome is False
 class TestDocumentContentRepositoryUtilities:
     """Existence check, listing and counting on DocumentContentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content):
         """content_exists is True for a stored hash and False for an unknown one."""
         await in_memory_repository.create_document_content(sample_document_content)
         known = await in_memory_repository.content_exists(sample_document_content.file_hash)
         unknown = await in_memory_repository.content_exists("nonexistent_hash")
         assert known is True
         assert unknown is False

     @pytest.mark.asyncio
     async def test_i_can_list_document_contents(
         self, in_memory_repository, sample_document_content, another_document_content
     ):
         """Listing returns every stored content, and honours skip/limit when given."""
         for item in (sample_document_content, another_document_content):
             await in_memory_repository.create_document_content(item)
         everything = await in_memory_repository.list_document_contents()
         first_page = await in_memory_repository.list_document_contents(skip=0, limit=1)
         assert len(everything) == 2
         assert len(first_page) == 1
         assert all(isinstance(entry, DocumentContent) for entry in everything)

     @pytest.mark.asyncio
     async def test_i_can_count_document_contents(
         self, in_memory_repository, sample_document_content, another_document_content
     ):
         """The count grows by two after two contents are stored."""
         count_before = await in_memory_repository.count_document_contents()
         for item in (sample_document_content, another_document_content):
             await in_memory_repository.create_document_content(item)
         count_after = await in_memory_repository.count_document_contents()
         assert count_after == count_before + 2
--- a/tests/test_document_repository.py
+++ b/tests/test_document_repository.py
@@ -1,566 +0,0 @@
 """
 Test suite for FileDocumentRepository with async/await support.
 This module contains comprehensive tests for all FileDocumentRepository methods
 using mongomock-motor for in-memory MongoDB testing.
 """
 import pytest
 from datetime import datetime
 from typing import Dict, Any
 import pytest_asyncio
 from bson import ObjectId
 from pymongo.errors import DuplicateKeyError, PyMongoError
 from mongomock_motor import AsyncMongoMockClient
 from app.database.repositories.document_repository import FileDocumentRepository
 from app.models.document import FileDocument, FileType
@pytest_asyncio.fixture
async def in_memory_repository():
    """Provide an initialized FileDocumentRepository bound to a mongomock database."""
    mock_client = AsyncMongoMockClient()
    repository = FileDocumentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
@pytest.fixture
def sample_file_document():
    """Build a single PDF FileDocument detected at the current time."""
    fields = {
        "filename": "test_document.pdf",
        "filepath": "/path/to/test_document.pdf",
        "file_hash": "a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        "file_type": FileType("pdf"),
        "detected_at": datetime.now(),
    }
    return FileDocument(**fields)
@pytest.fixture
def sample_update_data():
    """Build an update payload that changes both the metadata and the file type."""
    payload = dict(
        metadata={"tags": ["updated", "document"]},
        file_type=FileType("txt"),
    )
    return payload
@pytest.fixture
def multiple_sample_documents():
    """Build three FileDocument objects for the list and name-search tests.

    Two of them have "document" in their filename and one does not. All three
    share the same detected_at timestamp and have distinct filepaths and hashes.
    """
    base_time = datetime.now()
    return [
        FileDocument(
            filename="document1.pdf",
            filepath="/path/to/document1.pdf",
            file_hash="hash1" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            filename="similar_document.pdf",
            filepath="/path/to/similar_document.pdf",
            file_hash="hash2" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            filename="completely_different.txt",
            filepath="/path/to/completely_different.txt",
            file_hash="hash3" + "0" * 58,
            # The file type now matches the .txt extension (was mislabelled "pdf").
            file_type=FileType("txt"),
            detected_at=base_time,
        ),
    ]
 class TestFileDocumentRepositoryInitialization:
     """Initialization scenarios for FileDocumentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_initialize_repository(self):
         """initialize() completes and leaves the repository with a db and a collection."""
         # Arrange
         mock_client = AsyncMongoMockClient()
         repository = FileDocumentRepository(mock_client.test_database)
         # Act (must not raise)
         await repository.initialize()
         # Assert
         assert repository.db is not None
         assert repository.collection is not None
         # TODO: check that the indexes are created
 class TestFileDocumentRepositoryCreation:
     """Creation scenarios for FileDocumentRepository."""

     @pytest.mark.asyncio
     async def test_i_can_create_document(self, in_memory_repository, sample_file_document):
         """A stored document comes back with the submitted fields and an ObjectId id."""
         stored = await in_memory_repository.create_document(sample_file_document)
         assert stored is not None
         # Every submitted field must round-trip unchanged.
         for field_name in ("filename", "filepath", "file_hash", "file_type"):
             assert getattr(stored, field_name) == getattr(sample_file_document, field_name)
         assert stored.id is not None
         assert isinstance(stored.id, ObjectId)

     @pytest.mark.asyncio
     async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document):
         """A document submitted with id=None still receives a generated ObjectId."""
         sample_file_document.id = None
         stored = await in_memory_repository.create_document(sample_file_document)
         assert stored is not None
         assert stored.id is not None
         assert isinstance(stored.id, ObjectId)

     @pytest.mark.asyncio
     async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document):
         """A second document reusing an existing filepath raises DuplicateKeyError.

         The clashing document has a different filename and a different hash;
         only the filepath is shared with the one already stored.
         """
         await in_memory_repository.create_document(sample_file_document)
         clashing_doc = FileDocument(
             filename="different_name.pdf",
             filepath=sample_file_document.filepath,
             file_hash="different_hash" + "0" * 58,
             file_type=FileType("pdf"),
             detected_at=datetime.now()
         )
         with pytest.raises(DuplicateKeyError) as raised:
             await in_memory_repository.create_document(clashing_doc)
         assert "already exists" in str(raised.value)

     @pytest.mark.asyncio
     async def test_i_cannot_create_document_with_pymongo_error(
         self, in_memory_repository, sample_file_document, mocker
     ):
         """A PyMongoError raised by insert_one surfaces as a ValueError."""
         # Force the underlying insert to fail.
         mocker.patch.object(
             in_memory_repository.collection,
             'insert_one',
             side_effect=PyMongoError("Database error"),
         )
         with pytest.raises(ValueError) as raised:
             await in_memory_repository.create_document(sample_file_document)
         assert "Failed to create file document" in str(raised.value)
 class TestFileDocumentRepositoryFinding:
     """Lookup scenarios for FileDocumentRepository (by id, hash and filepath)."""

     @pytest.mark.asyncio
     async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
         """A stored document can be fetched again through the string form of its id."""
         stored = await in_memory_repository.create_document(sample_file_document)
         fetched = await in_memory_repository.find_document_by_id(str(stored.id))
         assert fetched is not None
         assert fetched.id == stored.id
         assert fetched.filename == stored.filename
         assert fetched.file_hash == stored.file_hash

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
         """A string that is not a valid ObjectId yields None."""
         fetched = await in_memory_repository.find_document_by_id("invalid_id")
         assert fetched is None

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
         """A well-formed ObjectId that was never stored yields None."""
         missing_id = str(ObjectId())
         fetched = await in_memory_repository.find_document_by_id(missing_id)
         assert fetched is None

     @pytest.mark.asyncio
     async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document):
         """A stored document can be fetched again through its file hash."""
         stored = await in_memory_repository.create_document(sample_file_document)
         fetched = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)
         assert fetched is not None
         assert fetched.file_hash == stored.file_hash
         assert fetched.id == stored.id

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository):
         """A hash that was never stored yields None."""
         fetched = await in_memory_repository.find_document_by_hash("nonexistent_hash")
         assert fetched is None

     @pytest.mark.asyncio
     async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
         """A stored document can be fetched again through its exact filepath."""
         stored = await in_memory_repository.create_document(sample_file_document)
         fetched = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)
         assert fetched is not None
         assert fetched.filepath == stored.filepath
         assert fetched.id == stored.id

     @pytest.mark.asyncio
     async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
         """A filepath that was never stored yields None."""
         fetched = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf")
         assert fetched is None
 class TestFileDocumentRepositoryFuzzySearch:
     """Filename search scenarios for FileDocumentRepository.find_document_by_name."""

     @pytest.mark.asyncio
     async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents):
         """Searching for a full filename returns exactly that one document."""
         for sample in multiple_sample_documents:
             await in_memory_repository.create_document(sample)
         matches = await in_memory_repository.find_document_by_name("document1.pdf")
         assert len(matches) == 1
         assert matches[0].filename == "document1.pdf"

     @pytest.mark.asyncio
     async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents):
         """Searching for a partial name returns at least the two 'document' files."""
         for sample in multiple_sample_documents:
             await in_memory_repository.create_document(sample)
         matches = await in_memory_repository.find_document_by_name("document")
         # Expect document1.pdf and similar_document.pdf among the results.
         assert len(matches) >= 2
         matched_names = [match.filename for match in matches]
         assert "document1.pdf" in matched_names
         assert "similar_document.pdf" in matched_names

     @pytest.mark.asyncio
     async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
         """A PyMongoError raised by find results in an empty list."""
         mocker.patch.object(
             in_memory_repository.collection,
             'find',
             side_effect=PyMongoError("Database error"),
         )
         matches = await in_memory_repository.find_document_by_name("test")
         assert matches == []
 class TestFileDocumentRepositoryListing:
     """Listing scenarios for FileDocumentRepository.list_documents."""

     @pytest.mark.asyncio
     async def test_i_can_list_documents_with_default_pagination(
         self, in_memory_repository, multiple_sample_documents
     ):
         """Without arguments, listing returns every stored document as FileDocument."""
         for sample in multiple_sample_documents:
             await in_memory_repository.create_document(sample)
         listed = await in_memory_repository.list_documents()
         assert len(listed) == len(multiple_sample_documents)
         assert all(isinstance(entry, FileDocument) for entry in listed)

     @pytest.mark.asyncio
     async def test_i_can_list_documents_with_custom_pagination(
         self, in_memory_repository, multiple_sample_documents
     ):
         """skip/limit split the three stored documents into pages of two and one."""
         for sample in multiple_sample_documents:
             await in_memory_repository.create_document(sample)
         first_page = await in_memory_repository.list_documents(skip=0, limit=2)
         second_page = await in_memory_repository.list_documents(skip=2, limit=2)
         assert len(first_page) == 2
         # Only three documents exist, so the second page holds the remaining one.
         assert len(second_page) == 1
         # No document may appear on both pages.
         first_ids = {entry.id for entry in first_page}
         second_ids = {entry.id for entry in second_page}
         assert first_ids.isdisjoint(second_ids)

     @pytest.mark.asyncio
     async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document):
         """Listing returns the most recently detected document first."""
         from datetime import timedelta
         # Two copies of the sample, two hours apart.
         older = sample_file_document.model_copy()
         older.filename = "oldest.pdf"
         older.filepath = f"/path/to/{older.filename}"
         older.file_hash = "hash1" + "0" * 58
         older.detected_at = datetime.now() - timedelta(hours=2)
         newer = sample_file_document.model_copy()
         newer.filename = "newest.pdf"
         newer.filepath = f"/path/to/{newer.filename}"
         newer.file_hash = "hash2" + "0" * 58
         newer.detected_at = datetime.now()
         # Insert the older one first so insertion order differs from the expected order.
         await in_memory_repository.create_document(older)
         await in_memory_repository.create_document(newer)
         listed = await in_memory_repository.list_documents()
         assert len(listed) == 2
         assert listed[0].filename == "newest.pdf"
         assert listed[1].filename == "oldest.pdf"

     @pytest.mark.asyncio
     async def test_i_can_list_empty_documents(self, in_memory_repository):
         """Listing an empty collection returns an empty list."""
         listed = await in_memory_repository.list_documents()
         assert listed == []

     @pytest.mark.asyncio
     async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
         """A PyMongoError raised by find results in an empty list."""
         mocker.patch.object(
             in_memory_repository.collection,
             'find',
             side_effect=PyMongoError("Database error"),
         )
         listed = await in_memory_repository.list_documents()
         assert listed == []
 class TestFileDocumentRepositoryUpdate:
     """Update scenarios for FileDocumentRepository.update_document."""

     @pytest.mark.asyncio
     async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                                       sample_update_data):
         """Updating applies every field of the payload and leaves the others alone."""
         # Arrange
         created_doc = await in_memory_repository.create_document(sample_file_document)
         # Act
         updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)
         # Assert
         assert updated_doc is not None
         assert updated_doc.file_type == sample_update_data["file_type"]
         # The payload also carries metadata; verify it was applied too.
         assert updated_doc.metadata == sample_update_data["metadata"]
         assert updated_doc.id == created_doc.id
         assert updated_doc.filename == created_doc.filename  # Unchanged fields remain

     @pytest.mark.asyncio
     async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
         """Updating a single field leaves the remaining fields untouched."""
         # Arrange
         created_doc = await in_memory_repository.create_document(sample_file_document)
         partial_update = {"file_type": FileType("txt")}
         # Act
         updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update)
         # Assert
         assert updated_doc is not None
         assert updated_doc.file_type == FileType("txt")
         assert updated_doc.filename == created_doc.filename  # Should remain unchanged
         assert updated_doc.filepath == created_doc.filepath  # Should remain unchanged

     @pytest.mark.asyncio
     async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
         """A None value in the payload does not overwrite the stored field."""
         # Arrange
         created_doc = await in_memory_repository.create_document(sample_file_document)
         update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None}
         # Act
         updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none)
         # Assert
         assert updated_doc is not None
         assert updated_doc.metadata == {"tags": ["updated", "document"]}
         assert updated_doc.file_type == created_doc.file_type  # Should remain unchanged (None filtered out)

     @pytest.mark.asyncio
     async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
         """An empty update returns the document as currently stored."""
         # Arrange
         created_doc = await in_memory_repository.create_document(sample_file_document)
         empty_update = {}
         # Act
         result = await in_memory_repository.update_document(str(created_doc.id), empty_update)
         # Assert
         assert result is not None
         assert result.filename == created_doc.filename
         assert result.file_hash == created_doc.file_hash
         assert result.metadata == created_doc.metadata

     @pytest.mark.asyncio
     async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
         """Updating through a malformed id yields None."""
         # Act
         result = await in_memory_repository.update_document("invalid_id", sample_update_data)
         # Assert
         assert result is None

     @pytest.mark.asyncio
     async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
         """Updating a well-formed but unknown id yields None."""
         # Arrange
         nonexistent_id = str(ObjectId())
         # Act
         result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)
         # Assert
         assert result is None

     @pytest.mark.asyncio
     async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                                sample_update_data, mocker):
         """A PyMongoError raised by find_one_and_update results in None."""
         # Arrange
         created_doc = await in_memory_repository.create_document(sample_file_document)
         mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                             side_effect=PyMongoError("Database error"))
         # Act
         result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)
         # Assert
         assert result is None
 class TestFileDocumentRepositoryDeletion:
     """Deletion scenarios for FileDocumentRepository.delete_document."""

     @pytest.mark.asyncio
     async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
         """Deleting a stored document returns True and the document can no longer be found."""
         stored = await in_memory_repository.create_document(sample_file_document)
         stored_id = str(stored.id)
         was_deleted = await in_memory_repository.delete_document(stored_id)
         assert was_deleted is True
         # A follow-up lookup confirms the record is really gone.
         leftover = await in_memory_repository.find_document_by_id(stored_id)
         assert leftover is None

     @pytest.mark.asyncio
     async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
         """Deleting through a malformed id returns False."""
         outcome = await in_memory_repository.delete_document("invalid_id")
         assert outcome is False

     @pytest.mark.asyncio
     async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
         """Deleting a well-formed but unknown id returns False."""
         missing_id = str(ObjectId())
         outcome = await in_memory_repository.delete_document(missing_id)
         assert outcome is False

     @pytest.mark.asyncio
     async def test_i_cannot_delete_document_with_pymongo_error(
         self, in_memory_repository, sample_file_document, mocker
     ):
         """A PyMongoError raised by delete_one results in False."""
         stored = await in_memory_repository.create_document(sample_file_document)
         mocker.patch.object(
             in_memory_repository.collection,
             'delete_one',
             side_effect=PyMongoError("Database error"),
         )
         outcome = await in_memory_repository.delete_document(str(stored.id))
         assert outcome is False
 class TestFileDocumentRepositoryUtilities:
     """Counting scenarios for FileDocumentRepository.count_documents."""

     @pytest.mark.asyncio
     async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
         """The count grows by one after one document is stored."""
         count_before = await in_memory_repository.count_documents()
         await in_memory_repository.create_document(sample_file_document)
         count_after = await in_memory_repository.count_documents()
         assert count_after == count_before + 1

     @pytest.mark.asyncio
     async def test_i_can_count_zero_documents(self, in_memory_repository):
         """An empty collection counts zero documents."""
         total = await in_memory_repository.count_documents()
         assert total == 0
--- a/tests/test_document_service.py
+++ b/tests/test_document_service.py
@@ -1,697 +0,0 @@
 """
 Unit tests for DocumentService using in-memory MongoDB.
 Tests the orchestration logic with real MongoDB operations
 using mongomock for better integration testing.
 """
 import pytest
 import pytest_asyncio
 from unittest.mock import Mock, patch
 from datetime import datetime
 from bson import ObjectId
 from pathlib import Path
 from mongomock_motor import AsyncMongoMockClient
 from app.services.document_service import DocumentService
 from app.database.repositories.document_repository import FileDocumentRepository
 from app.database.repositories.document_content_repository import DocumentContentRepository
 from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
 from app.models.types import PyObjectId
@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Provide an initialized FileDocumentRepository bound to a mongomock database."""
    mock_client = AsyncMongoMockClient()
    repository = FileDocumentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Provide an initialized DocumentContentRepository bound to a mongomock database."""
    mock_client = AsyncMongoMockClient()
    repository = DocumentContentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide the `test_database` handle of a fresh mongomock client."""
    mock_client = AsyncMongoMockClient()
    database = mock_client.test_database
    return database
@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
    """Build a DocumentService whose repositories are the in-memory test doubles.

    ``get_database`` is patched only while the service is constructed and its
    repositories are swapped; the patch is released before the service is
    handed to the test.
    """
    database_patch = patch('app.services.document_service.get_database', return_value=in_memory_database)
    with database_patch:
        service_under_test = DocumentService()
        # Replace whatever the constructor wired up with the in-memory repositories
        service_under_test.file_repository = in_memory_file_repository
        service_under_test.content_repository = in_memory_content_repository
    return service_under_test
@pytest.fixture
def sample_file_bytes():
    """Raw bytes standing in for the contents of a PDF file."""
    pdf_payload = b"This is a test PDF content"
    return pdf_payload
@pytest.fixture
def sample_text_bytes():
    """Raw bytes standing in for the contents of a plain-text file."""
    text_payload = b"This is a test text file content"
    return text_payload
@pytest.fixture
def sample_file_hash(sample_file_bytes):
    """Expected SHA-256 hex digest of the ``sample_file_bytes`` fixture.

    The digest is computed from the fixture value itself rather than from a
    second copy of the literal, so the bytes and their expected hash cannot
    drift apart if the sample content is ever edited.
    """
    # Local import: hashlib is not among the module-level imports of this file.
    import hashlib
    return hashlib.sha256(sample_file_bytes).hexdigest()
@pytest.fixture
def sample_file_document():
    """A ready-made FileDocument describing ``/test/test.pdf`` with a fixed detection time."""
    detection_time = datetime(2024, 1, 15, 10, 30, 0)
    document = FileDocument(
        id=ObjectId(),
        filename="test.pdf",
        filepath="/test/test.pdf",
        file_type=FileType.PDF,
        extraction_method=None,
        metadata={},
        detected_at=detection_time,
        file_hash="test_hash"
    )
    return document
 class TestCreateDocument:
    """Tests for DocumentService.create_document.

    Stacked ``@patch`` decorators inject their mocks bottom-up, so the mock
    for the decorator closest to the function is the first mock argument.
    """
    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Creating a document for unseen bytes stores both the document and its content."""
        # Setup mocks: freeze the clock and force the detected MIME type
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"
        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)
        # Verify content was created and mirrors the inputs
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"
    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Two documents with identical bytes share a single content record (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"
        # Create first document
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Create second document with same content
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Verify both documents exist but share same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath
        # Verify only one content document exists for that hash
        all_content = await document_service.content_repository.list_document_content()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Each created document's content records the encoding it was created with."""
        # Setup
        mock_magic.return_value = "text/plain"
        # Test with different encodings
        encodings = ["utf-8", "latin-1", "ascii"]
        for i, encoding in enumerate(encodings):
            # Content is looked up by hash and identical bytes are expected to
            # share one content record (see the existing-content test above),
            # so every iteration needs its own bytes; otherwise the lookup
            # would return the record stored for the first encoding.
            unique_bytes = sample_text_bytes + bytes(str(i), 'utf-8')
            result = await document_service.create_document(
                f"/test/test{i}.txt",
                unique_bytes,
                encoding
            )
            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT
            # Verify content has correct encoding
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content is not None
            assert content.encoding == encoding
    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """An unsupported file extension makes create_document raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )
    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """An empty file path makes create_document raise ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """A zero-length file is accepted and stored with a file size of 0."""
        # Setup
        mock_magic.return_value = "text/plain"
        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )
        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_size == 0
 class TestGetMethods:
    """Lookup tests: fetching documents by id, hash, path, and together with content."""
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A stored document can be fetched back through its id."""
        # Arrange: force the MIME type and store one document
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Act
        fetched = await document_service.get_document_by_id(stored.id)
        # Assert
        assert fetched is not None
        assert fetched.id == stored.id
        assert fetched.filename == stored.filename
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A stored document can be fetched back through its file hash."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Act
        fetched = await document_service.get_document_by_hash(stored.file_hash)
        # Assert
        assert fetched is not None
        assert fetched.file_hash == stored.file_hash
        assert fetched.filename == stored.filename
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A stored document can be fetched back through its file path."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"
        stored = await document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )
        # Act
        fetched = await document_service.get_document_by_filepath(test_path)
        # Assert
        assert fetched is not None
        assert fetched.filepath == test_path
        assert fetched.id == stored.id
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Fetching with content yields a (document, content) pair for the stored id."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Act
        pair = await document_service.get_document_with_content(stored.id)
        # Assert: the pair exists and both halves match what was stored
        assert pair is not None
        fetched_document, fetched_content = pair
        assert fetched_document.id == stored.id
        assert fetched_content is not None
        assert fetched_content.file_hash == stored.file_hash
    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Looking up a freshly generated, never-stored ObjectId yields None."""
        # Act
        missing = await document_service.get_document_by_id(ObjectId())
        # Assert
        assert missing is None
    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Looking up a hash that was never stored yields None."""
        # Act
        missing = await document_service.get_document_by_hash("nonexistent_hash")
        # Assert
        assert missing is None
 class TestPaginationAndCounting:
    """Listing-with-pagination and counting tests."""
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """With five stored documents, skip=1/limit=2 returns two and the total stays five."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        for i in range(5):
            # A per-iteration suffix gives every file distinct content
            distinct_bytes = sample_file_bytes + bytes(str(i), 'utf-8')
            await document_service.create_document(
                f"/test/test{i}.pdf",
                distinct_bytes,
                "utf-8"
            )
        # Act: request one page
        page = await document_service.list_documents(skip=1, limit=2)
        # Assert: page size honours the limit
        assert len(page) == 2
        # The overall count is independent of pagination
        stored_total = await document_service.count_documents()
        assert stored_total == 5
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """The count starts at zero and reflects the three documents created afterwards."""
        # Arrange
        mock_magic.return_value = "text/plain"
        # Nothing has been stored yet
        count_before = await document_service.count_documents()
        assert count_before == 0
        for i in range(3):
            distinct_bytes = sample_file_bytes + bytes(str(i), 'utf-8')
            await document_service.create_document(
                f"/test/test{i}.txt",
                distinct_bytes,
                "utf-8"
            )
        # Act
        count_after = await document_service.count_documents()
        # Assert
        assert count_after == 3
 class TestUpdateAndDelete:
    """Update and deletion tests, including cleanup of content records."""
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Updating a document's metadata returns the document with the new metadata."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Act
        changes = {"metadata": {"page_count": 5}}
        updated = await document_service.update_document(stored.id, changes)
        # Assert
        assert updated is not None
        assert updated.metadata.get("page_count") == 5
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting the only document for a hash also removes its content record."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        content_repository = document_service.content_repository
        # Sanity check: the content record is present before deletion
        content_before = await content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_before is not None
        # Act
        was_deleted = await document_service.delete_document(stored.id)
        # Assert: both the document and its content are gone
        assert was_deleted is True
        lookup_after = await document_service.get_document_by_id(stored.id)
        assert lookup_after is None
        content_after = await content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_after is None
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting one of two documents sharing a hash keeps the other document and the content."""
        # Arrange: two documents built from the same bytes
        mock_magic.return_value = "application/pdf"
        first = await document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )
        second = await document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Identical bytes must give identical hashes
        assert first.file_hash == second.file_hash
        # Act: remove only the first document
        was_deleted = await document_service.delete_document(first.id)
        assert was_deleted is True
        # Assert: first is gone, second survives, content is still referenced
        lookup_first = await document_service.get_document_by_id(first.id)
        assert lookup_first is None
        lookup_second = await document_service.get_document_by_id(second.id)
        assert lookup_second is not None
        shared_content = await document_service.content_repository.find_document_content_by_file_hash(
            second.file_hash
        )
        assert shared_content is not None
 class TestUtilityMethods:
    """Tests for the content-existence check and the content update helper."""
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """content_exists is False for an unknown hash and True once a document is stored."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        # An arbitrary hash that was never stored must not be reported
        unknown_hash = "nonexistent_hash"
        found_unknown = await document_service.content_exists(unknown_hash)
        assert found_unknown is False
        # Store a document so that its hash becomes known
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # The stored document's hash must now be reported
        found_stored = await document_service.content_exists(stored.file_hash)
        assert found_stored is True
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Updating content by hash returns the new text and persists it."""
        # Arrange
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )
        # Act
        replacement_text = "Updated extracted content"
        updated = await document_service.update_document_content(
            stored.file_hash,
            replacement_text
        )
        # Assert: the returned record carries the new text
        assert updated is not None
        assert updated.content == replacement_text
        # Assert: a fresh lookup sees the same text
        reloaded = await document_service.content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert reloaded.content == replacement_text
 class TestHashCalculation:
    """Tests for the service's private file-hash helper."""
    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        payload = b"Test content for hashing"
        # Two independent computations over the same input
        first_digest = document_service._calculate_file_hash(payload)
        second_digest = document_service._calculate_file_hash(payload)
        # Both runs must agree
        assert first_digest == second_digest
        assert len(first_digest) == 64  # hex form of a SHA256 digest is 64 characters
    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Distinct inputs must not hash to the same digest."""
        first_payload = b"First content"
        second_payload = b"Second content"
        first_digest = document_service._calculate_file_hash(first_payload)
        second_digest = document_service._calculate_file_hash(second_payload)
        assert first_digest != second_digest
 class TestFileTypeDetection:
    """Tests for the service's private extension-based file type detection."""
    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as FileType.PDF."""
        detected = document_service._detect_file_type("/path/to/document.pdf")
        assert detected == FileType.PDF
    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as FileType.TXT."""
        detected = document_service._detect_file_type("/path/to/document.txt")
        assert detected == FileType.TXT
    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as FileType.DOCX."""
        detected = document_service._detect_file_type("/path/to/document.docx")
        assert detected == FileType.DOCX
    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unknown extension raises ValueError mentioning the unsupported type."""
        unsupported_path = "/path/to/document.xyz"
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unsupported_path)
--- a/tests/utils/init.py
+++ b/tests/utils/init.py
--- a/tests/utils/test_document_matching.py
+++ b/tests/utils/test_document_matching.py
@@ -14,6 +14,8 @@ def get_doc(filename: str = None):
    file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
    file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"),
    detected_at=datetime.now(),
    file_size=1024,
    mime_type="application/pdf"
  )
--- a/tests/utils/test_security.py
+++ b/tests/utils/test_security.py