diff --git a/Readme.md b/Readme.md index fc03512..88a9cb2 100644 --- a/Readme.md +++ b/Readme.md @@ -103,17 +103,22 @@ MyDocManager/ │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── user.py # User Pydantic models -│ │ │ │ └── auth.py # Auth Pydantic models +│ │ │ │ ├── auth.py # Auth Pydantic models +│ │ │ │ ├── document.py # Document Pydantic models +│ │ │ │ ├── job.py # Job Processing Pydantic models +│ │ │ │ └── types.py # PyObjectId and other useful types │ │ │ ├── database/ │ │ │ │ ├── __init__.py │ │ │ │ ├── connection.py # MongoDB connection │ │ │ │ └── repositories/ │ │ │ │ ├── __init__.py -│ │ │ │ └── user_repository.py # User CRUD operations +│ │ │ │ ├── user_repository.py # User CRUD operations +│ │ │ │ └── document_repository.py # User CRUD operations │ │ │ ├── services/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auth_service.py # JWT & password logic │ │ │ │ ├── user_service.py # User business logic +│ │ │ │ ├── document_service.py # Document business logic │ │ │ │ └── init_service.py # Admin creation at startup │ │ │ ├── api/ │ │ │ │ ├── __init__.py @@ -125,7 +130,7 @@ MyDocManager/ │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── security.py # Password utilities -│ │ │ └── exceptions.py # Custom exceptions +│ │ │ └── document_matching.py # Fuzzy matching Algorithms │ ├── worker/ │ │ ├── Dockerfile │ │ ├── requirements.txt @@ -224,78 +229,76 @@ On first startup, the application automatically creates a default admin user: #### Files Collection -Stores file metadata and extracted content: +Stores file metadata and extracted content using Pydantic models: -```json -{ - "_id": "ObjectId", - "filename": "document.pdf", - "filepath": "/watched_files/document.pdf", - "file_type": "pdf", - "extraction_method": "direct_text", // direct_text, ocr, hybrid - "metadata": { - "page_count": 15, // for PDFs - "word_count": 250, // for text files - "image_dimensions": { // for images - "width": 1920, - "height": 1080 - } - }, - "detected_at": "2024-01-15T10:29:00Z", - 
"file_hash": "sha256_hash_value" -} -``` -#### Document Contents Collection +```python +class FileDocument(BaseModel): + """ + Model for file documents stored in the 'files' collection. -Stores actual file content and technical metadata: -```json -{ - "_id": "ObjectId", - "file_hash": "sha256_hash_value", - "content": "extracted text content...", - "encoding": "utf-8", - "file_size": 2048576, - "mime_type": "application/pdf" -} + Represents a file detected in the watched directory with its + metadata and extracted content. + """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + filename: str = Field(..., description="Original filename") + filepath: str = Field(..., description="Full path to the file") + file_type: FileType = Field(..., description="Type of the file") + extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content") + metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") + detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") + file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") + encoding: str = Field(default="utf-8", description="Character encoding for text files") + file_size: int = Field(..., ge=0, description="File size in bytes") + mime_type: str = Field(..., description="MIME type detected") + + @field_validator('filepath') + @classmethod + def validate_filepath(cls, v: str) -> str: + """Validate filepath format.""" + if not v.strip(): + raise ValueError("Filepath cannot be empty") + return v.strip() + + @field_validator('filename') + @classmethod + def validate_filename(cls, v: str) -> str: + """Validate filename format.""" + if not v.strip(): + raise ValueError("Filename cannot be empty") + return v.strip() ``` #### Processing Jobs Collection Tracks processing status and lifecycle: -```json -{ - "_id": "ObjectId", - "file_id": 
"reference_to_files_collection", - "status": "completed", - // pending, processing, completed, failed - "task_id": "celery_task_uuid", - "created_at": "2024-01-15T10:29:00Z", - "started_at": "2024-01-15T10:29:30Z", - "completed_at": "2024-01-15T10:30:00Z", - "error_message": null -} +```python +class ProcessingJob(BaseModel): + """ + Model for processing jobs stored in the 'processing_jobs' collection. + + Tracks the lifecycle and status of document processing tasks. + """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + file_id: PyObjectId = Field(..., description="Reference to file document") + status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status") + task_id: Optional[str] = Field(default=None, description="Celery task UUID") + created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created") + started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started") + completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed") + error_message: Optional[str] = Field(default=None, description="Error message if processing failed") + + @field_validator('error_message') + @classmethod + def validate_error_message(cls, v: Optional[str]) -> Optional[str]: + """Clean up error message.""" + if v is not None: + return v.strip() if v.strip() else None + return v ``` -### Data Storage Strategy - -- **Choice**: Three separate collections for files, content, and processing status -- **Rationale**: Normalization prevents content duplication when multiple files have identical content -- **Benefits**: - - Content deduplication via SHA256 hash - - Better query performance for metadata vs content searches - - Clear separation of concerns between file metadata, content, and processing lifecycle - - Multiple files can reference the same content (e.g., identical copies in different locations) - -### Content 
Storage Location - -- **Choice**: Store extracted content in separate `document_contents` collection -- **Rationale**: Content normalization and deduplication -- **Benefits**: - - Single content storage per unique file hash - - Multiple file entries can reference same content - - Efficient storage for duplicate files - ### Supported File Types (Initial Implementation) - **Text Files** (`.txt`): Direct content reading @@ -306,7 +309,7 @@ Tracks processing status and lifecycle: #### Watchdog Implementation -- **Choice**: Dedicated observer thread (Option A) +- **Choice**: Dedicated observer thread - **Rationale**: Standard approach, clean separation of concerns - **Implementation**: Watchdog observer runs in separate thread from FastAPI @@ -327,17 +330,17 @@ Tracks processing status and lifecycle: #### Content Storage Location -- **Choice**: Store extracted content in `files` collection -- **Rationale**: Content is intrinsic property of the file -- **Benefits**: Single query to get file + content, simpler data model +- **Choice**: Store files in the file system, using the SHA256 hash as filename +- **Rationale**: MongoDB is not meant for large files, better performance. Files remain in the file system for easy + access. ### Implementation Order 1. ✅ Pydantic models for MongoDB collections -2. ✅ Repository layer for data access (files + processing_jobs) -3. ✅ Celery tasks for document processing -4. ✅ Watchdog file monitoring implementation -5. ✅ FastAPI integration and startup coordination +2. UNDER PROGRESS : Repository layer for data access (files + processing_jobs) +3. TODO : Celery tasks for document processing +4. TODO : Watchdog file monitoring implementation +5. 
TODO : FastAPI integration and startup coordination ### Processing Pipeline Features @@ -347,87 +350,6 @@ Tracks processing status and lifecycle: - **Extensible Metadata**: Flexible metadata storage per file type - **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches -## Document Service Architecture - -### Service Overview - -The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories. - -### Service Design - -- **Architecture Pattern**: Service orchestration with separate repositories -- **Transaction Support**: MongoDB ACID transactions for data consistency -- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash -- **Error Handling**: MongoDB standard exceptions with transaction rollback - -### Document Service (`document_service.py`) - -Orchestrates operations between file and content repositories while maintaining data consistency. - -#### Core Functionality - -##### `create_document(file_path: str, file_bytes: bytes, encoding: str)` - -Creates a new document with automatic attribute calculation and content deduplication. - -**Automatic Calculations:** -- `file_hash`: SHA256 hash of file bytes -- `file_type`: Detection based on file extension -- `mime_type`: Detection via `python-magic` library -- `file_size`: Length of provided bytes -- `detected_at`: Current timestamp -- `metadata`: Empty dictionary (reserved for future extension) - -**Deduplication Logic:** -1. Calculate SHA256 hash of file content -2. Check if `DocumentContent` with this hash already exists -3. If EXISTS: Create only `FileDocument` referencing existing content -4. 
If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction - -**Transaction Flow:** -``` -BEGIN TRANSACTION - IF content_exists(file_hash): - CREATE FileDocument with content reference - ELSE: - CREATE DocumentContent - CREATE FileDocument with content reference -COMMIT TRANSACTION -``` - -#### Available Methods - -- `create_document(file_path, file_bytes, encoding)`: Create with deduplication -- `get_document_by_id(document_id)`: Retrieve by document ID -- `get_document_by_hash(file_hash)`: Retrieve by file hash -- `get_document_by_filepath(filepath)`: Retrieve by file path -- `list_documents(skip, limit)`: Paginated document listing -- `count_documents()`: Total document count -- `update_document(document_id, update_data)`: Update document metadata -- `delete_document(document_id)`: Remove document and orphaned content - -### Repository Dependencies - -The document service coordinates two existing repositories: - -#### File Repository (`file_repository.py`) -- `create_document()`, `find_document_by_id()`, `find_document_by_hash()` -- `find_document_by_filepath()`, `find_document_by_name()` -- `list_documents()`, `count_documents()` -- `update_document()`, `delete_document()` - -#### Document Content Repository (`document_content_repository.py`) -- `create_document_content()`, `find_document_content_by_id()` -- `find_document_content_by_file_hash()`, `content_exists()` -- `update_document_content()`, `delete_document_content()` -- `list_document_contents()`, `count_document_contents()` - -### Dependencies - -- `python-magic`: MIME type detection -- `hashlib`: SHA256 hashing (standard library) -- `pymongo`: MongoDB transactions support - ## Key Implementation Notes ### Python Standards @@ -483,21 +405,14 @@ The document service coordinates two existing repositories: ### Next Implementation Steps -1. ✅ Create docker-compose.yml with all services => Done -2. ✅ Define user management and authentication architecture => Done -3. 
✅ Implement user models and authentication services => - 1. models/user.py => Done - 2. models/auth.py => Done - 3. database/repositories/user_repository.py => Done -4. ✅ Add automatic admin user creation if it does not exists => Done -5. **IN PROGRESS**: Implement file processing pipeline => +1. **IN PROGRESS**: Implement file processing pipeline => 1. Create Pydantic models for files and processing_jobs collections 2. Implement repository layer for file and processing job data access 3. Create Celery tasks for document processing (.txt, .pdf, .docx) 4. Implement Watchdog file monitoring with dedicated observer 5. Integrate file watcher with FastAPI startup -6. Create protected API routes for user management -7. Build React monitoring interface with authentication +2. Create protected API routes for user management +3. Build React monitoring interface with authentication ## Annexes diff --git a/docker-compose.yml b/docker-compose.yml index 57b85e6..7cb68cb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,7 @@ services: MONGO_INITDB_ROOT_PASSWORD: password123 MONGO_INITDB_DATABASE: mydocmanager volumes: - - mongodb-data:/data/db + - ./volumes/db:/data/db networks: - mydocmanager-network @@ -38,6 +38,7 @@ services: volumes: - ./src/file-processor:/app - ./volumes/watched_files:/watched_files + - ./volumes/objects:/objects depends_on: - redis - mongodb diff --git a/src/file-processor/app/config/settings.py b/src/file-processor/app/config/settings.py index 68720c1..7f8dc7f 100644 --- a/src/file-processor/app/config/settings.py +++ b/src/file-processor/app/config/settings.py @@ -51,6 +51,15 @@ def get_jwt_secret_key() -> str: raise ValueError("JWT_SECRET environment variable must be set in production") return secret +def get_objects_folder() -> str: + """ + Get Vault path from environment variables. 
+ + Returns: + str: Vault path + """ + return os.getenv("OBJECTS_FOLDER", "/objects") + def get_jwt_algorithm() -> str: """ diff --git a/src/file-processor/app/database/connection.py b/src/file-processor/app/database/connection.py index bba8f82..919b385 100644 --- a/src/file-processor/app/database/connection.py +++ b/src/file-processor/app/database/connection.py @@ -10,7 +10,7 @@ from typing import Optional from pymongo import MongoClient from pymongo.database import Database from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError - +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase from app.config.settings import get_mongodb_url, get_mongodb_database_name # Global variables for singleton pattern @@ -18,7 +18,7 @@ _client: Optional[MongoClient] = None _database: Optional[Database] = None -def create_mongodb_client() -> MongoClient: +def create_mongodb_client() -> AsyncIOMotorClient: """ Create MongoDB client with connection validation. @@ -32,7 +32,7 @@ def create_mongodb_client() -> MongoClient: try: # Create client with short timeout for fail-fast behavior - client = MongoClient( + client = AsyncIOMotorClient( mongodb_url, serverSelectionTimeoutMS=5000, # 5 seconds timeout connectTimeoutMS=5000, @@ -107,6 +107,15 @@ def get_mongodb_client() -> Optional[MongoClient]: return _client +def get_extra_args(session): + # Build kwargs only if session is provided + kwargs = {} + if session is not None: + kwargs["session"] = session + + return kwargs + + def test_database_connection() -> bool: """ Test if database connection is working. 
@@ -122,4 +131,4 @@ def test_database_connection() -> bool: db.command('ping') return True except Exception: - return False \ No newline at end of file + return False diff --git a/src/file-processor/app/database/repositories/document_content_repository.py b/src/file-processor/app/database/repositories/document_content_repository.py deleted file mode 100644 index 16b2bd7..0000000 --- a/src/file-processor/app/database/repositories/document_content_repository.py +++ /dev/null @@ -1,214 +0,0 @@ -from typing import List, Optional -from datetime import datetime -from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection -from pymongo.errors import DuplicateKeyError, PyMongoError -from bson import ObjectId - -from app.models.document import DocumentContent - - -class DocumentContentRepository: - """ - Repository class for document content CRUD operations in MongoDB. - - This class handles all database operations related to document content, - following the repository pattern with dependency injection and async/await. - """ - - def __init__(self, database: AsyncIOMotorDatabase): - """ - Initialize repository with database dependency. - - Args: - database (AsyncIOMotorDatabase): MongoDB database instance - """ - self.db = database - self.collection: AsyncIOMotorCollection = database.document_contents - self._ensure_indexes() - - async def initialize(self): - """ - Initialize repository by ensuring required indexes exist. - - Should be called after repository instantiation to setup database indexes. - """ - await self._ensure_indexes() - - async def _ensure_indexes(self): - """ - Ensure required database indexes exist. - - Creates unique index on file_hash field to prevent duplicates. 
- """ - try: - await self.collection.create_index("file_hash", unique=True) - except PyMongoError: - # Index might already exist, ignore error - pass - - async def create_document_content(self, document_content: DocumentContent) -> DocumentContent: - """ - Create a new document content in the database. - - Args: - document_content (DocumentContent): Document content data - - Returns: - DocumentContent: Created document content with database ID - - Raises: - DuplicateKeyError: If file_hash already exists - ValueError: If document content creation fails due to validation - """ - document_dict = document_content.model_dump(by_alias=True, exclude_unset=True) - - # Remove _id if it's None to let MongoDB generate it - if document_dict.get("_id") is None: - document_dict.pop("_id", None) - - try: - result = await self.collection.insert_one(document_dict) - document_dict["_id"] = result.inserted_id - return DocumentContent(**document_dict) - except DuplicateKeyError as e: - raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}") - except PyMongoError as e: - raise ValueError(f"Failed to create document content: {e}") - - async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]: - """ - Find document content by ID. - - Args: - document_id (str): Document content ID to search for - - Returns: - DocumentContent or None: Document content if found, None otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return None - - document_doc = await self.collection.find_one({"_id": ObjectId(document_id)}) - if document_doc: - return DocumentContent(**document_doc) - return None - except PyMongoError: - return None - - async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]: - """ - Find document content by file hash. 
- - Args: - file_hash (str): File hash to search for - - Returns: - DocumentContent or None: Document content if found, None otherwise - """ - try: - document_doc = await self.collection.find_one({"file_hash": file_hash}) - if document_doc: - return DocumentContent(**document_doc) - return None - except PyMongoError: - return None - - async def content_exists(self, file_hash: str) -> bool: - """ - Check if document content exists by file hash. - - Args: - file_hash (str): File hash to check - - Returns: - bool: True if document content exists, False otherwise - """ - try: - count = await self.collection.count_documents({"file_hash": file_hash}) - return count > 0 - except PyMongoError: - return False - - async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]: - """ - Update document content information. - - Args: - document_id (str): Document content ID to update - update_data (dict): Updated document content data - - Returns: - DocumentContent or None: Updated document content if found, None otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return None - - # Remove None values and _id from update data - clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"} - - if not clean_update_data: - return await self.find_document_content_by_id(document_id) - - result = await self.collection.find_one_and_update( - {"_id": ObjectId(document_id)}, - {"$set": clean_update_data}, - return_document=True - ) - - if result: - return DocumentContent(**result) - return None - - except PyMongoError: - return None - - async def delete_document_content(self, document_id: str) -> bool: - """ - Delete document content from database. 
- - Args: - document_id (str): Document content ID to delete - - Returns: - bool: True if document content was deleted, False otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return False - - result = await self.collection.delete_one({"_id": ObjectId(document_id)}) - return result.deleted_count > 0 - except PyMongoError: - return False - - async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]: - """ - List document contents with pagination. - - Args: - skip (int): Number of document contents to skip (default: 0) - limit (int): Maximum number of document contents to return (default: 100) - - Returns: - List[DocumentContent]: List of document contents - """ - try: - cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1) - document_docs = await cursor.to_list(length=limit) - return [DocumentContent(**document_doc) for document_doc in document_docs] - except PyMongoError: - return [] - - async def count_document_contents(self) -> int: - """ - Count total number of document contents. 
- - Returns: - int: Total number of document contents in database - """ - try: - return await self.collection.count_documents({}) - except PyMongoError: - return 0 diff --git a/src/file-processor/app/database/repositories/document_repository.py b/src/file-processor/app/database/repositories/document_repository.py index 74552db..450b8d6 100644 --- a/src/file-processor/app/database/repositories/document_repository.py +++ b/src/file-processor/app/database/repositories/document_repository.py @@ -9,6 +9,8 @@ from typing import Optional, List from bson import ObjectId from pymongo.errors import DuplicateKeyError, PyMongoError from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase + +from app.database.connection import get_extra_args from app.models.document import FileDocument from app.utils.document_matching import fuzzy_matching, subsequence_matching @@ -37,7 +39,7 @@ class FileDocumentRepository: def __init__(self, database: AsyncIOMotorDatabase): """Initialize file repository with database connection.""" self.db = database - self.collection: AsyncIOMotorCollection = self.db.files + self.collection: AsyncIOMotorCollection = self.db.documents self._ensure_indexes() async def initialize(self): @@ -47,6 +49,7 @@ class FileDocumentRepository: Should be called after repository instantiation to setup database indexes. """ await self._ensure_indexes() + return self async def _ensure_indexes(self): """ @@ -60,26 +63,27 @@ class FileDocumentRepository: # Index might already exist, ignore error pass - async def create_document(self, file_data: FileDocument) -> FileDocument: + async def create_document(self, file_data: FileDocument, session=None) -> FileDocument: """ Create a new file document in database. 
Args: file_data (FileDocument): File document data to create + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: - FileDocument: Created file document with database ID + FileDocument: Created document with database ID Raises: ValueError: If file creation fails due to validation - DuplicateKeyError: If file with same hash already exists + DuplicateKeyError: If a document with same hash already exists """ try: file_dict = file_data.model_dump(by_alias=True, exclude_unset=True) if "_id" in file_dict and file_dict["_id"] is None: del file_dict["_id"] - result = await self.collection.insert_one(file_dict) + result = await self.collection.insert_one(file_dict, **get_extra_args(session)) file_data.id = result.inserted_id return file_data @@ -204,13 +208,14 @@ class FileDocumentRepository: except PyMongoError: return 0 - async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]: + async def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]: """ Update file document with new data. Args: file_id (str): File document ID to update update_data (dict): Fields to update + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: FileDocument or None: Updated file document if successful, None otherwise @@ -228,7 +233,8 @@ class FileDocumentRepository: result = await self.collection.find_one_and_update( {"_id": ObjectId(file_id)}, {"$set": clean_update_data}, - return_document=True + return_document=True, + **get_extra_args(session) ) if result: @@ -238,12 +244,13 @@ class FileDocumentRepository: except PyMongoError: return None - async def delete_document(self, file_id: str) -> bool: + async def delete_document(self, file_id: str, session=None) -> bool: """ Delete file document from database. 
Args: file_id (str): File document ID to delete + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: bool: True if file was deleted, False otherwise @@ -252,7 +259,7 @@ class FileDocumentRepository: if not ObjectId.is_valid(file_id): return False - result = await self.collection.delete_one({"_id": ObjectId(file_id)}) + result = await self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session)) return result.deleted_count > 0 except PyMongoError: diff --git a/src/file-processor/app/database/repositories/user_repository.py b/src/file-processor/app/database/repositories/user_repository.py index c3b29e3..b02ca21 100644 --- a/src/file-processor/app/database/repositories/user_repository.py +++ b/src/file-processor/app/database/repositories/user_repository.py @@ -32,7 +32,6 @@ class UserRepository: """ self.db = database self.collection: AsyncIOMotorCollection = database.users - self._ensure_indexes() async def initialize(self): """ @@ -41,6 +40,7 @@ class UserRepository: Should be called after repository instantiation to setup database indexes. """ await self._ensure_indexes() + return self async def _ensure_indexes(self): """ diff --git a/src/file-processor/app/main.py b/src/file-processor/app/main.py index b2247e4..7c99fc7 100644 --- a/src/file-processor/app/main.py +++ b/src/file-processor/app/main.py @@ -7,10 +7,11 @@ This service provides API endpoints for health checks and task dispatching. 
import logging import os from contextlib import asynccontextmanager -from fastapi import FastAPI, HTTPException, Depends -from pydantic import BaseModel + import redis from celery import Celery +from fastapi import FastAPI, HTTPException, Depends +from pydantic import BaseModel from app.database.connection import test_database_connection, get_database from app.database.repositories.user_repository import UserRepository @@ -39,12 +40,11 @@ async def lifespan(app: FastAPI): database = get_database() # Initialize repositories and services - user_repository = UserRepository(database) - user_service = UserService(user_repository) + user_service = await UserService(database).initialize() init_service = InitializationService(user_service) # Run initialization tasks - initialization_result = init_service.initialize_application() + initialization_result = await init_service.initialize_application() if initialization_result["initialization_success"]: logger.info("Application startup completed successfully") @@ -56,6 +56,7 @@ async def lifespan(app: FastAPI): logger.error(f" - {error}") except Exception as e: + raise e logger.error(f"Critical error during application startup: {str(e)}") # You might want to decide if the app should continue or exit here # For now, we log the error but continue @@ -119,6 +120,7 @@ async def create_user( ): return user_service.create_user(user_data) + @app.get("/health") async def health_check(): """ diff --git a/src/file-processor/app/models/document.py b/src/file-processor/app/models/document.py index 1c22ef2..19d9bfe 100644 --- a/src/file-processor/app/models/document.py +++ b/src/file-processor/app/models/document.py @@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum): HYBRID = "hybrid" -class ProcessingStatus(str, Enum): - """Status values for processing jobs.""" - - PENDING = "pending" - PROCESSING = "processing" - COMPLETED = "completed" - FAILED = "failed" - - class FileDocument(BaseModel): """ Model for file documents stored in the 
'files' collection. @@ -58,6 +49,9 @@ class FileDocument(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") + encoding: str = Field(default="utf-8", description="Character encoding for text files") + file_size: int = Field(..., ge=0, description="File size in bytes") + mime_type: str = Field(..., description="MIME type detected") @field_validator('filepath') @classmethod @@ -74,69 +68,3 @@ class FileDocument(BaseModel): if not v.strip(): raise ValueError("Filename cannot be empty") return v.strip() - - class Config: - """Pydantic configuration.""" - populate_by_name = True - arbitrary_types_allowed = True - json_encoders = {ObjectId: str} - - -class DocumentContent(BaseModel): - """Model for document content.""" - - id: Optional[PyObjectId] = Field(default=None, alias="_id") - file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") - content: str = Field(..., description="File content") - encoding: str = Field(default="utf-8", description="Character encoding for text files") - file_size: int = Field(..., ge=0, description="File size in bytes") - mime_type: str = Field(..., description="MIME type detected") - - -class ProcessingJob(BaseModel): - """ - Model for processing jobs stored in the 'processing_jobs' collection. - - Tracks the lifecycle and status of document processing tasks. 
- """ - - id: Optional[PyObjectId] = Field(default=None, alias="_id") - file_id: PyObjectId = Field(..., description="Reference to file document") - status: ProcessingStatus = Field( - default=ProcessingStatus.PENDING, - description="Current processing status" - ) - task_id: Optional[str] = Field( - default=None, - description="Celery task UUID" - ) - created_at: Optional[datetime] = Field( - default=None, - description="Timestamp when job was created" - ) - started_at: Optional[datetime] = Field( - default=None, - description="Timestamp when processing started" - ) - completed_at: Optional[datetime] = Field( - default=None, - description="Timestamp when processing completed" - ) - error_message: Optional[str] = Field( - default=None, - description="Error message if processing failed" - ) - - @field_validator('error_message') - @classmethod - def validate_error_message(cls, v: Optional[str]) -> Optional[str]: - """Clean up error message.""" - if v is not None: - return v.strip() if v.strip() else None - return v - - class Config: - """Pydantic configuration.""" - populate_by_name = True - arbitrary_types_allowed = True - json_encoders = {ObjectId: str} diff --git a/src/file-processor/app/models/job.py b/src/file-processor/app/models/job.py index e69de29..1261dd4 100644 --- a/src/file-processor/app/models/job.py +++ b/src/file-processor/app/models/job.py @@ -0,0 +1,42 @@ +from datetime import datetime +from enum import Enum +from typing import Optional + +from bson import ObjectId +from pydantic import BaseModel, Field, field_validator + +from app.models.types import PyObjectId + + +class ProcessingStatus(str, Enum): + """Status values for processing jobs.""" + + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +class ProcessingJob(BaseModel): + """ + Model for processing jobs stored in the 'processing_jobs' collection. + + Tracks the lifecycle and status of document processing tasks. 
+ """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + file_id: PyObjectId = Field(..., description="Reference to file document") + status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status") + task_id: Optional[str] = Field(default=None, description="Celery task UUID") + created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created") + started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started") + completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed") + error_message: Optional[str] = Field(default=None, description="Error message if processing failed") + + @field_validator('error_message') + @classmethod + def validate_error_message(cls, v: Optional[str]) -> Optional[str]: + """Clean up error message.""" + if v is not None: + return v.strip() if v.strip() else None + return v \ No newline at end of file diff --git a/src/file-processor/app/services/document_service.py b/src/file-processor/app/services/document_service.py index da58712..ac00fb6 100644 --- a/src/file-processor/app/services/document_service.py +++ b/src/file-processor/app/services/document_service.py @@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions. 
""" import hashlib -import magic +import os from datetime import datetime from pathlib import Path -from typing import List, Optional, Dict, Any, Tuple +from typing import List, Optional, Dict, Any -from motor.motor_asyncio import AsyncIOMotorClientSession +import magic from pymongo.errors import PyMongoError -from app.database.connection import get_database +from app.config.settings import get_objects_folder from app.database.repositories.document_repository import FileDocumentRepository -from app.database.repositories.document_content_repository import DocumentContentRepository from app.models.document import ( FileDocument, - DocumentContent, FileType, - ProcessingStatus ) from app.models.types import PyObjectId @@ -34,13 +31,25 @@ class DocumentService: and their content while ensuring data consistency through transactions. """ - def __init__(self): - """Initialize the document service with repository dependencies.""" - self.db = get_database() - self.file_repository = FileDocumentRepository(self.db) - self.content_repository = DocumentContentRepository(self.db) + def __init__(self, database, objects_folder: str = None): + """ + Initialize the document service with repository dependencies. + + Args: + database: Database instance + objects_folder: folder to store files by their hash + """ + + self.db = database + self.document_repository = FileDocumentRepository(self.db) + self.objects_folder = objects_folder or get_objects_folder() - def _calculate_file_hash(self, file_bytes: bytes) -> str: + async def initialize(self): + await self.document_repository.initialize() + return self + + @staticmethod + def _calculate_file_hash(file_bytes: bytes) -> str: """ Calculate SHA256 hash of file content. @@ -52,7 +61,8 @@ class DocumentService: """ return hashlib.sha256(file_bytes).hexdigest() - def _detect_file_type(self, file_path: str) -> FileType: + @staticmethod + def _detect_file_type(file_path: str) -> FileType: """ Detect file type from file extension. 
@@ -72,7 +82,8 @@ class DocumentService: except ValueError: raise ValueError(f"Unsupported file type: {extension}") - def _detect_mime_type(self, file_bytes: bytes) -> str: + @staticmethod + def _detect_mime_type(file_bytes: bytes) -> str: """ Detect MIME type from file content. @@ -84,6 +95,25 @@ class DocumentService: """ return magic.from_buffer(file_bytes, mime=True) + def _get_document_path(self, file_hash): + """ + + :param file_hash: + :return: + """ + return os.path.join(self.objects_folder, file_hash[:24], file_hash) + + def save_content_if_needed(self, file_hash, content: bytes): + target_path = self._get_document_path(file_hash) + if os.path.exists(target_path): + return + + if not os.path.exists(os.path.dirname(target_path)): + os.makedirs(os.path.dirname(target_path)) + + with open(target_path, "wb") as f: + f.write(content) + async def create_document( self, file_path: str, @@ -115,50 +145,32 @@ class DocumentService: mime_type = self._detect_mime_type(file_bytes) file_size = len(file_bytes) filename = Path(file_path).name - detected_at = datetime.utcnow() + detected_at = datetime.now() - # Start MongoDB transaction - async with await self.db.client.start_session() as session: - async with session.start_transaction(): - try: - # Check if content already exists - existing_content = await self.content_repository.find_document_content_by_file_hash( - file_hash, session=session - ) - - # Create DocumentContent if it doesn't exist - if not existing_content: - content_data = DocumentContent( - file_hash=file_hash, - content="", # Will be populated by processing workers - encoding=encoding, - file_size=file_size, - mime_type=mime_type - ) - await self.content_repository.create_document_content( - content_data, session=session - ) - - # Create FileDocument - file_data = FileDocument( - filename=filename, - filepath=file_path, - file_type=file_type, - extraction_method=None, # Will be set by processing workers - metadata={}, # Empty for now - 
detected_at=detected_at, - file_hash=file_hash - ) - - created_file = await self.file_repository.create_document( - file_data, session=session - ) - - return created_file - - except Exception as e: - # Transaction will automatically rollback - raise PyMongoError(f"Failed to create document: {str(e)}") + try: + self.save_content_if_needed(file_hash, file_bytes) + + # Create FileDocument + file_data = FileDocument( + filename=filename, + filepath=file_path, + file_type=file_type, + extraction_method=None, # Will be set by processing workers + metadata={}, # Empty for now + detected_at=detected_at, + file_hash=file_hash, + encoding=encoding, + file_size=file_size, + mime_type=mime_type + ) + + created_file = await self.document_repository.create_document(file_data) + + return created_file + + except Exception as e: + # Transaction will automatically rollback if supported + raise PyMongoError(f"Failed to create document: {str(e)}") async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]: """ @@ -170,7 +182,7 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_id(document_id) + return await self.document_repository.find_document_by_id(str(document_id)) async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: """ @@ -182,7 +194,7 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_hash(file_hash) + return await self.document_repository.find_document_by_hash(file_hash) async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: """ @@ -194,32 +206,15 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_filepath(filepath) + return await self.document_repository.find_document_by_filepath(filepath) - async def get_document_with_content( - self, - document_id: PyObjectId - 
) -> Optional[Tuple[FileDocument, DocumentContent]]: - """ - Retrieve a document with its associated content. - - Args: - document_id: Document ObjectId - - Returns: - Tuple of (FileDocument, DocumentContent) if found, None otherwise - """ - document = await self.get_document_by_id(document_id) - if not document: + async def get_document_content_by_hash(self, file_hash): + target_path = self._get_document_path(file_hash) + if not os.path.exists(target_path): return None - content = await self.content_repository.find_document_content_by_file_hash( - document.file_hash - ) - if not content: - return None - - return (document, content) + with open(target_path, "rb") as f: + return f.read() async def list_documents( self, @@ -236,7 +231,7 @@ class DocumentService: Returns: List of FileDocument instances """ - return await self.file_repository.list_documents(skip=skip, limit=limit) + return await self.document_repository.list_documents(skip=skip, limit=limit) async def count_documents(self) -> int: """ @@ -245,7 +240,7 @@ class DocumentService: Returns: Total document count """ - return await self.file_repository.count_documents() + return await self.document_repository.count_documents() async def update_document( self, @@ -262,7 +257,12 @@ class DocumentService: Returns: Updated FileDocument if found, None otherwise """ - return await self.file_repository.update_document(document_id, update_data) + if "file_bytes" in update_data: + file_hash = self._calculate_file_hash(update_data["file_bytes"]) + update_data["file_hash"] = file_hash + self.save_content_if_needed(file_hash, update_data["file_bytes"]) + + return await self.document_repository.update_document(document_id, update_data) async def delete_document(self, document_id: PyObjectId) -> bool: """ @@ -281,100 +281,31 @@ class DocumentService: Raises: PyMongoError: If database operation fails """ - # Start MongoDB transaction - async with await self.db.client.start_session() as session: - async with 
session.start_transaction(): + # No MongoDB transaction here: delete the record first, then best-effort orphan cleanup + + try: + # Get document to find its hash + document = await self.document_repository.find_document_by_id(document_id) + if not document: + return False + + # Delete the document + deleted = await self.document_repository.delete_document(document_id) + if not deleted: + return False + + # Check if content is orphaned + remaining_files = await self.document_repository.find_document_by_hash(document.file_hash) + + # If no other files reference this content, delete it + if not remaining_files: try: - # Get document to find its hash - document = await self.file_repository.find_document_by_id( - document_id, session=session - ) - if not document: - return False - - # Delete the document - deleted = await self.file_repository.delete_document( - document_id, session=session - ) - if not deleted: - return False - - # Check if content is orphaned - remaining_files = await self.file_repository.find_document_by_hash( - document.file_hash, session=session - ) - - # If no other files reference this content, delete it - if not remaining_files: - content = await self.content_repository.find_document_content_by_file_hash( - document.file_hash, session=session - ) - if content: - await self.content_repository.delete_document_content( - content.id, session=session - ) - - return True - - except Exception as e: - # Transaction will automatically rollback - raise PyMongoError(f"Failed to delete document: {str(e)}") - - async def content_exists(self, file_hash: str) -> bool: - """ - Check if content with given hash exists. - - Args: - file_hash: SHA256 hash of file content - - Returns: - True if content exists, False otherwise - """ - return await self.content_repository.content_exists(file_hash) - - async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]: - """ - Retrieve content by file hash. 
- - Args: - file_hash: SHA256 hash of file content - - Returns: - DocumentContent if found, None otherwise - """ - return await self.content_repository.find_document_content_by_file_hash(file_hash) - - async def update_document_content( - self, - file_hash: str, - content: str, - encoding: str = "utf-8" - ) -> Optional[DocumentContent]: - """ - Update the extracted content for a document. - - This method is typically called by processing workers to store - the extracted text content. - - Args: - file_hash: SHA256 hash of file content - content: Extracted text content - encoding: Character encoding - - Returns: - Updated DocumentContent if found, None otherwise - """ - existing_content = await self.content_repository.find_document_content_by_file_hash( - file_hash - ) - if not existing_content: - return None + try: + os.remove(self._get_document_path(document.file_hash)) + except Exception: + pass + + return True - update_data = { - "content": content, - "encoding": encoding - } - - return await self.content_repository.update_document_content( - existing_content.id, update_data - ) \ No newline at end of file + except Exception as e: + # No transaction in use; surface the failure to callers as PyMongoError + raise PyMongoError(f"Failed to delete document: {str(e)}") diff --git a/src/file-processor/app/services/init_service.py b/src/file-processor/app/services/init_service.py index fd3464f..ed2cd7d 100644 --- a/src/file-processor/app/services/init_service.py +++ b/src/file-processor/app/services/init_service.py @@ -33,7 +33,7 @@ class InitializationService: self.user_service = user_service - def ensure_admin_user_exists(self) -> Optional[UserInDB]: + async def ensure_admin_user_exists(self) -> Optional[UserInDB]: """ Ensure default admin user exists in the system. 
@@ -49,7 +49,7 @@ class InitializationService: logger.info("Checking if admin user exists...") # Check if any admin user already exists - if self._admin_user_exists(): + if await self._admin_user_exists(): logger.info("Admin user already exists, skipping creation") return None @@ -64,7 +64,7 @@ class InitializationService: role=UserRole.ADMIN ) - created_user = self.user_service.create_user(admin_data) + created_user = await self.user_service.create_user(admin_data) logger.info(f"Default admin user created successfully with ID: {created_user.id}") logger.warning( "Default admin user created with username 'admin' and password 'admin'. " @@ -77,7 +77,7 @@ class InitializationService: logger.error(f"Failed to create default admin user: {str(e)}") raise Exception(f"Admin user creation failed: {str(e)}") - def _admin_user_exists(self) -> bool: + async def _admin_user_exists(self) -> bool: """ Check if any admin user exists in the system. @@ -86,7 +86,7 @@ class InitializationService: """ try: # Get all users and check if any have admin role - users = self.user_service.list_users(limit=1000) # Reasonable limit for admin check + users = await self.user_service.list_users(limit=1000) # Reasonable limit for admin check for user in users: if user.role == UserRole.ADMIN and user.is_active: @@ -99,7 +99,7 @@ class InitializationService: # In case of error, assume admin exists to avoid creating duplicates return True - def initialize_application(self) -> dict: + async def initialize_application(self) -> dict: """ Perform all application initialization tasks. 
@@ -119,7 +119,7 @@ class InitializationService: try: # Ensure admin user exists - created_admin = self.ensure_admin_user_exists() + created_admin = await self.ensure_admin_user_exists() if created_admin: initialization_summary["admin_user_created"] = True diff --git a/src/file-processor/app/services/user_service.py b/src/file-processor/app/services/user_service.py index de9fcef..b34716e 100644 --- a/src/file-processor/app/services/user_service.py +++ b/src/file-processor/app/services/user_service.py @@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling. """ from typing import Optional, List + from pymongo.errors import DuplicateKeyError -from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation -from app.models.auth import UserRole from app.database.repositories.user_repository import UserRepository +from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation from app.services.auth_service import AuthService @@ -22,17 +22,22 @@ class UserService: authentication, and data management with proper validation. """ - def __init__(self, user_repository: UserRepository): + def __init__(self, database): """ Initialize user service with repository dependency. Args: user_repository (UserRepository): Repository for user data operations """ - self.user_repository = user_repository + self.db = database + self.user_repository = UserRepository(self.db) self.auth_service = AuthService() + + async def initialize(self): + await self.user_repository.initialize() + return self - def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB: + async def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB: """ Create a new user with business logic validation. 
@@ -55,11 +60,11 @@ class UserService: raise ValueError(f"User with email '{user_data.email}' already exists") try: - return self.user_repository.create_user(user_data) + return await self.user_repository.create_user(user_data) except DuplicateKeyError: raise ValueError(f"User with username '{user_data.username}' already exists") - def get_user_by_username(self, username: str) -> Optional[UserInDB]: + async def get_user_by_username(self, username: str) -> Optional[UserInDB]: """ Retrieve user by username. @@ -69,9 +74,9 @@ class UserService: Returns: UserInDB or None: User if found, None otherwise """ - return self.user_repository.find_user_by_username(username) + return await self.user_repository.find_user_by_username(username) - def get_user_by_id(self, user_id: str) -> Optional[UserInDB]: + async def get_user_by_id(self, user_id: str) -> Optional[UserInDB]: """ Retrieve user by ID. @@ -81,9 +86,9 @@ class UserService: Returns: UserInDB or None: User if found, None otherwise """ - return self.user_repository.find_user_by_id(user_id) + return await self.user_repository.find_user_by_id(user_id) - def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]: + async def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]: """ Authenticate user with username and password. @@ -106,7 +111,7 @@ class UserService: return user - def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: + async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: """ Update user information. 
@@ -132,9 +137,9 @@ class UserService: if existing_user and str(existing_user.id) != user_id: raise ValueError(f"Email '{user_update.email}' is already taken") - return self.user_repository.update_user(user_id, user_update) + return await self.user_repository.update_user(user_id, user_update) - def delete_user(self, user_id: str) -> bool: + async def delete_user(self, user_id: str) -> bool: """ Delete user from system. @@ -146,7 +151,7 @@ class UserService: """ return self.user_repository.delete_user(user_id) - def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: + async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: """ List users with pagination. @@ -157,18 +162,18 @@ class UserService: Returns: List[UserInDB]: List of users """ - return self.user_repository.list_users(skip=skip, limit=limit) + return await self.user_repository.list_users(skip=skip, limit=limit) - def count_users(self) -> int: + async def count_users(self) -> int: """ Count total number of users. Returns: int: Total number of users in system """ - return self.user_repository.count_users() + return await self.user_repository.count_users() - def user_exists(self, username: str) -> bool: + async def user_exists(self, username: str) -> bool: """ Check if user exists by username. 
@@ -178,4 +183,4 @@ class UserService: Returns: bool: True if user exists, False otherwise """ - return self.user_repository.user_exists(username) + return await self.user_repository.user_exists(username) diff --git a/tests/database/__init__.py b/tests/database/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_user_models.py b/tests/models/test_user_models.py similarity index 100% rename from tests/test_user_models.py rename to tests/models/test_user_models.py diff --git a/tests/repositories/__init__.py b/tests/repositories/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/repositories/test_document_repository.py b/tests/repositories/test_document_repository.py new file mode 100644 index 0000000..45f7a92 --- /dev/null +++ b/tests/repositories/test_document_repository.py @@ -0,0 +1,672 @@ +""" +Test suite for FileDocumentRepository with async/await support. + +This module contains comprehensive tests for all FileDocumentRepository methods +using mongomock-motor for in-memory MongoDB testing. 
+""" + +import pytest +from datetime import datetime +from typing import Dict, Any + +import pytest_asyncio +from bson import ObjectId +from pymongo.errors import DuplicateKeyError, PyMongoError +from mongomock_motor import AsyncMongoMockClient + +from app.database.repositories.document_repository import ( + FileDocumentRepository, + MatchMethodBase, + SubsequenceMatching, + FuzzyMatching +) +from app.models.document import FileDocument, FileType, ExtractionMethod + + +@pytest_asyncio.fixture +async def in_memory_repository(): + """Create an in-memory FileDocumentRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = FileDocumentRepository(db) + await repo.initialize() + return repo + + +@pytest.fixture +def sample_file_document(): + """Sample FileDocument data for testing.""" + return FileDocument( + filename="sample_document.pdf", + filepath="/home/user/documents/sample_document.pdf", + file_type=FileType.PDF, + extraction_method=ExtractionMethod.OCR, + metadata={"pages": 5, "language": "en", "author": "John Doe"}, + detected_at=datetime.now(), + file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + encoding="utf-8", + file_size=1024000, + mime_type="application/pdf" + ) + + +@pytest.fixture +def sample_update_data(): + """Sample update data for testing.""" + return { + "extraction_method": ExtractionMethod.HYBRID, + "metadata": {"pages": 10, "language": "fr", "updated": True}, + "file_size": 2048000 + } + + +@pytest.fixture +def multiple_sample_files(): + """Multiple FileDocument objects for list/search testing.""" + base_time = datetime.now() + return [ + FileDocument( + filename="first_doc.txt", + filepath="/docs/first_doc.txt", + file_type=FileType.TXT, + extraction_method=ExtractionMethod.DIRECT_TEXT, + metadata={"words": 500}, + detected_at=base_time, + file_hash="hash1" + "0" * 58, + encoding="utf-8", + file_size=5000, + mime_type="text/plain" + ), + FileDocument( + 
filename="second_document.pdf", + filepath="/docs/second_document.pdf", + file_type=FileType.PDF, + extraction_method=ExtractionMethod.OCR, + metadata={"pages": 8}, + detected_at=base_time, + file_hash="hash2" + "0" * 58, + encoding="utf-8", + file_size=10000, + mime_type="application/pdf" + ), + FileDocument( + filename="third_file.docx", + filepath="/docs/third_file.docx", + file_type=FileType.DOCX, + extraction_method=ExtractionMethod.HYBRID, + metadata={"paragraphs": 15}, + detected_at=base_time, + file_hash="hash3" + "0" * 58, + encoding="utf-8", + file_size=15000, + mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + ] + + +class TestFileDocumentRepositoryInitialization: + """Tests for repository initialization.""" + + @pytest.mark.asyncio + async def test_i_can_initialize_repository(self): + """Test repository initialization.""" + # Arrange + client = AsyncMongoMockClient() + db = client.test_database + repo = FileDocumentRepository(db) + await repo.initialize() + + # Act & Assert (should not raise any exception) + assert repo.db is not None + assert repo.collection is not None + # TODO : check that the indexes are created + + +class TestFileDocumentRepositoryCreation: + """Tests for file document creation functionality.""" + + @pytest.mark.asyncio + async def test_i_can_create_file_document(self, in_memory_repository, sample_file_document): + """Test successful file document creation.""" + # Act + created_file = await in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_file is not None + assert created_file.filename == sample_file_document.filename + assert created_file.filepath == sample_file_document.filepath + assert created_file.file_type == sample_file_document.file_type + assert created_file.extraction_method == sample_file_document.extraction_method + assert created_file.metadata == sample_file_document.metadata + assert created_file.file_hash == sample_file_document.file_hash 
+ assert created_file.file_size == sample_file_document.file_size + assert created_file.mime_type == sample_file_document.mime_type + assert created_file.id is not None + assert isinstance(created_file.id, ObjectId) + + @pytest.mark.asyncio + async def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document): + """Test creating file document with _id set to None (should be removed).""" + # Arrange + sample_file_document.id = None + + # Act + created_file = await in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_file is not None + assert created_file.id is not None + assert isinstance(created_file.id, ObjectId) + + @pytest.mark.asyncio + async def test_i_cannot_create_duplicate_file_document(self, in_memory_repository, sample_file_document): + """Test that creating file document with duplicate filepath raises DuplicateKeyError.""" + # Arrange + await in_memory_repository.create_document(sample_file_document) + duplicate_file = FileDocument( + filename="different_name.pdf", + filepath=sample_file_document.filepath, # Same filepath + file_type=FileType.PDF, + extraction_method=ExtractionMethod.OCR, + metadata={"different": "metadata"}, + detected_at=datetime.now(), + file_hash="different_hash_123456789012345678901234567890123456789012345678", + encoding="utf-8", + file_size=2000, + mime_type="application/pdf" + ) + + # Act & Assert + with pytest.raises(DuplicateKeyError) as exc_info: + await in_memory_repository.create_document(duplicate_file) + + assert "already exists" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository, + sample_file_document, mocker): + """Test handling of PyMongo errors during file document creation.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error")) + + # Act & Assert + with pytest.raises(ValueError) as 
exc_info: + await in_memory_repository.create_document(sample_file_document) + + assert "Failed to create file document" in str(exc_info.value) + + +class TestFileDocumentRepositoryFinding: + """Tests for file document finding functionality.""" + + @pytest.mark.asyncio + async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document): + """Test finding file document by valid ObjectId.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + + # Act + found_file = await in_memory_repository.find_document_by_id(str(created_file.id)) + + # Assert + assert found_file is not None + assert found_file.id == created_file.id + assert found_file.filename == created_file.filename + assert found_file.filepath == created_file.filepath + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository): + """Test that invalid ObjectId returns None.""" + # Act + found_file = await in_memory_repository.find_document_by_id("invalid_id") + + # Assert + assert found_file is None + + @pytest.mark.asyncio + async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository): + """Test that nonexistent but valid ObjectId returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + found_file = await in_memory_repository.find_document_by_id(nonexistent_id) + + # Assert + assert found_file is None + + @pytest.mark.asyncio + async def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document): + """Test finding file document by file hash.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + + # Act + found_file = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash) + + # Assert + assert found_file is not None + assert found_file.file_hash == created_file.file_hash + assert found_file.id == created_file.id + + @pytest.mark.asyncio + async def 
test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository): + """Test that nonexistent file hash returns None.""" + # Act + found_file = await in_memory_repository.find_document_by_hash("nonexistent_hash") + + # Assert + assert found_file is None + + @pytest.mark.asyncio + async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document): + """Test finding file document by filepath.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + + # Act + found_file = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath) + + # Assert + assert found_file is not None + assert found_file.filepath == created_file.filepath + assert found_file.id == created_file.id + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository): + """Test that nonexistent filepath returns None.""" + # Act + found_file = await in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf") + + # Assert + assert found_file is None + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during file document finding.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error")) + + # Act + found_file = await in_memory_repository.find_document_by_hash("test_hash") + + # Assert + assert found_file is None + + +class TestFileDocumentRepositoryNameMatching: + """Tests for file document name matching functionality.""" + + @pytest.mark.asyncio + async def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files): + """Test finding file documents by filename using fuzzy matching.""" + # Arrange + for file_doc in multiple_sample_files: + await in_memory_repository.create_document(file_doc) + + # Act + 
fuzzy_method = FuzzyMatching(threshold=0.5) + found_files = await in_memory_repository.find_document_by_name("document", fuzzy_method) + + # Assert + assert len(found_files) >= 1 + assert all(isinstance(file_doc, FileDocument) for file_doc in found_files) + # Should find files with "document" in the name + found_filenames = [f.filename for f in found_files] + assert any("document" in fname.lower() for fname in found_filenames) + + @pytest.mark.asyncio + async def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository, + multiple_sample_files): + """Test finding file documents by filename using subsequence matching.""" + # Arrange + for file_doc in multiple_sample_files: + await in_memory_repository.create_document(file_doc) + + # Act + subsequence_method = SubsequenceMatching() + found_files = await in_memory_repository.find_document_by_name("doc", subsequence_method) + + # Assert + assert len(found_files) >= 1 + assert all(isinstance(file_doc, FileDocument) for file_doc in found_files) + + @pytest.mark.asyncio + async def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files): + """Test finding file documents by filename with default matching method.""" + # Arrange + for file_doc in multiple_sample_files: + await in_memory_repository.create_document(file_doc) + + # Act + found_files = await in_memory_repository.find_document_by_name("first") + + # Assert + assert len(found_files) >= 0 + assert all(isinstance(file_doc, FileDocument) for file_doc in found_files) + + @pytest.mark.asyncio + async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during document name matching.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) + + # Act + found_files = await in_memory_repository.find_document_by_name("test") + + # Assert + assert 
found_files == [] + + +class TestFileDocumentRepositoryListing: + """Tests for file document listing functionality.""" + + @pytest.mark.asyncio + async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files): + """Test listing file documents with default pagination.""" + # Arrange + for file_doc in multiple_sample_files: + await in_memory_repository.create_document(file_doc) + + # Act + files = await in_memory_repository.list_documents() + + # Assert + assert len(files) == len(multiple_sample_files) + assert all(isinstance(file_doc, FileDocument) for file_doc in files) + + @pytest.mark.asyncio + async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files): + """Test listing file documents with custom pagination.""" + # Arrange + for file_doc in multiple_sample_files: + await in_memory_repository.create_document(file_doc) + + # Act + files_page1 = await in_memory_repository.list_documents(skip=0, limit=2) + files_page2 = await in_memory_repository.list_documents(skip=2, limit=2) + + # Assert + assert len(files_page1) == 2 + assert len(files_page2) == 1 # Only 3 total files + + # Ensure no overlap between pages + page1_ids = [file_doc.id for file_doc in files_page1] + page2_ids = [file_doc.id for file_doc in files_page2] + assert len(set(page1_ids).intersection(set(page2_ids))) == 0 + + @pytest.mark.asyncio + async def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document): + """Test that file documents are sorted by detected_at in descending order.""" + # Arrange + file1 = sample_file_document.model_copy() + file1.filepath = "/docs/file1.pdf" + file1.filename = "file1.pdf" + file1.file_hash = "hash1" + "0" * 58 + file1.detected_at = datetime(2024, 1, 1, 10, 0, 0) + + file2 = sample_file_document.model_copy() + file2.filepath = "/docs/file2.pdf" + file2.filename = "file2.pdf" + file2.file_hash = "hash2" + "0" * 58 + 
file2.detected_at = datetime(2024, 1, 2, 10, 0, 0) # Later date + + created_file1 = await in_memory_repository.create_document(file1) + created_file2 = await in_memory_repository.create_document(file2) + + # Act + files = await in_memory_repository.list_documents() + + # Assert + assert len(files) == 2 + # Most recent (latest detected_at) should be first + assert files[0].id == created_file2.id + assert files[1].id == created_file1.id + + @pytest.mark.asyncio + async def test_i_can_list_empty_documents(self, in_memory_repository): + """Test listing file documents from empty collection.""" + # Act + files = await in_memory_repository.list_documents() + + # Assert + assert files == [] + + @pytest.mark.asyncio + async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during file document listing.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) + + # Act + files = await in_memory_repository.list_documents() + + # Assert + assert files == [] + + +class TestFileDocumentRepositoryUpdate: + """Tests for file document update functionality.""" + + @pytest.mark.asyncio + async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document, + sample_update_data): + """Test successful file document update.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + + # Act + updated_file = await in_memory_repository.update_document(str(created_file.id), sample_update_data) + + # Assert + assert updated_file is not None + assert updated_file.extraction_method == sample_update_data["extraction_method"] + assert updated_file.metadata == sample_update_data["metadata"] + assert updated_file.file_size == sample_update_data["file_size"] + assert updated_file.id == created_file.id + assert updated_file.filename == created_file.filename # Unchanged fields remain + assert 
updated_file.filepath == created_file.filepath + + @pytest.mark.asyncio + async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document): + """Test updating file document with partial data.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + partial_update = {"file_size": 999999} + + # Act + updated_file = await in_memory_repository.update_document(str(created_file.id), partial_update) + + # Assert + assert updated_file is not None + assert updated_file.file_size == 999999 + assert updated_file.filename == created_file.filename # Should remain unchanged + assert updated_file.metadata == created_file.metadata # Should remain unchanged + + @pytest.mark.asyncio + async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document): + """Test that None values are filtered out from update data.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + update_with_none = {"file_size": 777777, "metadata": None} + + # Act + updated_file = await in_memory_repository.update_document(str(created_file.id), update_with_none) + + # Assert + assert updated_file is not None + assert updated_file.file_size == 777777 + assert updated_file.metadata == created_file.metadata # Should remain unchanged (None filtered out) + + @pytest.mark.asyncio + async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document): + """Test updating file document with empty data returns current document.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + empty_update = {} + + # Act + result = await in_memory_repository.update_document(str(created_file.id), empty_update) + + # Assert + assert result is not None + assert result.filename == created_file.filename + assert result.filepath == created_file.filepath + assert result.metadata == 
created_file.metadata + + @pytest.mark.asyncio + async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data): + """Test that updating with invalid ID returns None.""" + # Act + result = await in_memory_repository.update_document("invalid_id", sample_update_data) + + # Assert + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data): + """Test that updating nonexistent file document returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await in_memory_repository.update_document(nonexistent_id, sample_update_data) + + # Assert + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document, + sample_update_data, mocker): + """Test handling of PyMongo errors during file document update.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', + side_effect=PyMongoError("Database error")) + + # Act + result = await in_memory_repository.update_document(str(created_file.id), sample_update_data) + + # Assert + assert result is None + + +class TestFileDocumentRepositoryDeletion: + """Tests for file document deletion functionality.""" + + @pytest.mark.asyncio + async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document): + """Test successful file document deletion.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + + # Act + deletion_result = await in_memory_repository.delete_document(str(created_file.id)) + + # Assert + assert deletion_result is True + + # Verify document is actually deleted + found_file = await in_memory_repository.find_document_by_id(str(created_file.id)) + assert found_file is None + + 
@pytest.mark.asyncio + async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository): + """Test that deleting with invalid ID returns False.""" + # Act + result = await in_memory_repository.delete_document("invalid_id") + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository): + """Test that deleting nonexistent file document returns False.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await in_memory_repository.delete_document(nonexistent_id) + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): + """Test handling of PyMongo errors during file document deletion.""" + # Arrange + created_file = await in_memory_repository.create_document(sample_file_document) + mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error")) + + # Act + result = await in_memory_repository.delete_document(str(created_file.id)) + + # Assert + assert result is False + + +class TestFileDocumentRepositoryUtilities: + """Tests for utility methods.""" + + @pytest.mark.asyncio + async def test_i_can_count_documents(self, in_memory_repository, sample_file_document): + """Test counting file documents.""" + # Arrange + initial_count = await in_memory_repository.count_documents() + await in_memory_repository.create_document(sample_file_document) + + # Act + final_count = await in_memory_repository.count_documents() + + # Assert + assert final_count == initial_count + 1 + + @pytest.mark.asyncio + async def test_i_can_count_zero_documents(self, in_memory_repository): + """Test counting file documents in empty collection.""" + # Act + count = await in_memory_repository.count_documents() + + # Assert + assert count == 0 + + @pytest.mark.asyncio + async def 
test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during file document counting.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'count_documents', side_effect=PyMongoError("Database error")) + + # Act + count = await in_memory_repository.count_documents() + + # Assert + assert count == 0 + + +class TestMatchingMethods: + """Tests for matching method classes.""" + + def test_i_can_create_fuzzy_matching_with_default_threshold(self): + """Test creating FuzzyMatching with default threshold.""" + # Act + fuzzy = FuzzyMatching() + + # Assert + assert fuzzy.threshold == 0.6 + + def test_i_can_create_fuzzy_matching_with_custom_threshold(self): + """Test creating FuzzyMatching with custom threshold.""" + # Act + fuzzy = FuzzyMatching(threshold=0.8) + + # Assert + assert fuzzy.threshold == 0.8 + + def test_i_can_create_subsequence_matching(self): + """Test creating SubsequenceMatching.""" + # Act + subsequence = SubsequenceMatching() + + # Assert + assert isinstance(subsequence, MatchMethodBase) + assert isinstance(subsequence, SubsequenceMatching) diff --git a/tests/test_user_repository.py b/tests/repositories/test_user_repository.py similarity index 100% rename from tests/test_user_repository.py rename to tests/repositories/test_user_repository.py diff --git a/tests/services/__init__.py b/tests/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/services/test_document_service.py b/tests/services/test_document_service.py new file mode 100644 index 0000000..2938555 --- /dev/null +++ b/tests/services/test_document_service.py @@ -0,0 +1,587 @@ +""" +Unit tests for DocumentService using in-memory MongoDB. + +Tests the orchestration logic with real MongoDB operations +using mongomock for better integration testing. 
+""" +import os +from datetime import datetime +from unittest.mock import patch + +import pytest +import pytest_asyncio +from bson import ObjectId +from mongomock_motor import AsyncMongoMockClient + +from app.models.document import FileType +from app.services.document_service import DocumentService + + +@pytest.fixture(autouse=True) +def cleanup_test_folder(): + """Clean up test folder.""" + import shutil + shutil.rmtree("test_folder", ignore_errors=True) + + +@pytest_asyncio.fixture +async def in_memory_database(): + """Create an in-memory database for testing.""" + client = AsyncMongoMockClient() + return client.test_database + + +@pytest_asyncio.fixture +async def document_service(in_memory_database): + """Create DocumentService with in-memory repositories.""" + service = DocumentService(in_memory_database, objects_folder="test_folder") + return service + + +@pytest.fixture +def sample_file_bytes(): + """Sample file content as bytes.""" + return b"This is a test PDF content" + + +@pytest.fixture +def sample_text_bytes(): + """Sample text file content as bytes.""" + return b"This is a test text file content" + + +@pytest.fixture +def sample_file_hash(): + """Expected SHA256 hash for sample file bytes.""" + import hashlib + return hashlib.sha256(b"This is a test PDF content").hexdigest() + + +def validate_file_saved(document_service, file_hash, file_bytes): + # Verify file is saved to disk + target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) + assert os.path.exists(target_file_path) + + with open(target_file_path, "rb") as f: + content = f.read() + assert content == file_bytes + + +class TestCreateDocument: + """Tests for create_document method.""" + + @patch('app.services.document_service.magic.from_buffer') + @patch('app.services.document_service.datetime') + @pytest.mark.asyncio + async def test_i_can_create_document_with_new_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + 
"""Test creating document when content doesn't exist yet.""" + # Setup mocks + fixed_time = datetime(2025, 1, 1, 10, 30, 0) + mock_datetime.now.return_value = fixed_time + mock_magic.return_value = "application/pdf" + + # Execute + result = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify document creation + assert result is not None + assert result.filename == "test.pdf" + assert result.filepath == "/test/test.pdf" + assert result.file_type == FileType.PDF + assert result.detected_at == fixed_time + assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) + + # Verify document created in database + doc_in_db = await document_service.document_repository.find_document_by_id(result.id) + assert doc_in_db is not None + assert doc_in_db.id == result.id + assert doc_in_db.filename == result.filename + assert doc_in_db.filepath == result.filepath + assert doc_in_db.file_type == result.file_type + assert doc_in_db.detected_at == fixed_time + assert doc_in_db.file_hash == result.file_hash + + # Verify file is saved to disk + validate_file_saved(document_service, result.file_hash, sample_file_bytes) + + @patch('app.services.document_service.magic.from_buffer') + @patch('app.services.document_service.datetime') + @pytest.mark.asyncio + async def test_i_can_create_document_with_existing_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + """Test creating document when content already exists (deduplication).""" + # Setup mocks + fixed_time = datetime(2025, 1, 1, 10, 30, 0) + mock_datetime.now.return_value = fixed_time + mock_magic.return_value = "application/pdf" + + # Create first document + first_doc = await document_service.create_document( + "/test/first.pdf", + sample_file_bytes, + "utf-8" + ) + + # Create second document with same content + second_doc = await document_service.create_document( + "/test/second.pdf", + sample_file_bytes, + "utf-8" 
+ ) + + # Verify both documents exist but share same hash + assert first_doc.file_hash == second_doc.file_hash + assert first_doc.filename != second_doc.filename + assert first_doc.filepath != second_doc.filepath + + @pytest.mark.asyncio + async def test_i_cannot_create_document_with_unsupported_file_type( + self, + document_service, + sample_file_bytes + ): + """Test that unsupported file types raise ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + await document_service.create_document( + "/test/test.xyz", # Unsupported extension + sample_file_bytes, + "utf-8" + ) + + @pytest.mark.asyncio + async def test_i_cannot_create_document_with_empty_file_path( + self, + document_service, + sample_file_bytes + ): + """Test that empty file path raises ValueError.""" + with pytest.raises(ValueError): + await document_service.create_document( + "", # Empty path + sample_file_bytes, + "utf-8" + ) + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_create_document_with_empty_bytes( + self, + mock_magic, + document_service + ): + """Test behavior with empty file bytes.""" + # Setup + mock_magic.return_value = "text/plain" + + # Execute with empty bytes + result = await document_service.create_document( + "/test/empty.txt", + b"", # Empty bytes + "utf-8" + ) + + # Verify file is saved to disk + validate_file_saved(document_service, result.file_hash, b"") + + +class TestGetMethods: + """Tests for document retrieval methods.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_id( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by ID.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = await 
document_service.get_document_by_id(created_doc.id) + + # Verify + assert result is not None + assert result.id == created_doc.id + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_hash( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file hash.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = await document_service.get_document_by_hash(created_doc.file_hash) + + # Verify + assert result is not None + assert result.file_hash == created_doc.file_hash + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_filepath( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file path.""" + # Setup + mock_magic.return_value = "application/pdf" + test_path = "/test/unique_test.pdf" + + # Create a document first + created_doc = await document_service.create_document( + test_path, + sample_file_bytes, + "utf-8" + ) + + # Execute + result = await document_service.get_document_by_filepath(test_path) + + # Verify + assert result is not None + assert result.filepath == test_path + assert result.id == created_doc.id + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document with associated content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + 
result = await document_service.get_document_content_by_hash(created_doc.file_hash) + + # Verify + assert result == sample_file_bytes + + @pytest.mark.asyncio + async def test_i_cannot_get_nonexistent_document_by_id( + self, + document_service + ): + """Test that nonexistent document returns None.""" + # Execute with random ObjectId + result = await document_service.get_document_by_id(ObjectId()) + + # Verify + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_get_nonexistent_document_by_hash( + self, + document_service + ): + """Test that nonexistent document hash returns None.""" + # Execute + result = await document_service.get_document_by_hash("nonexistent_hash") + + # Verify + assert result is None + + +class TestPaginationAndCounting: + """Tests for document listing and counting.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_list_documents_with_pagination( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document listing with pagination parameters.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create multiple documents + for i in range(5): + await document_service.create_document( + f"/test/test{i}.pdf", + sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique + "utf-8" + ) + + # Execute with pagination + result = await document_service.list_documents(skip=1, limit=2) + + # Verify + assert len(result) == 2 + + # Test counting + total_count = await document_service.count_documents() + assert total_count == 5 + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_count_documents( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document counting.""" + # Setup + mock_magic.return_value = "text/plain" + + # Initially should be 0 + initial_count = await document_service.count_documents() + assert initial_count == 0 + + # Create some 
documents + for i in range(3): + await document_service.create_document( + f"/test/test{i}.txt", + sample_file_bytes + bytes(str(i), 'utf-8'), + "utf-8" + ) + + # Execute + final_count = await document_service.count_documents() + + # Verify + assert final_count == 3 + + +class TestUpdateAndDelete: + """Tests for document update and deletion operations.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_update_document_metadata( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test updating document metadata.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute update + update_data = {"metadata": {"page_count": 5}} + result = await document_service.update_document(created_doc.id, update_data) + + # Verify + assert result is not None + assert result.metadata.get("page_count") == 5 + assert result.filename == created_doc.filename + assert result.filepath == created_doc.filepath + assert result.file_hash == created_doc.file_hash + assert result.file_type == created_doc.file_type + assert result.metadata == update_data['metadata'] + + @pytest.mark.asyncio + async def test_i_can_update_document_content( + self, + document_service, + sample_file_bytes + ): + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute update + update_data = {"file_bytes": b"this is an updated file content"} + result = await document_service.update_document(created_doc.id, update_data) + + assert result.filename == created_doc.filename + assert result.filepath == created_doc.filepath + assert result.file_hash != created_doc.file_hash + assert result.file_type == created_doc.file_type + assert result.metadata == created_doc.metadata + + # Verify file is saved to 
disk + validate_file_saved(document_service, result.file_hash, b"this is an updated file content") + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_delete_document_and_orphaned_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document with orphaned content cleanup.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify content exists + validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes) + + # Execute deletion + result = await document_service.delete_document(created_doc.id) + + # Verify document and content are deleted + assert result is True + + deleted_doc = await document_service.get_document_by_id(created_doc.id) + assert deleted_doc is None + + # validate content is deleted + file_hash = created_doc.file_hash[:24] + target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) + assert not os.path.exists(target_file_path) + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_delete_document_without_affecting_shared_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document without removing shared content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create two documents with same content + doc1 = await document_service.create_document( + "/test/test1.pdf", + sample_file_bytes, + "utf-8" + ) + + doc2 = await document_service.create_document( + "/test/test2.pdf", + sample_file_bytes, + "utf-8" + ) + + # They should share the same hash + assert doc1.file_hash == doc2.file_hash + + # Delete first document + result = await document_service.delete_document(doc1.id) + assert result is True + + # Verify first document is deleted but content 
still exists + deleted_doc = await document_service.get_document_by_id(doc1.id) + assert deleted_doc is None + + remaining_doc = await document_service.get_document_by_id(doc2.id) + assert remaining_doc is not None + + validate_file_saved(document_service, doc2.file_hash, sample_file_bytes) + + +class TestHashCalculation: + """Tests for file hash calculation utility.""" + + def test_i_can_calculate_consistent_file_hash(self, document_service): + """Test that file hash calculation is consistent.""" + test_bytes = b"Test content for hashing" + + # Calculate hash multiple times + hash1 = document_service._calculate_file_hash(test_bytes) + hash2 = document_service._calculate_file_hash(test_bytes) + + # Should be identical + assert hash1 == hash2 + assert len(hash1) == 64 # SHA256 produces 64-character hex string + + def test_i_get_different_hashes_for_different_content(self, document_service): + """Test that different content produces different hashes.""" + content1 = b"First content" + content2 = b"Second content" + + hash1 = document_service._calculate_file_hash(content1) + hash2 = document_service._calculate_file_hash(content2) + + assert hash1 != hash2 + + +class TestFileTypeDetection: + """Tests for file type detection.""" + + def test_i_can_detect_pdf_file_type(self, document_service): + """Test PDF file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.pdf") + assert file_type == FileType.PDF + + def test_i_can_detect_txt_file_type(self, document_service): + """Test text file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.txt") + assert file_type == FileType.TXT + + def test_i_can_detect_docx_file_type(self, document_service): + """Test DOCX file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.docx") + assert file_type == FileType.DOCX + + def test_i_cannot_detect_unsupported_file_type(self, document_service): + """Test unsupported file type raises 
ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + document_service._detect_file_type("/path/to/document.xyz") diff --git a/tests/test_connection.py b/tests/test_connection.py deleted file mode 100644 index 1fb9968..0000000 --- a/tests/test_connection.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -Unit tests for MongoDB database connection module. - -Tests the database connection functionality with mocking -to avoid requiring actual MongoDB instance during tests. -""" - -import pytest -from unittest.mock import Mock, patch, MagicMock -from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError - -from app.database.connection import ( - create_mongodb_client, - get_database, - close_database_connection, - get_mongodb_client, - test_database_connection -) - - -def test_i_can_get_database_connection(): - """Test successful database connection creation.""" - mock_client = Mock() - mock_database = Mock() - - # Configure the mock to support dictionary-like access - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - result = get_database() - - assert result == mock_database - mock_client.admin.command.assert_called_with('ping') - # Verify that __getitem__ was called with the database name - mock_client.__getitem__.assert_called_with("testdb") - - -def test_i_cannot_connect_to_invalid_mongodb_url(): - """Test fail-fast behavior with invalid MongoDB URL.""" - mock_client = Mock() - mock_client.admin.command.side_effect = ConnectionFailure("Connection failed") - - with patch('app.database.connection.MongoClient', 
return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_cannot_connect_with_server_selection_timeout(): - """Test fail-fast behavior with server selection timeout.""" - mock_client = Mock() - mock_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout") - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_cannot_connect_with_unexpected_error(): - """Test fail-fast behavior with unexpected connection error.""" - with patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error")): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_can_get_database_singleton(): - """Test that get_database returns the same instance (singleton pattern).""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # First call - db1 = get_database() - # Second call - db2 = get_database() - - assert db1 is db2 - # MongoClient should be called only once - assert 
mock_client.admin.command.call_count == 1 - - -def test_i_can_close_database_connection(): - """Test closing database connection.""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Create connection - get_database() - - # Close connection - close_database_connection() - - mock_client.close.assert_called_once() - assert app.database.connection._client is None - assert app.database.connection._database is None - - -def test_i_can_get_mongodb_client(): - """Test getting raw MongoDB client instance.""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Create connection first - get_database() - - # Get client - result = get_mongodb_client() - - assert result == mock_client - - -def test_i_can_get_none_mongodb_client_when_not_connected(): - """Test getting MongoDB client returns None when not connected.""" - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - result = get_mongodb_client() - assert result is None - - -def 
test_i_can_test_database_connection_success(): - """Test database connection health check - success case.""" - mock_database = Mock() - mock_database.command.return_value = True - - with patch('app.database.connection.get_database', return_value=mock_database): - result = test_database_connection() - - assert result is True - mock_database.command.assert_called_with('ping') - - -def test_i_can_close_connection_when_no_client(): - """Test closing connection when no client exists (should not raise error).""" - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Should not raise any exception - close_database_connection() - - assert app.database.connection._client is None - assert app.database.connection._database is None \ No newline at end of file diff --git a/tests/test_document_content_repository.py b/tests/test_document_content_repository.py deleted file mode 100644 index 1033e59..0000000 --- a/tests/test_document_content_repository.py +++ /dev/null @@ -1,311 +0,0 @@ -""" -Test suite for DocumentContentRepository with async/await support. - -This module contains comprehensive tests for all DocumentContentRepository methods -using mongomock-motor for in-memory MongoDB testing. 
-""" - -import pytest -import hashlib -from datetime import datetime - -import pytest_asyncio -from bson import ObjectId -from pymongo.errors import DuplicateKeyError -from mongomock_motor import AsyncMongoMockClient - -from app.database.repositories.document_content_repository import DocumentContentRepository -from app.models.document import DocumentContent - - -@pytest_asyncio.fixture -async def in_memory_repository(): - """Create an in-memory DocumentContentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = DocumentContentRepository(db) - await repo.initialize() - return repo - - -@pytest.fixture -def sample_document_content(): - """Sample DocumentContent data for testing.""" - content = "This is sample document content for testing purposes." - file_hash = hashlib.sha256(content.encode()).hexdigest() - - return DocumentContent( - file_hash=file_hash, - content=content, - encoding="utf-8", - file_size=len(content.encode()), - mime_type="text/plain" - ) - - -@pytest.fixture -def another_document_content(): - """Another sample DocumentContent data for testing.""" - content = "This is another sample document with different content." 
- file_hash = hashlib.sha256(content.encode()).hexdigest() - - return DocumentContent( - file_hash=file_hash, - content=content, - encoding="utf-8", - file_size=len(content.encode()), - mime_type="text/plain" - ) - - -class TestDocumentContentRepositoryCreation: - """Tests for document content creation functionality.""" - - @pytest.mark.asyncio - async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content creation.""" - # Act - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Assert - assert created_content is not None - assert created_content.file_hash == sample_document_content.file_hash - assert created_content.content == sample_document_content.content - assert created_content.encoding == sample_document_content.encoding - assert created_content.file_size == sample_document_content.file_size - assert created_content.mime_type == sample_document_content.mime_type - assert created_content.id is not None - - @pytest.mark.asyncio - async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository, - sample_document_content): - """Test that creating document content with duplicate file_hash raises DuplicateKeyError.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - - # Act & Assert - with pytest.raises(DuplicateKeyError) as exc_info: - await in_memory_repository.create_document_content(sample_document_content) - - assert "already exists" in str(exc_info.value) - - -class TestDocumentContentRepositoryFinding: - """Tests for document content finding functionality.""" - - @pytest.mark.asyncio - async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content): - """Test finding document content by valid ID.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - 
found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) - - # Assert - assert found_content is not None - assert found_content.id == created_content.id - assert found_content.file_hash == created_content.file_hash - assert found_content.content == created_content.content - - @pytest.mark.asyncio - async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository): - """Test that invalid ObjectId returns None.""" - # Act - found_content = await in_memory_repository.find_document_content_by_id("invalid_id") - - # Assert - assert found_content is None - - @pytest.mark.asyncio - async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository): - """Test that nonexistent but valid ObjectId returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - found_content = await in_memory_repository.find_document_content_by_id(nonexistent_id) - - # Assert - assert found_content is None - - @pytest.mark.asyncio - async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content): - """Test finding document content by file hash.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - found_content = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash) - - # Assert - assert found_content is not None - assert found_content.file_hash == created_content.file_hash - assert found_content.id == created_content.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository): - """Test that nonexistent file hash returns None.""" - # Act - found_content = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash") - - # Assert - assert found_content is None - - -class TestDocumentContentRepositoryUpdate: - """Tests for document content update 
functionality.""" - - @pytest.mark.asyncio - async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content update.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - update_data = { - "content": "Updated content for testing", - "encoding": "utf-16", - "mime_type": "text/html" - } - - # Act - updated_content = await in_memory_repository.update_document_content(str(created_content.id), update_data) - - # Assert - assert updated_content is not None - assert updated_content.content == update_data["content"] - assert updated_content.encoding == update_data["encoding"] - assert updated_content.mime_type == update_data["mime_type"] - assert updated_content.id == created_content.id - assert updated_content.file_hash == created_content.file_hash # Should remain unchanged - - @pytest.mark.asyncio - async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository): - """Test that updating with invalid ID returns None.""" - # Act - result = await in_memory_repository.update_document_content("invalid_id", {"content": "test"}) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content): - """Test updating document content with partial data.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - partial_update = {"encoding": "iso-8859-1"} - - # Act - updated_content = await in_memory_repository.update_document_content(str(created_content.id), partial_update) - - # Assert - assert updated_content is not None - assert updated_content.encoding == "iso-8859-1" - assert updated_content.content == created_content.content # Should remain unchanged - assert updated_content.mime_type == created_content.mime_type # Should remain unchanged - - 
@pytest.mark.asyncio - async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content): - """Test updating document content with empty data returns current content.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - empty_update = {} - - # Act - result = await in_memory_repository.update_document_content(str(created_content.id), empty_update) - - # Assert - assert result is not None - assert result.content == created_content.content - assert result.encoding == created_content.encoding - assert result.mime_type == created_content.mime_type - - -class TestDocumentContentRepositoryDeletion: - """Tests for document content deletion functionality.""" - - @pytest.mark.asyncio - async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content deletion.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - deletion_result = await in_memory_repository.delete_document_content(str(created_content.id)) - - # Assert - assert deletion_result is True - - # Verify content is actually deleted - found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) - assert found_content is None - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository): - """Test that deleting with invalid ID returns False.""" - # Act - result = await in_memory_repository.delete_document_content("invalid_id") - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository): - """Test that deleting nonexistent document content returns False.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - result = await in_memory_repository.delete_document_content(nonexistent_id) - - 
# Assert - assert result is False - - -class TestDocumentContentRepositoryUtilities: - """Tests for utility methods.""" - - @pytest.mark.asyncio - async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content): - """Test checking if document content exists by file hash.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - - # Act - exists = await in_memory_repository.content_exists(sample_document_content.file_hash) - not_exists = await in_memory_repository.content_exists("nonexistent_hash") - - # Assert - assert exists is True - assert not_exists is False - - @pytest.mark.asyncio - async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content, - another_document_content): - """Test listing document contents with pagination.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - await in_memory_repository.create_document_content(another_document_content) - - # Act - all_contents = await in_memory_repository.list_document_contents() - limited_contents = await in_memory_repository.list_document_contents(skip=0, limit=1) - - # Assert - assert len(all_contents) == 2 - assert len(limited_contents) == 1 - assert all(isinstance(content, DocumentContent) for content in all_contents) - - @pytest.mark.asyncio - async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content, - another_document_content): - """Test counting document contents.""" - # Arrange - initial_count = await in_memory_repository.count_document_contents() - await in_memory_repository.create_document_content(sample_document_content) - await in_memory_repository.create_document_content(another_document_content) - - # Act - final_count = await in_memory_repository.count_document_contents() - - # Assert - assert final_count == initial_count + 2 \ No newline at end of file diff --git a/tests/test_document_repository.py 
b/tests/test_document_repository.py deleted file mode 100644 index a5cc5c1..0000000 --- a/tests/test_document_repository.py +++ /dev/null @@ -1,566 +0,0 @@ -""" -Test suite for FileDocumentRepository with async/await support. - -This module contains comprehensive tests for all FileDocumentRepository methods -using mongomock-motor for in-memory MongoDB testing. -""" - -import pytest -from datetime import datetime -from typing import Dict, Any - -import pytest_asyncio -from bson import ObjectId -from pymongo.errors import DuplicateKeyError, PyMongoError -from mongomock_motor import AsyncMongoMockClient - -from app.database.repositories.document_repository import FileDocumentRepository -from app.models.document import FileDocument, FileType - - -@pytest_asyncio.fixture -async def in_memory_repository(): - """Create an in-memory FileDocumentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - # repo.db = db - # repo.collection = db.files - await repo.initialize() - return repo - - -@pytest.fixture -def sample_file_document(): - """Sample FileDocument data for testing.""" - return FileDocument( - filename="test_document.pdf", - filepath="/path/to/test_document.pdf", - file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", - file_type=FileType("pdf"), - detected_at=datetime.now(), - ) - - -@pytest.fixture -def sample_update_data(): - """Sample update data for testing.""" - return { - "metadata": {"tags": ["updated", "document"]}, - "file_type": FileType("txt"), - } - - -@pytest.fixture -def multiple_sample_documents(): - """Multiple FileDocument objects for list/search testing.""" - base_time = datetime.now() - return [ - FileDocument( - filename="document1.pdf", - filepath="/path/to/document1.pdf", - file_hash="hash1" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ), - FileDocument( - filename="similar_document.pdf", - 
filepath="/path/to/similar_document.pdf", - file_hash="hash2" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ), - FileDocument( - filename="completely_different.txt", - filepath="/path/to/completely_different.txt", - file_hash="hash3" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ) - ] - - -class TestFileDocumentRepositoryInitialization: - """Tests for repository initialization.""" - - @pytest.mark.asyncio - async def test_i_can_initialize_repository(self): - """Test repository initialization.""" - # Arrange - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - await repo.initialize() - - # Act & Assert (should not raise any exception) - assert repo.db is not None - assert repo.collection is not None - # TODO : check that the indexes are create - - -class TestFileDocumentRepositoryCreation: - """Tests for file document creation functionality.""" - - @pytest.mark.asyncio - async def test_i_can_create_document(self, in_memory_repository, sample_file_document): - """Test successful file document creation.""" - # Act - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Assert - assert created_doc is not None - assert created_doc.filename == sample_file_document.filename - assert created_doc.filepath == sample_file_document.filepath - assert created_doc.file_hash == sample_file_document.file_hash - assert created_doc.file_type == sample_file_document.file_type - assert created_doc.id is not None - assert isinstance(created_doc.id, ObjectId) - - @pytest.mark.asyncio - async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document): - """Test creating document with _id set to None (should be removed).""" - # Arrange - sample_file_document.id = None - - # Act - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Assert - assert created_doc is not None - assert created_doc.id is not None 
- assert isinstance(created_doc.id, ObjectId) - - @pytest.mark.asyncio - async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document): - """Test that creating document with duplicate hash raises DuplicateKeyError.""" - # Arrange - await in_memory_repository.create_document(sample_file_document) - duplicate_doc = FileDocument( - filename="different_name.pdf", - filepath=sample_file_document.filepath, - file_hash="different_hash" + "0" * 58, - file_type=FileType("pdf"), - detected_at=datetime.now() - ) - - # Act & Assert - with pytest.raises(DuplicateKeyError) as exc_info: - await in_memory_repository.create_document(duplicate_doc) - - assert "already exists" in str(exc_info.value) - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): - """Test handling of PyMongo errors during document creation.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error")) - - # Act & Assert - with pytest.raises(ValueError) as exc_info: - await in_memory_repository.create_document(sample_file_document) - - assert "Failed to create file document" in str(exc_info.value) - - -class TestFileDocumentRepositoryFinding: - """Tests for file document finding functionality.""" - - @pytest.mark.asyncio - async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document): - """Test finding document by valid ObjectId.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) - - # Assert - assert found_doc is not None - assert found_doc.id == created_doc.id - assert found_doc.filename == created_doc.filename - assert found_doc.file_hash == created_doc.file_hash - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_invalid_id(self, 
in_memory_repository): - """Test that invalid ObjectId returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_id("invalid_id") - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository): - """Test that nonexistent but valid ObjectId returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - found_doc = await in_memory_repository.find_document_by_id(nonexistent_id) - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document): - """Test finding document by file hash.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash) - - # Assert - assert found_doc is not None - assert found_doc.file_hash == created_doc.file_hash - assert found_doc.id == created_doc.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository): - """Test that nonexistent hash returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash") - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document): - """Test finding document by exact filepath.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath) - - # Assert - assert found_doc is not None - assert found_doc.filepath == created_doc.filepath - assert found_doc.id == created_doc.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository): - """Test that nonexistent 
filepath returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf") - - # Assert - assert found_doc is None - - -class TestFileDocumentRepositoryFuzzySearch: - """Tests for fuzzy search functionality by filename.""" - - @pytest.mark.asyncio - async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents): - """Test finding documents with exact filename match.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - found_docs = await in_memory_repository.find_document_by_name("document1.pdf") - - # Assert - assert len(found_docs) == 1 - assert found_docs[0].filename == "document1.pdf" - - @pytest.mark.asyncio - async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents): - """Test finding documents with fuzzy matching using default threshold.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - found_docs = await in_memory_repository.find_document_by_name("document") - - # Assert - assert len(found_docs) >= 2 # Should find document1.pdf and similar_document.pdf - filenames = [doc.filename for doc in found_docs] - assert "document1.pdf" in filenames - assert "similar_document.pdf" in filenames - - @pytest.mark.asyncio - async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker): - """Test handling of PyMongo errors during name search.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) - - # Act - found_docs = await in_memory_repository.find_document_by_name("test") - - # Assert - assert found_docs == [] - - -class TestFileDocumentRepositoryListing: - """Tests for document listing functionality.""" - - @pytest.mark.asyncio - async def test_i_can_list_documents_with_default_pagination(self, 
in_memory_repository, multiple_sample_documents): - """Test listing documents with default pagination.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert len(docs) == len(multiple_sample_documents) - assert all(isinstance(doc, FileDocument) for doc in docs) - - @pytest.mark.asyncio - async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents): - """Test listing documents with custom pagination.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2) - docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2) - - # Assert - assert len(docs_page1) == 2 - assert len(docs_page2) == 1 # Only 3 total documents - - # Ensure no overlap between pages - page1_ids = [doc.id for doc in docs_page1] - page2_ids = [doc.id for doc in docs_page2] - assert len(set(page1_ids).intersection(set(page2_ids))) == 0 - - @pytest.mark.asyncio - async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document): - """Test that documents are sorted by detected_at in descending order.""" - # Arrange - from datetime import timedelta - - # Create documents with different timestamps - doc1 = sample_file_document.model_copy() - doc1.filename = "oldest.pdf" - doc1.filepath = f"/path/to/{doc1.filename}" - doc1.file_hash = "hash1" + "0" * 58 - doc1.detected_at = datetime.now() - timedelta(hours=2) - - doc2 = sample_file_document.model_copy() - doc2.filename = "newest.pdf" - doc2.filepath = f"/path/to/{doc2.filename}" - doc2.file_hash = "hash2" + "0" * 58 - doc2.detected_at = datetime.now() - - await in_memory_repository.create_document(doc1) - await in_memory_repository.create_document(doc2) - - # Act - docs = await 
in_memory_repository.list_documents() - - # Assert - assert len(docs) == 2 - assert docs[0].filename == "newest.pdf" # Most recent first - assert docs[1].filename == "oldest.pdf" - - @pytest.mark.asyncio - async def test_i_can_list_empty_documents(self, in_memory_repository): - """Test listing documents from empty collection.""" - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert docs == [] - - @pytest.mark.asyncio - async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker): - """Test handling of PyMongo errors during document listing.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) - - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert docs == [] - - -class TestFileDocumentRepositoryUpdate: - """Tests for document update functionality.""" - - @pytest.mark.asyncio - async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document, - sample_update_data): - """Test successful document update.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) - - # Assert - assert updated_doc is not None - assert updated_doc.file_type == sample_update_data["file_type"] - assert updated_doc.id == created_doc.id - assert updated_doc.filename == created_doc.filename # Unchanged fields remain - - @pytest.mark.asyncio - async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document): - """Test updating document with partial data.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - partial_update = {"file_type": FileType("txt")} - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update) - - # Assert - 
assert updated_doc is not None - assert updated_doc.file_type == FileType("txt") - assert updated_doc.filename == created_doc.filename # Should remain unchanged - assert updated_doc.filepath == created_doc.filepath # Should remain unchanged - - @pytest.mark.asyncio - async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document): - """Test that None values are filtered out from update data.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None} - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none) - - # Assert - assert updated_doc is not None - assert updated_doc.metadata == {"tags": ["updated", "document"]} - assert updated_doc.file_type == created_doc.file_type # Should remain unchanged (None filtered out) - - @pytest.mark.asyncio - async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document): - """Test updating document with empty data returns current document.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - empty_update = {} - - # Act - result = await in_memory_repository.update_document(str(created_doc.id), empty_update) - - # Assert - assert result is not None - assert result.filename == created_doc.filename - assert result.file_hash == created_doc.file_hash - assert result.metadata == created_doc.metadata - - @pytest.mark.asyncio - async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data): - """Test that updating with invalid ID returns None.""" - # Act - result = await in_memory_repository.update_document("invalid_id", sample_update_data) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, 
sample_update_data): - """Test that updating nonexistent document returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - result = await in_memory_repository.update_document(nonexistent_id, sample_update_data) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document, - sample_update_data, mocker): - """Test handling of PyMongo errors during document update.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', - side_effect=PyMongoError("Database error")) - - # Act - result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) - - # Assert - assert result is None - - -class TestFileDocumentRepositoryDeletion: - """Tests for document deletion functionality.""" - - @pytest.mark.asyncio - async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document): - """Test successful document deletion.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - deletion_result = await in_memory_repository.delete_document(str(created_doc.id)) - - # Assert - assert deletion_result is True - - # Verify document is actually deleted - found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository): - """Test that deleting with invalid ID returns False.""" - # Act - result = await in_memory_repository.delete_document("invalid_id") - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository): - """Test that deleting nonexistent document returns False.""" - # Arrange - nonexistent_id = 
str(ObjectId()) - - # Act - result = await in_memory_repository.delete_document(nonexistent_id) - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): - """Test handling of PyMongo errors during document deletion.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error")) - - # Act - result = await in_memory_repository.delete_document(str(created_doc.id)) - - # Assert - assert result is False - - -class TestFileDocumentRepositoryUtilities: - """Tests for utility methods.""" - - @pytest.mark.asyncio - async def test_i_can_count_documents(self, in_memory_repository, sample_file_document): - """Test counting documents.""" - # Arrange - initial_count = await in_memory_repository.count_documents() - await in_memory_repository.create_document(sample_file_document) - - # Act - final_count = await in_memory_repository.count_documents() - - # Assert - assert final_count == initial_count + 1 - - @pytest.mark.asyncio - async def test_i_can_count_zero_documents(self, in_memory_repository): - """Test counting documents in empty collection.""" - # Act - count = await in_memory_repository.count_documents() - - # Assert - assert count == 0 diff --git a/tests/test_document_service.py b/tests/test_document_service.py deleted file mode 100644 index 532c2c4..0000000 --- a/tests/test_document_service.py +++ /dev/null @@ -1,697 +0,0 @@ -""" -Unit tests for DocumentService using in-memory MongoDB. - -Tests the orchestration logic with real MongoDB operations -using mongomock for better integration testing. 
-""" - -import pytest -import pytest_asyncio -from unittest.mock import Mock, patch -from datetime import datetime -from bson import ObjectId -from pathlib import Path - -from mongomock_motor import AsyncMongoMockClient - -from app.services.document_service import DocumentService -from app.database.repositories.document_repository import FileDocumentRepository -from app.database.repositories.document_content_repository import DocumentContentRepository -from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod -from app.models.types import PyObjectId - - -@pytest_asyncio.fixture -async def in_memory_file_repository(): - """Create an in-memory FileDocumentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - await repo.initialize() - return repo - - -@pytest_asyncio.fixture -async def in_memory_content_repository(): - """Create an in-memory DocumentContentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = DocumentContentRepository(db) - await repo.initialize() - return repo - - -@pytest_asyncio.fixture -async def in_memory_database(): - """Create an in-memory database for testing.""" - client = AsyncMongoMockClient() - return client.test_database - - -@pytest_asyncio.fixture -async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database): - """Create DocumentService with in-memory repositories.""" - with patch('app.services.document_service.get_database', return_value=in_memory_database): - service = DocumentService() - service.file_repository = in_memory_file_repository - service.content_repository = in_memory_content_repository - return service - - -@pytest.fixture -def sample_file_bytes(): - """Sample file content as bytes.""" - return b"This is a test PDF content" - - -@pytest.fixture -def sample_text_bytes(): - """Sample text file content as bytes.""" - return b"This is a 
test text file content" - - -@pytest.fixture -def sample_file_hash(): - """Expected SHA256 hash for sample file bytes.""" - import hashlib - return hashlib.sha256(b"This is a test PDF content").hexdigest() - - -@pytest.fixture -def sample_file_document(): - """Sample FileDocument for testing.""" - return FileDocument( - id=ObjectId(), - filename="test.pdf", - filepath="/test/test.pdf", - file_type=FileType.PDF, - extraction_method=None, - metadata={}, - detected_at=datetime(2024, 1, 15, 10, 30, 0), - file_hash="test_hash" - ) - - -class TestCreateDocument: - """Tests for create_document method.""" - - @patch('app.services.document_service.magic.from_buffer') - @patch('app.services.document_service.datetime') - @pytest.mark.asyncio - async def test_i_can_create_document_with_new_content( - self, - mock_datetime, - mock_magic, - document_service, - sample_file_bytes - ): - """Test creating document when content doesn't exist yet.""" - # Setup mocks - fixed_time = datetime(2024, 1, 15, 10, 30, 0) - mock_datetime.utcnow.return_value = fixed_time - mock_magic.return_value = "application/pdf" - - # Execute - result = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify document creation - assert result is not None - assert result.filename == "test.pdf" - assert result.filepath == "/test/test.pdf" - assert result.file_type == FileType.PDF - assert result.detected_at == fixed_time - assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) - - # Verify content was created - content = await document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content is not None - assert content.file_hash == result.file_hash - assert content.file_size == len(sample_file_bytes) - assert content.mime_type == "application/pdf" - assert content.encoding == "utf-8" - - @patch('app.services.document_service.magic.from_buffer') - 
@patch('app.services.document_service.datetime') - @pytest.mark.asyncio - async def test_i_can_create_document_with_existing_content( - self, - mock_datetime, - mock_magic, - document_service, - sample_file_bytes - ): - """Test creating document when content already exists (deduplication).""" - # Setup mocks - fixed_time = datetime(2024, 1, 15, 10, 30, 0) - mock_datetime.utcnow.return_value = fixed_time - mock_magic.return_value = "application/pdf" - - # Create first document - first_doc = await document_service.create_document( - "/test/first.pdf", - sample_file_bytes, - "utf-8" - ) - - # Create second document with same content - second_doc = await document_service.create_document( - "/test/second.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify both documents exist but share same hash - assert first_doc.file_hash == second_doc.file_hash - assert first_doc.filename != second_doc.filename - assert first_doc.filepath != second_doc.filepath - - # Verify only one content document exists - all_content = await document_service.content_repository.list_document_content() - content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash] - assert len(content_for_hash) == 1 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_create_document_with_different_encodings( - self, - mock_magic, - document_service, - sample_text_bytes - ): - """Test creating documents with different text encodings.""" - # Setup - mock_magic.return_value = "text/plain" - - # Test with different encodings - encodings = ["utf-8", "latin-1", "ascii"] - - for i, encoding in enumerate(encodings): - result = await document_service.create_document( - f"/test/test{i}.txt", - sample_text_bytes, - encoding - ) - - # Verify document was created - assert result is not None - assert result.file_type == FileType.TXT - - # Verify content has correct encoding - content = await 
document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content.encoding == encoding - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_unsupported_file_type( - self, - document_service, - sample_file_bytes - ): - """Test that unsupported file types raise ValueError.""" - with pytest.raises(ValueError, match="Unsupported file type"): - await document_service.create_document( - "/test/test.xyz", # Unsupported extension - sample_file_bytes, - "utf-8" - ) - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_empty_file_path( - self, - document_service, - sample_file_bytes - ): - """Test that empty file path raises ValueError.""" - with pytest.raises(ValueError): - await document_service.create_document( - "", # Empty path - sample_file_bytes, - "utf-8" - ) - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_create_document_with_empty_bytes( - self, - mock_magic, - document_service - ): - """Test behavior with empty file bytes.""" - # Setup - mock_magic.return_value = "text/plain" - - # Execute with empty bytes - result = await document_service.create_document( - "/test/empty.txt", - b"", # Empty bytes - "utf-8" - ) - - # Should still work but with zero file size - assert result is not None - content = await document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content.file_size == 0 - - -class TestGetMethods: - """Tests for document retrieval methods.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_id( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by ID.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) 
- - # Execute - result = await document_service.get_document_by_id(created_doc.id) - - # Verify - assert result is not None - assert result.id == created_doc.id - assert result.filename == created_doc.filename - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_hash( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by file hash.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_by_hash(created_doc.file_hash) - - # Verify - assert result is not None - assert result.file_hash == created_doc.file_hash - assert result.filename == created_doc.filename - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_filepath( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by file path.""" - # Setup - mock_magic.return_value = "application/pdf" - test_path = "/test/unique_test.pdf" - - # Create a document first - created_doc = await document_service.create_document( - test_path, - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_by_filepath(test_path) - - # Verify - assert result is not None - assert result.filepath == test_path - assert result.id == created_doc.id - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_with_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document with associated content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - 
sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_with_content(created_doc.id) - - # Verify - assert result is not None - document, content = result - assert document.id == created_doc.id - assert content is not None - assert content.file_hash == created_doc.file_hash - - @pytest.mark.asyncio - async def test_i_cannot_get_nonexistent_document_by_id( - self, - document_service - ): - """Test that nonexistent document returns None.""" - # Execute with random ObjectId - result = await document_service.get_document_by_id(ObjectId()) - - # Verify - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_get_nonexistent_document_by_hash( - self, - document_service - ): - """Test that nonexistent document hash returns None.""" - # Execute - result = await document_service.get_document_by_hash("nonexistent_hash") - - # Verify - assert result is None - - -class TestPaginationAndCounting: - """Tests for document listing and counting.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_list_documents_with_pagination( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test document listing with pagination parameters.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create multiple documents - for i in range(5): - await document_service.create_document( - f"/test/test{i}.pdf", - sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique - "utf-8" - ) - - # Execute with pagination - result = await document_service.list_documents(skip=1, limit=2) - - # Verify - assert len(result) == 2 - - # Test counting - total_count = await document_service.count_documents() - assert total_count == 5 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_count_documents( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test document counting.""" - # Setup - 
mock_magic.return_value = "text/plain" - - # Initially should be 0 - initial_count = await document_service.count_documents() - assert initial_count == 0 - - # Create some documents - for i in range(3): - await document_service.create_document( - f"/test/test{i}.txt", - sample_file_bytes + bytes(str(i), 'utf-8'), - "utf-8" - ) - - # Execute - final_count = await document_service.count_documents() - - # Verify - assert final_count == 3 - - -class TestUpdateAndDelete: - """Tests for document update and deletion operations.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_update_document_metadata( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test updating document metadata.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute update - update_data = {"metadata": {"page_count": 5}} - result = await document_service.update_document(created_doc.id, update_data) - - # Verify - assert result is not None - assert result.metadata.get("page_count") == 5 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_delete_document_and_orphaned_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test deleting document with orphaned content cleanup.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify content exists - content_before = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert content_before is not None - - # Execute deletion - result = await document_service.delete_document(created_doc.id) - - # Verify document and content are deleted - 
assert result is True - - deleted_doc = await document_service.get_document_by_id(created_doc.id) - assert deleted_doc is None - - content_after = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert content_after is None - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_delete_document_without_affecting_shared_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test deleting document without removing shared content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create two documents with same content - doc1 = await document_service.create_document( - "/test/test1.pdf", - sample_file_bytes, - "utf-8" - ) - - doc2 = await document_service.create_document( - "/test/test2.pdf", - sample_file_bytes, - "utf-8" - ) - - # They should share the same hash - assert doc1.file_hash == doc2.file_hash - - # Delete first document - result = await document_service.delete_document(doc1.id) - assert result is True - - # Verify first document is deleted but content still exists - deleted_doc = await document_service.get_document_by_id(doc1.id) - assert deleted_doc is None - - remaining_doc = await document_service.get_document_by_id(doc2.id) - assert remaining_doc is not None - - content = await document_service.content_repository.find_document_content_by_file_hash( - doc2.file_hash - ) - assert content is not None - - -class TestUtilityMethods: - """Tests for utility methods.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_check_content_exists( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test checking if content exists by hash.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Initially content doesn't exist - test_hash = "nonexistent_hash" - exists_before = await document_service.content_exists(test_hash) - assert 
exists_before is False - - # Create a document - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Now content should exist - exists_after = await document_service.content_exists(created_doc.file_hash) - assert exists_after is True - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_update_document_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test updating extracted document content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Update content - new_content = "Updated extracted content" - result = await document_service.update_document_content( - created_doc.file_hash, - new_content - ) - - # Verify update - assert result is not None - assert result.content == new_content - - # Verify persistence - updated_content = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert updated_content.content == new_content - - -class TestHashCalculation: - """Tests for file hash calculation utility.""" - - def test_i_can_calculate_consistent_file_hash(self, document_service): - """Test that file hash calculation is consistent.""" - test_bytes = b"Test content for hashing" - - # Calculate hash multiple times - hash1 = document_service._calculate_file_hash(test_bytes) - hash2 = document_service._calculate_file_hash(test_bytes) - - # Should be identical - assert hash1 == hash2 - assert len(hash1) == 64 # SHA256 produces 64-character hex string - - def test_i_get_different_hashes_for_different_content(self, document_service): - """Test that different content produces different hashes.""" - content1 = b"First content" - content2 = b"Second content" - - hash1 = document_service._calculate_file_hash(content1) - hash2 
= document_service._calculate_file_hash(content2) - - assert hash1 != hash2 - - -class TestFileTypeDetection: - """Tests for file type detection.""" - - def test_i_can_detect_pdf_file_type(self, document_service): - """Test PDF file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.pdf") - assert file_type == FileType.PDF - - def test_i_can_detect_txt_file_type(self, document_service): - """Test text file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.txt") - assert file_type == FileType.TXT - - def test_i_can_detect_docx_file_type(self, document_service): - """Test DOCX file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.docx") - assert file_type == FileType.DOCX - - def test_i_cannot_detect_unsupported_file_type(self, document_service): - """Test unsupported file type raises ValueError.""" - with pytest.raises(ValueError, match="Unsupported file type"): - document_service._detect_file_type("/path/to/document.xyz") \ No newline at end of file diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_utils_document_matching.py b/tests/utils/test_document_matching.py similarity index 95% rename from tests/test_utils_document_matching.py rename to tests/utils/test_document_matching.py index ea83895..9025502 100644 --- a/tests/test_utils_document_matching.py +++ b/tests/utils/test_document_matching.py @@ -14,6 +14,8 @@ def get_doc(filename: str = None): file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"), detected_at=datetime.now(), + file_size=1024, + mime_type="application/pdf" ) diff --git a/tests/test_security.py b/tests/utils/test_security.py similarity index 100% rename from tests/test_security.py rename to tests/utils/test_security.py