From c3ea80363ff2983e3b1c552662fd73ac4dd85aeb Mon Sep 17 00:00:00 2001 From: Kodjo Sossouvi Date: Thu, 18 Sep 2025 22:53:51 +0200 Subject: [PATCH] Working on document repository --- Readme.md | 315 ++++++--- pytest.ini | 7 + requirements.txt | 11 +- .../repositories/document_repository.py | 248 ++++++++ .../database/repositories/user_repository.py | 144 +++-- src/file-processor/app/main.py | 1 - src/file-processor/app/models/document.py | 142 +++++ src/file-processor/app/models/types.py | 32 + src/file-processor/app/models/user.py | 29 +- src/file-processor/requirements.txt | 1 + tests/test_document_repository.py | 602 ++++++++++++++++++ tests/test_user_repository.py | 591 ++++++++--------- 12 files changed, 1597 insertions(+), 526 deletions(-) create mode 100644 pytest.ini create mode 100644 src/file-processor/app/database/repositories/document_repository.py create mode 100644 src/file-processor/app/models/document.py create mode 100644 src/file-processor/app/models/types.py create mode 100644 tests/test_document_repository.py diff --git a/Readme.md b/Readme.md index 31be15e..036ef5a 100644 --- a/Readme.md +++ b/Readme.md @@ -178,93 +178,6 @@ DELETE /users/{user_id} # Delete user (admin only) GET /users/me # Get current user profile (authenticated users) ``` -## Docker Commands Reference - -### Initial Setup & Build - -```bash -# Build and start all services (first time) -docker-compose up --build - -# Build and start in background -docker-compose up --build -d - -# Build specific service -docker-compose build file-processor -docker-compose build worker -``` - -### Development Workflow - -```bash -# Start all services -docker-compose up - -# Start in background (detached mode) -docker-compose up -d - -# Stop all services -docker-compose down - -# Stop and remove volumes (⚠️ deletes MongoDB data) -docker-compose down -v - -# Restart specific service -docker-compose restart file-processor -docker-compose restart worker -docker-compose restart redis -docker-compose restart mongodb -``` - -### Monitoring & Debugging - -```bash -# View logs of all services -docker-compose logs - -# View logs of specific service -docker-compose logs file-processor -docker-compose logs worker -docker-compose logs redis -docker-compose logs mongodb - -# Follow logs in real-time -docker-compose logs -f -docker-compose logs -f worker - -# View running containers -docker-compose ps - -# Execute command in running container -docker-compose exec file-processor bash -docker-compose exec worker bash -docker-compose exec mongodb mongosh -``` - -### Service Management - -```bash -# Start only specific services -docker-compose up redis mongodb file-processor - -# Stop specific service -docker-compose stop worker -docker-compose stop file-processor - -# Remove stopped containers -docker-compose rm - -# Scale workers (multiple instances) -docker-compose up --scale worker=3 -``` - -### Hot-Reload Configuration - -- **file-processor**: Hot-reload enabled via `--reload` flag - - Code changes in `src/file-processor/app/` automatically restart FastAPI -- **worker**: No hot-reload (manual restart required for stability) - - Code changes in `src/worker/tasks/` require: `docker-compose restart worker` - ### Useful Service URLs - **FastAPI API**: http://localhost:8000 @@ -298,6 +211,118 @@ On first startup, the application automatically creates a default admin user: - **Email**: `admin@mydocmanager.local` **⚠️ Important**: Change the default admin password immediately after first login in production environments. 
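The password can be rotated through the user-management API described above. The commands below are a minimal sketch only: they assume a `POST /auth/login` route that returns an `access_token`, an `id` field in the `/users/me` response, and a `PUT /users/{user_id}` update route that accepts a `password` field — verify these names against the actual route definitions and schemas before using them.

```bash
# 1. Log in as the default admin (assumed JSON body fields: username/password)
TOKEN=$(curl -s -X POST http://localhost:8000/auth/login \
  -H "Content-Type: application/json" \
  -d '{"username": "<default-admin-username>", "password": "<default-admin-password>"}' \
  | python3 -c 'import json,sys; print(json.load(sys.stdin)["access_token"])')

# 2. Look up the admin user's id from the current-user profile (assumed field: id)
ADMIN_ID=$(curl -s http://localhost:8000/users/me \
  -H "Authorization: Bearer $TOKEN" \
  | python3 -c 'import json,sys; print(json.load(sys.stdin)["id"])')

# 3. Set a new password via the user-management route (admin only; assumed field: password)
curl -s -X PUT "http://localhost:8000/users/$ADMIN_ID" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"password": "<new-strong-password>"}'
```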
+## File Processing Architecture + +### Document Processing Flow + +1. **File Detection**: Watchdog monitors `/volumes/watched_files/` directory in real-time +2. **Task Creation**: File watcher creates Celery task for each detected file +3. **Document Processing**: Celery worker processes the document and extracts content +4. **Database Storage**: Processed data stored in MongoDB collections + +### MongoDB Collections Design + +#### Files Collection + +Stores file metadata and extracted content: + +```json +{ + "_id": "ObjectId", + "filename": "document.pdf", + "filepath": "/watched_files/document.pdf", + "file_type": "pdf", + "mime_type": "application/pdf", + "file_size": 2048576, + "content": "extracted text content...", + "encoding": "utf-8", + "extraction_method": "direct_text", + // direct_text, ocr, hybrid + "metadata": { + "page_count": 15, + // for PDFs + "word_count": 250, + // for text files + "image_dimensions": { + // for images + "width": 1920, + "height": 1080 + } + }, + "detected_at": "2024-01-15T10:29:00Z", + "file_hash": "sha256_hash_value" +} +``` + +#### Processing Jobs Collection + +Tracks processing status and lifecycle: + +```json +{ + "_id": "ObjectId", + "file_id": "reference_to_files_collection", + "status": "completed", + // pending, processing, completed, failed + "task_id": "celery_task_uuid", + "created_at": "2024-01-15T10:29:00Z", + "started_at": "2024-01-15T10:29:30Z", + "completed_at": "2024-01-15T10:30:00Z", + "error_message": null +} +``` + +### Supported File Types (Initial Implementation) + +- **Text Files** (`.txt`): Direct content reading +- **PDF Documents** (`.pdf`): Text extraction via PyMuPDF/pdfplumber +- **Word Documents** (`.docx`): Content extraction via python-docx + +### File Processing Architecture Decisions + +#### Watchdog Implementation + +- **Choice**: Dedicated observer thread (Option A) +- **Rationale**: Standard approach, clean separation of concerns +- **Implementation**: Watchdog observer runs in separate thread from FastAPI + +#### Task Dispatch Strategy + +- **Choice**: Direct Celery task creation from file watcher +- **Rationale**: Minimal latency, straightforward flow +- **Implementation**: File detected → Immediate Celery task dispatch + +#### Data Storage Strategy + +- **Choice**: Separate collections for files and processing status +- **Rationale**: Clean separation of file data vs processing lifecycle +- **Benefits**: + - Better query performance + - Clear data model boundaries + - Easy processing status tracking + +#### Content Storage Location + +- **Choice**: Store extracted content in `files` collection +- **Rationale**: Content is intrinsic property of the file +- **Benefits**: Single query to get file + content, simpler data model + +### Implementation Order + +1. ✅ Pydantic models for MongoDB collections +2. ✅ Repository layer for data access (files + processing_jobs) +3. ✅ Celery tasks for document processing +4. ✅ Watchdog file monitoring implementation +5. 
✅ FastAPI integration and startup coordination + +### Processing Pipeline Features + +- **Duplicate Detection**: SHA256 hashing prevents reprocessing same files +- **Error Handling**: Failed processing tracked with error messages +- **Status Tracking**: Real-time processing status via `processing_jobs` collection +- **Extensible Metadata**: Flexible metadata storage per file type +- **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches + ## Key Implementation Notes ### Python Standards @@ -338,6 +363,11 @@ On first startup, the application automatically creates a default admin user: 7. **Celery with Redis**: Chosen over other async patterns for scalability 8. **EasyOCR Preferred**: Selected over Tesseract for modern OCR needs 9. **Container Development**: Hot-reload setup required for development workflow +10. **Dedicated Watchdog Observer**: Thread-based file monitoring for reliability +11. **Separate MongoDB Collections**: Files and processing jobs stored separately +12. **Content in Files Collection**: Extracted content stored with file metadata +13. **Direct Task Dispatch**: File watcher directly creates Celery tasks +14. **SHA256 Duplicate Detection**: Prevents reprocessing identical files ### Development Process Requirements @@ -351,13 +381,104 @@ On first startup, the application automatically creates a default admin user: 1. ✅ Create docker-compose.yml with all services => Done 2. ✅ Define user management and authentication architecture => Done 3. ✅ Implement user models and authentication services => - 1. models/user.py => Done - 2. models/auth.py => Done - 3. database/repositories/user_repository.py => Done -4. Add automatic admin user creation if it does not exists -5. Create protected API routes for user management -6. Implement basic FastAPI service structure -7. Add watchdog file monitoring -8. Create Celery task structure -9. Implement document processing tasks -10. Build React monitoring interface with authentication + 1. models/user.py => Done + 2. models/auth.py => Done + 3. database/repositories/user_repository.py => Done +4. ✅ Add automatic admin user creation if it does not exists => Done +5. **IN PROGRESS**: Implement file processing pipeline => + 1. Create Pydantic models for files and processing_jobs collections + 2. Implement repository layer for file and processing job data access + 3. Create Celery tasks for document processing (.txt, .pdf, .docx) + 4. Implement Watchdog file monitoring with dedicated observer + 5. Integrate file watcher with FastAPI startup +6. Create protected API routes for user management +7. 
Build React monitoring interface with authentication + +## Annexes + +### Docker Commands Reference + +#### Initial Setup & Build + +```bash +# Build and start all services (first time) +docker-compose up --build + +# Build and start in background +docker-compose up --build -d + +# Build specific service +docker-compose build file-processor +docker-compose build worker +``` + +#### Development Workflow + +```bash +# Start all services +docker-compose up + +# Start in background (detached mode) +docker-compose up -d + +# Stop all services +docker-compose down + +# Stop and remove volumes (⚠️ deletes MongoDB data) +docker-compose down -v + +# Restart specific service +docker-compose restart file-processor +docker-compose restart worker +docker-compose restart redis +docker-compose restart mongodb +``` + +#### Monitoring & Debugging + +```bash +# View logs of all services +docker-compose logs + +# View logs of specific service +docker-compose logs file-processor +docker-compose logs worker +docker-compose logs redis +docker-compose logs mongodb + +# Follow logs in real-time +docker-compose logs -f +docker-compose logs -f worker + +# View running containers +docker-compose ps + +# Execute command in running container +docker-compose exec file-processor bash +docker-compose exec worker bash +docker-compose exec mongodb mongosh +``` + +#### Service Management + +```bash +# Start only specific services +docker-compose up redis mongodb file-processor + +# Stop specific service +docker-compose stop worker +docker-compose stop file-processor + +# Remove stopped containers +docker-compose rm + +# Scale workers (multiple instances) +docker-compose up --scale worker=3 +``` + +### Hot-Reload Configuration + +- **file-processor**: Hot-reload enabled via `--reload` flag + - Code changes in `src/file-processor/app/` automatically restart FastAPI +- **worker**: No hot-reload (manual restart required for stability) + - Code changes in `src/worker/tasks/` require: `docker-compose restart worker` diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..7171ad4 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,7 @@ +[tool:pytest] +asyncio_mode = auto +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +pythonpath = src/file-processor \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b896ce5..e800676 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ annotated-types==0.7.0 anyio==4.10.0 bcrypt==4.3.0 billiard==4.2.1 -bson==0.5.10 celery==5.5.3 click==8.2.1 click-didyoumean==0.3.1 @@ -17,17 +16,25 @@ httptools==0.6.4 idna==3.10 iniconfig==2.1.0 kombu==5.5.4 +mongomock==4.3.0 +mongomock-motor==0.0.36 +motor==3.7.1 packaging==25.0 +pipdeptree==2.28.0 pluggy==1.6.0 prompt_toolkit==3.0.52 pydantic==2.11.9 pydantic_core==2.33.2 Pygments==2.19.2 -pymongo==4.15.0 +pymongo==4.15.1 pytest==8.4.2 +pytest-asyncio==1.2.0 +pytest-mock==3.15.1 python-dateutil==2.9.0.post0 python-dotenv==1.1.1 +pytz==2025.2 PyYAML==6.0.2 +sentinels==1.1.1 six==1.17.0 sniffio==1.3.1 starlette==0.47.3 diff --git a/src/file-processor/app/database/repositories/document_repository.py b/src/file-processor/app/database/repositories/document_repository.py new file mode 100644 index 0000000..7cc468b --- /dev/null +++ b/src/file-processor/app/database/repositories/document_repository.py @@ -0,0 +1,248 @@ +""" +File repository for database operations on FileDocument collection. 
+ +This module provides data access operations for file documents stored +in MongoDB with proper error handling and type safety. +""" + +from typing import Optional, List +from bson import ObjectId +from pymongo.errors import DuplicateKeyError, PyMongoError +from difflib import SequenceMatcher +from motor.motor_asyncio import AsyncIOMotorCollection +from app.models.document import FileDocument +from app.database.connection import get_database + + +class FileDocumentRepository: + """ + Repository class for file document database operations. + + This class handles all database operations for FileDocument objects + with proper error handling and data validation. + """ + + def __init__(self): + """Initialize file repository with database connection.""" + self.db = get_database() + self.collection: AsyncIOMotorCollection = self.db.files + self._ensure_indexes() + + async def _ensure_indexes(self): + """ + Ensure required database indexes exist. + + Creates unique index on username field to prevent duplicates. + """ + try: + await self.collection.create_index("filepath", unique=True) + except PyMongoError: + # Index might already exist, ignore error + pass + + async def create_document(self, file_data: FileDocument) -> FileDocument: + """ + Create a new file document in database. + + Args: + file_data (FileDocument): File document data to create + + Returns: + FileDocument: Created file document with database ID + + Raises: + ValueError: If file creation fails due to validation + DuplicateKeyError: If file with same hash already exists + """ + try: + file_dict = file_data.model_dump(by_alias=True, exclude_unset=True) + if "_id" in file_dict and file_dict["_id"] is None: + del file_dict["_id"] + + result = await self.collection.insert_one(file_dict) + file_data.id = result.inserted_id + return file_data + + except DuplicateKeyError as e: + raise DuplicateKeyError(f"File with same hash already exists: {e}") + except PyMongoError as e: + raise ValueError(f"Failed to create file document: {e}") + + async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]: + """ + Find file document by ID. + + Args: + file_id (str): File document ID to search for + + Returns: + FileDocument or None: File document if found, None otherwise + """ + try: + if not ObjectId.is_valid(file_id): + return None + + file_doc = await self.collection.find_one({"_id": ObjectId(file_id)}) + if file_doc: + return FileDocument(**file_doc) + return None + + except PyMongoError: + return None + + async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: + """ + Find file document by file hash to detect duplicates. + + Args: + file_hash (str): SHA256 hash of file content + + Returns: + FileDocument or None: File document if found, None otherwise + """ + try: + file_doc = await self.collection.find_one({"file_hash": file_hash}) + if file_doc: + return FileDocument(**file_doc) + return None + + except PyMongoError: + return None + + async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: + """ + Find file document by exact filepath. 
+ + Args: + filepath (str): Full path to the file + + Returns: + FileDocument or None: File document if found, None otherwise + """ + try: + file_doc = await self.collection.find_one({"filepath": filepath}) + if file_doc: + return FileDocument(**file_doc) + return None + + except PyMongoError: + return None + + async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]: + """ + Find file documents by filename using fuzzy matching. + + Args: + filename (str): Filename to search for + similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0) + + Returns: + List[FileDocument]: List of matching files sorted by similarity score + """ + try: + # Get all files from database + cursor = self.collection.find({}) + all_files = await cursor.to_list(length=None) + + matches = [] + for file_doc in all_files: + file_obj = FileDocument(**file_doc) + # Calculate similarity between search term and filename + similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio() + + if similarity >= similarity_threshold: + matches.append((file_obj, similarity)) + + # Sort by similarity score (highest first) + matches.sort(key=lambda x: x[1], reverse=True) + + # Return only the FileDocument objects + return [match[0] for match in matches] + + except PyMongoError: + return [] + + async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]: + """ + List file documents with pagination. + + Args: + skip (int): Number of documents to skip (default: 0) + limit (int): Maximum number of documents to return (default: 100) + + Returns: + List[FileDocument]: List of file documents + """ + try: + cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1) + file_docs = await cursor.to_list(length=limit) + return [FileDocument(**doc) for doc in file_docs] + + except PyMongoError: + return [] + + async def count_documents(self) -> int: + """ + Count total number of file documents. + + Returns: + int: Total number of file documents in collection + """ + try: + return await self.collection.count_documents({}) + except PyMongoError: + return 0 + + async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]: + """ + Update file document with new data. + + Args: + file_id (str): File document ID to update + update_data (dict): Fields to update + + Returns: + FileDocument or None: Updated file document if successful, None otherwise + """ + try: + if not ObjectId.is_valid(file_id): + return None + + # Remove None values from update data + clean_update_data = {k: v for k, v in update_data.items() if v is not None} + + if not clean_update_data: + return await self.find_document_by_id(file_id) + + result = await self.collection.find_one_and_update( + {"_id": ObjectId(file_id)}, + {"$set": clean_update_data}, + return_document=True + ) + + if result: + return FileDocument(**result) + return None + + except PyMongoError: + return None + + async def delete_document(self, file_id: str) -> bool: + """ + Delete file document from database. 
+ + Args: + file_id (str): File document ID to delete + + Returns: + bool: True if file was deleted, False otherwise + """ + try: + if not ObjectId.is_valid(file_id): + return False + + result = await self.collection.delete_one({"_id": ObjectId(file_id)}) + return result.deleted_count > 0 + + except PyMongoError: + return False diff --git a/src/file-processor/app/database/repositories/user_repository.py b/src/file-processor/app/database/repositories/user_repository.py index c227476..54a5c87 100644 --- a/src/file-processor/app/database/repositories/user_repository.py +++ b/src/file-processor/app/database/repositories/user_repository.py @@ -2,15 +2,14 @@ User repository for MongoDB operations. This module implements the repository pattern for user CRUD operations -with dependency injection of the database connection. +with dependency injection of the database connection using async/await. """ -from typing import Optional, List, Dict, Any +from typing import Optional, List from datetime import datetime from bson import ObjectId -from pymongo.database import Database -from pymongo.errors import DuplicateKeyError -from pymongo.collection import Collection +from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection +from pymongo.errors import DuplicateKeyError, PyMongoError from app.models.user import UserCreate, UserInDB, UserUpdate from app.utils.security import hash_password @@ -21,35 +20,33 @@ class UserRepository: Repository class for user CRUD operations in MongoDB. This class handles all database operations related to users, - following the repository pattern with dependency injection. + following the repository pattern with dependency injection and async/await. """ - def __init__(self, database: Database): + def __init__(self, database: AsyncIOMotorDatabase): """ Initialize repository with database dependency. Args: - database (Database): MongoDB database instance + database (AsyncIOMotorDatabase): MongoDB database instance """ self.db = database - self.collection: Collection = database.users - - # Create unique index on username for duplicate prevention + self.collection: AsyncIOMotorCollection = database.users self._ensure_indexes() - def _ensure_indexes(self): + async def _ensure_indexes(self): """ Ensure required database indexes exist. Creates unique index on username field to prevent duplicates. """ try: - self.collection.create_index("username", unique=True) - except Exception: + await self.collection.create_index("username", unique=True) + except PyMongoError: # Index might already exist, ignore error pass - def create_user(self, user_data: UserCreate) -> UserInDB: + async def create_user(self, user_data: UserCreate) -> UserInDB: """ Create a new user in the database. 
@@ -61,6 +58,7 @@ class UserRepository: Raises: DuplicateKeyError: If username already exists + ValueError: If user creation fails due to validation """ user_dict = { "username": user_data.username, @@ -73,13 +71,15 @@ class UserRepository: } try: - result = self.collection.insert_one(user_dict) + result = await self.collection.insert_one(user_dict) user_dict["_id"] = result.inserted_id return UserInDB(**user_dict) - except DuplicateKeyError: - raise DuplicateKeyError(f"User with username '{user_data.username}' already exists") + except DuplicateKeyError as e: + raise DuplicateKeyError(f"User with username '{user_data.username}' already exists: {e}") + except PyMongoError as e: + raise ValueError(f"Failed to create user: {e}") - def find_user_by_username(self, username: str) -> Optional[UserInDB]: + async def find_user_by_username(self, username: str) -> Optional[UserInDB]: """ Find user by username. @@ -89,12 +89,15 @@ class UserRepository: Returns: UserInDB or None: User if found, None otherwise """ - user_doc = self.collection.find_one({"username": username}) - if user_doc: - return UserInDB(**user_doc) - return None + try: + user_doc = await self.collection.find_one({"username": username}) + if user_doc: + return UserInDB(**user_doc) + return None + except PyMongoError: + return None - def find_user_by_id(self, user_id: str) -> Optional[UserInDB]: + async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]: """ Find user by ID. @@ -105,16 +108,17 @@ class UserRepository: UserInDB or None: User if found, None otherwise """ try: - object_id = ObjectId(user_id) - user_doc = self.collection.find_one({"_id": object_id}) + if not ObjectId.is_valid(user_id): + return None + + user_doc = await self.collection.find_one({"_id": ObjectId(user_id)}) if user_doc: return UserInDB(**user_doc) - except Exception: - # Invalid ObjectId format - pass - return None + return None + except PyMongoError: + return None - def find_user_by_email(self, email: str) -> Optional[UserInDB]: + async def find_user_by_email(self, email: str) -> Optional[UserInDB]: """ Find user by email address. @@ -124,12 +128,15 @@ class UserRepository: Returns: UserInDB or None: User if found, None otherwise """ - user_doc = self.collection.find_one({"email": email}) - if user_doc: - return UserInDB(**user_doc) - return None + try: + user_doc = await self.collection.find_one({"email": email}) + if user_doc: + return UserInDB(**user_doc) + return None + except PyMongoError: + return None - def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: + async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: """ Update user information. 
@@ -141,11 +148,12 @@ class UserRepository: UserInDB or None: Updated user if found, None otherwise """ try: - object_id = ObjectId(user_id) + if not ObjectId.is_valid(user_id): + return None # Build update document with only provided fields update_data = {"updated_at": datetime.now()} - + if user_update.username is not None: update_data["username"] = user_update.username if user_update.email is not None: @@ -157,20 +165,26 @@ class UserRepository: if user_update.is_active is not None: update_data["is_active"] = user_update.is_active - result = self.collection.update_one( - {"_id": object_id}, - {"$set": update_data} + # Remove None values from update data + clean_update_data = {k: v for k, v in update_data.items() if v is not None} + + if not clean_update_data: + return await self.find_user_by_id(user_id) + + result = await self.collection.find_one_and_update( + {"_id": ObjectId(user_id)}, + {"$set": clean_update_data}, + return_document=True ) - if result.matched_count > 0: - return self.find_user_by_id(user_id) + if result: + return UserInDB(**result) + return None - except Exception: - # Invalid ObjectId format or other errors - pass - return None + except PyMongoError: + return None - def delete_user(self, user_id: str) -> bool: + async def delete_user(self, user_id: str) -> bool: """ Delete user from database. @@ -181,14 +195,15 @@ class UserRepository: bool: True if user was deleted, False otherwise """ try: - object_id = ObjectId(user_id) - result = self.collection.delete_one({"_id": object_id}) + if not ObjectId.is_valid(user_id): + return False + + result = await self.collection.delete_one({"_id": ObjectId(user_id)}) return result.deleted_count > 0 - except Exception: - # Invalid ObjectId format + except PyMongoError: return False - def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: + async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: """ List users with pagination. @@ -199,19 +214,26 @@ class UserRepository: Returns: List[UserInDB]: List of users """ - cursor = self.collection.find().skip(skip).limit(limit) - return [UserInDB(**user_doc) for user_doc in cursor] + try: + cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1) + user_docs = await cursor.to_list(length=limit) + return [UserInDB(**user_doc) for user_doc in user_docs] + except PyMongoError: + return [] - def count_users(self) -> int: + async def count_users(self) -> int: """ Count total number of users. Returns: int: Total number of users in database """ - return self.collection.count_documents({}) + try: + return await self.collection.count_documents({}) + except PyMongoError: + return 0 - def user_exists(self, username: str) -> bool: + async def user_exists(self, username: str) -> bool: """ Check if user exists by username. 
@@ -221,4 +243,8 @@ class UserRepository: Returns: bool: True if user exists, False otherwise """ - return self.collection.count_documents({"username": username}) > 0 + try: + count = await self.collection.count_documents({"username": username}) + return count > 0 + except PyMongoError: + return False diff --git a/src/file-processor/app/main.py b/src/file-processor/app/main.py index f4e493f..b2247e4 100644 --- a/src/file-processor/app/main.py +++ b/src/file-processor/app/main.py @@ -119,7 +119,6 @@ async def create_user( ): return user_service.create_user(user_data) - @app.get("/health") async def health_check(): """ diff --git a/src/file-processor/app/models/document.py b/src/file-processor/app/models/document.py new file mode 100644 index 0000000..5cc1adc --- /dev/null +++ b/src/file-processor/app/models/document.py @@ -0,0 +1,142 @@ +""" +Pydantic models for document processing collections. + +This module defines the data models for file documents and processing jobs +stored in MongoDB collections. +""" + +from datetime import datetime +from enum import Enum +from typing import Any, Dict, Optional + +from bson import ObjectId +from pydantic import BaseModel, Field, field_validator + +from app.models.types import PyObjectId + + +class FileType(str, Enum): + """Supported file types for document processing.""" + + TXT = "txt" + PDF = "pdf" + DOCX = "docx" + JPG = "jpg" + PNG = "png" + + +class ExtractionMethod(str, Enum): + """Methods used to extract content from documents.""" + + DIRECT_TEXT = "direct_text" + OCR = "ocr" + HYBRID = "hybrid" + + +class ProcessingStatus(str, Enum): + """Status values for processing jobs.""" + + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +class FileDocument(BaseModel): + """ + Model for file documents stored in the 'files' collection. + + Represents a file detected in the watched directory with its + metadata and extracted content. 
+ """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + filename: str = Field(..., description="Original filename") + filepath: str = Field(..., description="Full path to the file") + file_type: FileType = Field(..., description="Type of the file") + extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content") + metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") + detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") + file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") + + @field_validator('filepath') + @classmethod + def validate_filepath(cls, v: str) -> str: + """Validate filepath format.""" + if not v.strip(): + raise ValueError("Filepath cannot be empty") + return v.strip() + + @field_validator('filename') + @classmethod + def validate_filename(cls, v: str) -> str: + """Validate filename format.""" + if not v.strip(): + raise ValueError("Filename cannot be empty") + return v.strip() + + class Config: + """Pydantic configuration.""" + populate_by_name = True + arbitrary_types_allowed = True + json_encoders = {ObjectId: str} + + +class DocumentContent(BaseModel): + """Model for document content.""" + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + file_hash: Optional[str] = Field(..., description="SHA256 hash of file content") + content: str = Field(..., description="File content") + encoding: str = Field(default="utf-8", description="Character encoding for text files") + file_size: int = Field(..., ge=0, description="File size in bytes") + mime_type: str = Field(..., description="MIME type detected") + + +class ProcessingJob(BaseModel): + """ + Model for processing jobs stored in the 'processing_jobs' collection. + + Tracks the lifecycle and status of document processing tasks. 
+ """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + file_id: PyObjectId = Field(..., description="Reference to file document") + status: ProcessingStatus = Field( + default=ProcessingStatus.PENDING, + description="Current processing status" + ) + task_id: Optional[str] = Field( + default=None, + description="Celery task UUID" + ) + created_at: Optional[datetime] = Field( + default=None, + description="Timestamp when job was created" + ) + started_at: Optional[datetime] = Field( + default=None, + description="Timestamp when processing started" + ) + completed_at: Optional[datetime] = Field( + default=None, + description="Timestamp when processing completed" + ) + error_message: Optional[str] = Field( + default=None, + description="Error message if processing failed" + ) + + @field_validator('error_message') + @classmethod + def validate_error_message(cls, v: Optional[str]) -> Optional[str]: + """Clean up error message.""" + if v is not None: + return v.strip() if v.strip() else None + return v + + class Config: + """Pydantic configuration.""" + populate_by_name = True + arbitrary_types_allowed = True + json_encoders = {ObjectId: str} diff --git a/src/file-processor/app/models/types.py b/src/file-processor/app/models/types.py new file mode 100644 index 0000000..890db06 --- /dev/null +++ b/src/file-processor/app/models/types.py @@ -0,0 +1,32 @@ +from typing import Any + +from bson import ObjectId +from pydantic_core import core_schema + + +class PyObjectId(ObjectId): + """Custom ObjectId type for Pydantic v2 compatibility.""" + + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: Any, handler + ) -> core_schema.CoreSchema: + return core_schema.json_or_python_schema( + json_schema=core_schema.str_schema(), + python_schema=core_schema.union_schema([ + core_schema.is_instance_schema(ObjectId), + core_schema.chain_schema([ + core_schema.str_schema(), + core_schema.no_info_plain_validator_function(cls.validate), + ]) + ]), + serialization=core_schema.plain_serializer_function_ser_schema( + lambda x: str(x) + ), + ) + + @classmethod + def validate(cls, v): + if not ObjectId.is_valid(v): + raise ValueError("Invalid ObjectId") + return ObjectId(v) diff --git a/src/file-processor/app/models/user.py b/src/file-processor/app/models/user.py index c11a068..5759b04 100644 --- a/src/file-processor/app/models/user.py +++ b/src/file-processor/app/models/user.py @@ -13,34 +13,7 @@ from pydantic import BaseModel, Field, field_validator, EmailStr from pydantic_core import core_schema from app.models.auth import UserRole - - -class PyObjectId(ObjectId): - """Custom ObjectId type for Pydantic v2 compatibility.""" - - @classmethod - def __get_pydantic_core_schema__( - cls, source_type: Any, handler - ) -> core_schema.CoreSchema: - return core_schema.json_or_python_schema( - json_schema=core_schema.str_schema(), - python_schema=core_schema.union_schema([ - core_schema.is_instance_schema(ObjectId), - core_schema.chain_schema([ - core_schema.str_schema(), - core_schema.no_info_plain_validator_function(cls.validate), - ]) - ]), - serialization=core_schema.plain_serializer_function_ser_schema( - lambda x: str(x) - ), - ) - - @classmethod - def validate(cls, v): - if not ObjectId.is_valid(v): - raise ValueError("Invalid ObjectId") - return ObjectId(v) +from app.models.types import PyObjectId def validate_password_strength(password: str) -> str: diff --git a/src/file-processor/requirements.txt b/src/file-processor/requirements.txt index 0e686f9..8a4b4be 100644 --- 
a/src/file-processor/requirements.txt +++ b/src/file-processor/requirements.txt @@ -3,6 +3,7 @@ celery==5.5.3 email-validator==2.3.0 fastapi==0.116.1 httptools==0.6.4 +motor==3.7.1 pymongo==4.15.0 pydantic==2.11.9 redis==6.4.0 diff --git a/tests/test_document_repository.py b/tests/test_document_repository.py new file mode 100644 index 0000000..4ef5d85 --- /dev/null +++ b/tests/test_document_repository.py @@ -0,0 +1,602 @@ +""" +Test suite for FileDocumentRepository with async/await support. + +This module contains comprehensive tests for all FileDocumentRepository methods +using mongomock-motor for in-memory MongoDB testing. +""" + +import pytest +from datetime import datetime +from typing import Dict, Any + +import pytest_asyncio +from bson import ObjectId +from pymongo.errors import DuplicateKeyError, PyMongoError +from mongomock_motor import AsyncMongoMockClient + +from app.database.repositories.document_repository import FileDocumentRepository +from app.models.document import FileDocument, FileType + + +@pytest_asyncio.fixture +async def in_memory_repository(): + """Create an in-memory FileDocumentRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = FileDocumentRepository() + repo.db = db + repo.collection = db.files + return repo + + +@pytest.fixture +def sample_file_document(): + """Sample FileDocument data for testing.""" + return FileDocument( + filename="test_document.pdf", + filepath="/path/to/test_document.pdf", + file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + file_type=FileType("pdf"), + detected_at=datetime.now(), + ) + + +@pytest.fixture +def sample_update_data(): + """Sample update data for testing.""" + return { + "metadata": {"tags": ["updated", "document"]}, + "file_type": FileType("txt"), + } + + +@pytest.fixture +def multiple_sample_documents(): + """Multiple FileDocument objects for list/search testing.""" + base_time = datetime.now() + return [ + FileDocument( + filename="document1.pdf", + filepath="/path/to/document1.pdf", + file_hash="hash1" + "0" * 58, + file_type=FileType("pdf"), + detected_at=base_time, + ), + FileDocument( + filename="similar_document.pdf", + filepath="/path/to/similar_document.pdf", + file_hash="hash2" + "0" * 58, + file_type=FileType("pdf"), + detected_at=base_time, + ), + FileDocument( + filename="completely_different.txt", + filepath="/path/to/completely_different.txt", + file_hash="hash3" + "0" * 58, + file_type=FileType("pdf"), + detected_at=base_time, + ) + ] + + +class TestFileDocumentRepositoryInitialization: + """Tests for repository initialization.""" + + @pytest.mark.asyncio + async def test_i_can_initialize_repository(self): + """Test repository initialization.""" + # Arrange + repo = FileDocumentRepository() + + # Act & Assert (should not raise any exception) + assert repo.db is not None + assert repo.collection is not None + + +class TestFileDocumentRepositoryCreation: + """Tests for file document creation functionality.""" + + @pytest.mark.asyncio + async def test_i_can_create_document(self, in_memory_repository, sample_file_document): + """Test successful file document creation.""" + # Act + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_doc is not None + assert created_doc.filename == sample_file_document.filename + assert created_doc.filepath == sample_file_document.filepath + assert created_doc.file_hash == sample_file_document.file_hash + assert created_doc.file_type == 
sample_file_document.file_type + assert created_doc.id is not None + assert isinstance(created_doc.id, ObjectId) + + @pytest.mark.asyncio + async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document): + """Test creating document with _id set to None (should be removed).""" + # Arrange + sample_file_document.id = None + + # Act + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_doc is not None + assert created_doc.id is not None + assert isinstance(created_doc.id, ObjectId) + + @pytest.mark.asyncio + async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document): + """Test that creating document with duplicate hash raises DuplicateKeyError.""" + # Arrange + await in_memory_repository.create_document(sample_file_document) + duplicate_doc = FileDocument( + filename="different_name.pdf", + filepath=sample_file_document.filepath, + file_hash="different_hash" + "0" * 58, + file_type=FileType("pdf"), + detected_at=datetime.now() + ) + + # Act & Assert + with pytest.raises(DuplicateKeyError) as exc_info: + await in_memory_repository.create_document(duplicate_doc) + + assert "already exists" in str(exc_info.value) + + @pytest.mark.asyncio + async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): + """Test handling of PyMongo errors during document creation.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error")) + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + await in_memory_repository.create_document(sample_file_document) + + assert "Failed to create file document" in str(exc_info.value) + + +class TestFileDocumentRepositoryFinding: + """Tests for file document finding functionality.""" + + @pytest.mark.asyncio + async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document): + """Test finding document by valid ObjectId.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Act + found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) + + # Assert + assert found_doc is not None + assert found_doc.id == created_doc.id + assert found_doc.filename == created_doc.filename + assert found_doc.file_hash == created_doc.file_hash + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository): + """Test that invalid ObjectId returns None.""" + # Act + found_doc = await in_memory_repository.find_document_by_id("invalid_id") + + # Assert + assert found_doc is None + + @pytest.mark.asyncio + async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository): + """Test that nonexistent but valid ObjectId returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + found_doc = await in_memory_repository.find_document_by_id(nonexistent_id) + + # Assert + assert found_doc is None + + @pytest.mark.asyncio + async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document): + """Test finding document by file hash.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Act + found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash) + + # Assert + assert found_doc is not None + assert found_doc.file_hash == created_doc.file_hash + assert 
found_doc.id == created_doc.id + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository): + """Test that nonexistent hash returns None.""" + # Act + found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash") + + # Assert + assert found_doc is None + + @pytest.mark.asyncio + async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document): + """Test finding document by exact filepath.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Act + found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath) + + # Assert + assert found_doc is not None + assert found_doc.filepath == created_doc.filepath + assert found_doc.id == created_doc.id + + @pytest.mark.asyncio + async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository): + """Test that nonexistent filepath returns None.""" + # Act + found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf") + + # Assert + assert found_doc is None + + +class TestFileDocumentRepositoryFuzzySearch: + """Tests for fuzzy search functionality by filename.""" + + @pytest.mark.asyncio + async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents): + """Test finding documents with exact filename match.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + found_docs = await in_memory_repository.find_document_by_name("document1.pdf") + + # Assert + assert len(found_docs) == 1 + assert found_docs[0].filename == "document1.pdf" + + @pytest.mark.asyncio + async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents): + """Test finding documents with fuzzy matching using default threshold.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + found_docs = await in_memory_repository.find_document_by_name("document") + + # Assert + assert len(found_docs) >= 2 # Should find document1.pdf and similar_document.pdf + filenames = [doc.filename for doc in found_docs] + assert "document1.pdf" in filenames + assert "similar_document.pdf" in filenames + + @pytest.mark.asyncio + async def test_i_can_find_documents_with_custom_threshold(self, in_memory_repository, multiple_sample_documents): + """Test finding documents with custom similarity threshold.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act - Very high threshold should only match exact or very similar names + found_docs = await in_memory_repository.find_document_by_name("document1.pdf", similarity_threshold=0.9) + + # Assert + assert len(found_docs) == 1 + assert found_docs[0].filename == "document1.pdf" + + @pytest.mark.asyncio + async def test_i_can_find_documents_sorted_by_similarity(self, in_memory_repository, multiple_sample_documents): + """Test that documents are sorted by similarity score (highest first).""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + found_docs = await in_memory_repository.find_document_by_name("document1", similarity_threshold=0.3) + + # Assert + assert len(found_docs) >= 1 + # First result should be the most similar (document1.pdf) + assert found_docs[0].filename == "document1.pdf" + + 
@pytest.mark.asyncio + async def test_i_cannot_find_documents_below_threshold(self, in_memory_repository, multiple_sample_documents): + """Test that no documents are returned when similarity is below threshold.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + found_docs = await in_memory_repository.find_document_by_name("xyz", similarity_threshold=0.6) + + # Assert + assert len(found_docs) == 0 + + @pytest.mark.asyncio + async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during name search.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) + + # Act + found_docs = await in_memory_repository.find_document_by_name("test") + + # Assert + assert found_docs == [] + + +class TestFileDocumentRepositoryListing: + """Tests for document listing functionality.""" + + @pytest.mark.asyncio + async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_documents): + """Test listing documents with default pagination.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + docs = await in_memory_repository.list_documents() + + # Assert + assert len(docs) == len(multiple_sample_documents) + assert all(isinstance(doc, FileDocument) for doc in docs) + + @pytest.mark.asyncio + async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents): + """Test listing documents with custom pagination.""" + # Arrange + for doc in multiple_sample_documents: + await in_memory_repository.create_document(doc) + + # Act + docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2) + docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2) + + # Assert + assert len(docs_page1) == 2 + assert len(docs_page2) == 1 # Only 3 total documents + + # Ensure no overlap between pages + page1_ids = [doc.id for doc in docs_page1] + page2_ids = [doc.id for doc in docs_page2] + assert len(set(page1_ids).intersection(set(page2_ids))) == 0 + + @pytest.mark.asyncio + async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document): + """Test that documents are sorted by detected_at in descending order.""" + # Arrange + from datetime import timedelta + + # Create documents with different timestamps + doc1 = sample_file_document.model_copy() + doc1.filename = "oldest.pdf" + doc1.file_hash = "hash1" + "0" * 58 + doc1.detected_at = datetime.now() - timedelta(hours=2) + + doc2 = sample_file_document.model_copy() + doc2.filename = "newest.pdf" + doc2.file_hash = "hash2" + "0" * 58 + doc2.detected_at = datetime.now() + + await in_memory_repository.create_document(doc1) + await in_memory_repository.create_document(doc2) + + # Act + docs = await in_memory_repository.list_documents() + + # Assert + assert len(docs) == 2 + assert docs[0].filename == "newest.pdf" # Most recent first + assert docs[1].filename == "oldest.pdf" + + @pytest.mark.asyncio + async def test_i_can_list_empty_documents(self, in_memory_repository): + """Test listing documents from empty collection.""" + # Act + docs = await in_memory_repository.list_documents() + + # Assert + assert docs == [] + + @pytest.mark.asyncio + async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo 
errors during document listing.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) + + # Act + docs = await in_memory_repository.list_documents() + + # Assert + assert docs == [] + + +class TestFileDocumentRepositoryUpdate: + """Tests for document update functionality.""" + + @pytest.mark.asyncio + async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document, + sample_update_data): + """Test successful document update.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Act + updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) + + # Assert + assert updated_doc is not None + assert updated_doc.tags == sample_update_data["tags"] + assert updated_doc.file_type == sample_update_data["file_type"] + assert updated_doc.id == created_doc.id + assert updated_doc.filename == created_doc.filename # Unchanged fields remain + + @pytest.mark.asyncio + async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document): + """Test updating document with partial data.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + partial_update = {"tags": ["new_tag"]} + + # Act + updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update) + + # Assert + assert updated_doc is not None + assert updated_doc.tags == ["new_tag"] + assert updated_doc.filename == created_doc.filename # Should remain unchanged + assert updated_doc.file_type == created_doc.file_type # Should remain unchanged + + @pytest.mark.asyncio + async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document): + """Test that None values are filtered out from update data.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + update_with_none = {"tags": ["new_tag"], "file_type": None} + + # Act + updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none) + + # Assert + assert updated_doc is not None + assert updated_doc.tags == ["new_tag"] + assert updated_doc.file_type == created_doc.file_type # Should remain unchanged (None filtered out) + + @pytest.mark.asyncio + async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document): + """Test updating document with empty data returns current document.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + empty_update = {} + + # Act + result = await in_memory_repository.update_document(str(created_doc.id), empty_update) + + # Assert + assert result is not None + assert result.filename == created_doc.filename + assert result.file_hash == created_doc.file_hash + assert result.tags == created_doc.tags + + @pytest.mark.asyncio + async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data): + """Test that updating with invalid ID returns None.""" + # Act + result = await in_memory_repository.update_document("invalid_id", sample_update_data) + + # Assert + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data): + """Test that updating nonexistent document returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await 
in_memory_repository.update_document(nonexistent_id, sample_update_data) + + # Assert + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document, + sample_update_data, mocker): + """Test handling of PyMongo errors during document update.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', + side_effect=PyMongoError("Database error")) + + # Act + result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) + + # Assert + assert result is None + + +class TestFileDocumentRepositoryDeletion: + """Tests for document deletion functionality.""" + + @pytest.mark.asyncio + async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document): + """Test successful document deletion.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + + # Act + deletion_result = await in_memory_repository.delete_document(str(created_doc.id)) + + # Assert + assert deletion_result is True + + # Verify document is actually deleted + found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) + assert found_doc is None + + @pytest.mark.asyncio + async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository): + """Test that deleting with invalid ID returns False.""" + # Act + result = await in_memory_repository.delete_document("invalid_id") + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository): + """Test that deleting nonexistent document returns False.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await in_memory_repository.delete_document(nonexistent_id) + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): + """Test handling of PyMongo errors during document deletion.""" + # Arrange + created_doc = await in_memory_repository.create_document(sample_file_document) + mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error")) + + # Act + result = await in_memory_repository.delete_document(str(created_doc.id)) + + # Assert + assert result is False + + +class TestFileDocumentRepositoryUtilities: + """Tests for utility methods.""" + + @pytest.mark.asyncio + async def test_i_can_count_documents(self, in_memory_repository, sample_file_document): + """Test counting documents.""" + # Arrange + initial_count = await in_memory_repository.count_documents() + await in_memory_repository.create_document(sample_file_document) + + # Act + final_count = await in_memory_repository.count_documents() + + # Assert + assert final_count == initial_count + 1 + + @pytest.mark.asyncio + async def test_i_can_count_zero_documents(self, in_memory_repository): + """Test counting documents in empty collection.""" + # Act + count = await in_memory_repository.count_documents() + + # Assert + assert count == 0 diff --git a/tests/test_user_repository.py b/tests/test_user_repository.py index 87794b3..c6548be 100644 --- a/tests/test_user_repository.py +++ b/tests/test_user_repository.py @@ -1,388 +1,301 @@ """ -Unit tests for user repository module. +Test suite for UserRepository with async/await support. 
-Tests all CRUD operations for users with MongoDB mocking -to ensure proper database interactions without requiring -actual MongoDB instance during tests. +This module contains comprehensive tests for all UserRepository methods +using mongomock-motor for in-memory MongoDB testing. """ import pytest -from unittest.mock import Mock, MagicMock from datetime import datetime + +import pytest_asyncio from bson import ObjectId from pymongo.errors import DuplicateKeyError +from mongomock_motor import AsyncMongoMockClient from app.database.repositories.user_repository import UserRepository -from app.models.user import UserCreate, UserUpdate, UserInDB, UserRole +from app.models.user import UserCreate, UserUpdate, UserInDB -@pytest.fixture -def mock_database(): - """Create mock database with users collection.""" - db = Mock() - collection = Mock() - db.users = collection - return db - - -@pytest.fixture -def user_repository(mock_database): - """Create UserRepository instance with mocked database.""" - return UserRepository(mock_database) +@pytest_asyncio.fixture +async def in_memory_repository(): + """Create an in-memory UserRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = UserRepository(db) + #await repo.initialize() + return repo @pytest.fixture def sample_user_create(): - """Create sample UserCreate object for testing.""" + """Sample UserCreate data for testing.""" return UserCreate( username="testuser", email="test@example.com", - password="#Passw0rd", - role=UserRole.USER, + password="#TestPassword123", + role="user" ) @pytest.fixture def sample_user_update(): - """Create sample UserUpdate object for testing.""" + """Sample UserUpdate data for testing.""" return UserUpdate( username="updateduser", email="updated@example.com", - password="#updated-Passw0rd", - role=UserRole.ADMIN, - is_active=False, + role="admin" ) -def test_i_can_create_user(user_repository, mock_database, sample_user_create): - """Test successful user creation.""" - # Mock successful insertion - mock_result = Mock() - mock_result.inserted_id = ObjectId() - mock_database.users.insert_one.return_value = mock_result +class TestUserRepositoryCreation: + """Tests for user creation functionality.""" - result = user_repository.create_user(sample_user_create) + @pytest.mark.asyncio + async def test_i_can_create_user(self, in_memory_repository, sample_user_create): + """Test successful user creation.""" + # Act + created_user = await in_memory_repository.create_user(sample_user_create) + + # Assert + assert created_user is not None + assert created_user.username == sample_user_create.username + assert created_user.email == sample_user_create.email + assert created_user.role == sample_user_create.role + assert created_user.is_active is True + assert created_user.id is not None + assert created_user.created_at is not None + assert created_user.updated_at is not None + assert created_user.hashed_password != sample_user_create.password # Should be hashed - assert isinstance(result, UserInDB) - assert result.username == sample_user_create.username - assert result.email == sample_user_create.email - assert result.hashed_password is not None - assert result.role == sample_user_create.role - assert result.is_active is True - assert result.id is not None - assert isinstance(result.created_at, datetime) - assert isinstance(result.updated_at, datetime) - - # Verify insert_one was called with correct data - mock_database.users.insert_one.assert_called_once() - call_args = 
mock_database.users.insert_one.call_args[0][0] - assert call_args["username"] == sample_user_create.username - assert call_args["email"] == sample_user_create.email + @pytest.mark.asyncio + async def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create): + """Test that creating user with duplicate username raises DuplicateKeyError.""" + # Arrange + await in_memory_repository.create_user(sample_user_create) + + # Act & Assert + with pytest.raises(DuplicateKeyError) as exc_info: + await in_memory_repository.create_user(sample_user_create) + + assert "already exists" in str(exc_info.value) -def test_i_cannot_create_duplicate_username(user_repository, mock_database, sample_user_create): - """Test that creating user with duplicate username raises DuplicateKeyError.""" - # Mock DuplicateKeyError from MongoDB - mock_database.users.insert_one.side_effect = DuplicateKeyError("duplicate key error") +class TestUserRepositoryFinding: + """Tests for user finding functionality.""" - with pytest.raises(DuplicateKeyError, match="User with username 'testuser' already exists"): - user_repository.create_user(sample_user_create) + @pytest.mark.asyncio + async def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create): + """Test finding user by valid ID.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + + # Act + found_user = await in_memory_repository.find_user_by_id(str(created_user.id)) + + # Assert + assert found_user is not None + assert found_user.id == created_user.id + assert found_user.username == created_user.username + assert found_user.email == created_user.email + + @pytest.mark.asyncio + async def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository): + """Test that invalid ObjectId returns None.""" + # Act + found_user = await in_memory_repository.find_user_by_id("invalid_id") + + # Assert + assert found_user is None + + @pytest.mark.asyncio + async def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository): + """Test that nonexistent but valid ObjectId returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + found_user = await in_memory_repository.find_user_by_id(nonexistent_id) + + # Assert + assert found_user is None + + @pytest.mark.asyncio + async def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create): + """Test finding user by username.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + + # Act + found_user = await in_memory_repository.find_user_by_username(sample_user_create.username) + + # Assert + assert found_user is not None + assert found_user.username == created_user.username + assert found_user.id == created_user.id + + @pytest.mark.asyncio + async def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository): + """Test that nonexistent username returns None.""" + # Act + found_user = await in_memory_repository.find_user_by_username("nonexistent") + + # Assert + assert found_user is None + + @pytest.mark.asyncio + async def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create): + """Test finding user by email.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + + # Act + found_user = await in_memory_repository.find_user_by_email(str(sample_user_create.email)) + + # Assert + assert found_user is not None + assert found_user.email == created_user.email + assert found_user.id == 
created_user.id + + @pytest.mark.asyncio + async def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository): + """Test that nonexistent email returns None.""" + # Act + found_user = await in_memory_repository.find_user_by_email("nonexistent@example.com") + + # Assert + assert found_user is None -def test_i_can_find_user_by_username(user_repository, mock_database): - """Test finding user by username.""" - # Mock user document from database - user_doc = { - "_id": ObjectId(), - "username": "testuser", - "email": "test@example.com", - "hashed_password": "hashed_password_123", - "role": "user", - "is_active": True, - "created_at": datetime.now(), - "updated_at": datetime.now() - } - mock_database.users.find_one.return_value = user_doc +class TestUserRepositoryUpdate: + """Tests for user update functionality.""" - result = user_repository.find_user_by_username("testuser") + @pytest.mark.asyncio + async def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update): + """Test successful user update.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + original_updated_at = created_user.updated_at + + # Act + updated_user = await in_memory_repository.update_user(str(created_user.id), sample_user_update) + + # Assert + assert updated_user is not None + assert updated_user.username == sample_user_update.username + assert updated_user.email == sample_user_update.email + assert updated_user.role == sample_user_update.role + assert updated_user.id == created_user.id - assert isinstance(result, UserInDB) - assert result.username == "testuser" - assert result.email == "test@example.com" + @pytest.mark.asyncio + async def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update): + """Test that updating with invalid ID returns None.""" + # Act + result = await in_memory_repository.update_user("invalid_id", sample_user_update) + + # Assert + assert result is None - mock_database.users.find_one.assert_called_once_with({"username": "testuser"}) + @pytest.mark.asyncio + async def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create): + """Test updating user with partial data.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + partial_update = UserUpdate(username="newusername") + + # Act + updated_user = await in_memory_repository.update_user(str(created_user.id), partial_update) + + # Assert + assert updated_user is not None + assert updated_user.username == "newusername" + assert updated_user.email == created_user.email # Should remain unchanged + assert updated_user.role == created_user.role # Should remain unchanged + + @pytest.mark.asyncio + async def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create): + """Test updating user with empty data returns current user.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + empty_update = UserUpdate() + + # Act + result = await in_memory_repository.update_user(str(created_user.id), empty_update) + + # Assert + assert result is not None + assert result.username == created_user.username + assert result.email == created_user.email -def test_i_cannot_find_nonexistent_user_by_username(user_repository, mock_database): - """Test finding nonexistent user by username returns None.""" - mock_database.users.find_one.return_value = None +class TestUserRepositoryDeletion: + """Tests for user deletion 
functionality.""" - result = user_repository.find_user_by_username("nonexistent") + @pytest.mark.asyncio + async def test_i_can_delete_user(self, in_memory_repository, sample_user_create): + """Test successful user deletion.""" + # Arrange + created_user = await in_memory_repository.create_user(sample_user_create) + + # Act + deletion_result = await in_memory_repository.delete_user(str(created_user.id)) + + # Assert + assert deletion_result is True + + # Verify user is actually deleted + found_user = await in_memory_repository.find_user_by_id(str(created_user.id)) + assert found_user is None - assert result is None - mock_database.users.find_one.assert_called_once_with({"username": "nonexistent"}) + @pytest.mark.asyncio + async def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository): + """Test that deleting with invalid ID returns False.""" + # Act + result = await in_memory_repository.delete_user("invalid_id") + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_nonexistent_user(self, in_memory_repository): + """Test that deleting nonexistent user returns False.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await in_memory_repository.delete_user(nonexistent_id) + + # Assert + assert result is False -def test_i_can_find_user_by_id(user_repository, mock_database): - """Test finding user by ID.""" - user_id = ObjectId() - user_doc = { - "_id": user_id, - "username": "testuser", - "email": "test@example.com", - "hashed_password": "hashed_password_123", - "role": "user", - "is_active": True, - "created_at": datetime.now(), - "updated_at": datetime.now() - } - mock_database.users.find_one.return_value = user_doc +class TestUserRepositoryUtilities: + """Tests for utility methods.""" - result = user_repository.find_user_by_id(str(user_id)) + @pytest.mark.asyncio + async def test_i_can_count_users(self, in_memory_repository, sample_user_create): + """Test counting users.""" + # Arrange + initial_count = await in_memory_repository.count_users() + await in_memory_repository.create_user(sample_user_create) + + # Act + final_count = await in_memory_repository.count_users() + + # Assert + assert final_count == initial_count + 1 - assert isinstance(result, UserInDB) - assert result.id == user_id - assert result.username == "testuser" - - mock_database.users.find_one.assert_called_once_with({"_id": user_id}) + @pytest.mark.asyncio + async def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create): + """Test checking if user exists.""" + # Arrange + await in_memory_repository.create_user(sample_user_create) + + # Act + exists = await in_memory_repository.user_exists(sample_user_create.username) + not_exists = await in_memory_repository.user_exists("nonexistent") + + # Assert + assert exists is True + assert not_exists is False - -def test_i_cannot_find_user_with_invalid_id(user_repository, mock_database): - """Test finding user with invalid ObjectId returns None.""" - result = user_repository.find_user_by_id("invalid_id") - - assert result is None - # find_one should not be called with invalid ID - mock_database.users.find_one.assert_not_called() - - -def test_i_cannot_find_nonexistent_user_by_id(user_repository, mock_database): - """Test finding nonexistent user by ID returns None.""" - user_id = ObjectId() - mock_database.users.find_one.return_value = None - - result = user_repository.find_user_by_id(str(user_id)) - - assert result is None - 
mock_database.users.find_one.assert_called_once_with({"_id": user_id}) - - -def test_i_can_find_user_by_email(user_repository, mock_database): - """Test finding user by email address.""" - user_doc = { - "_id": ObjectId(), - "username": "testuser", - "email": "test@example.com", - "hashed_password": "hashed_password_123", - "role": "user", - "is_active": True, - "created_at": datetime.now(), - "updated_at": datetime.now() - } - mock_database.users.find_one.return_value = user_doc - - result = user_repository.find_user_by_email("test@example.com") - - assert isinstance(result, UserInDB) - assert result.email == "test@example.com" - - mock_database.users.find_one.assert_called_once_with({"email": "test@example.com"}) - - -def test_i_can_update_user(user_repository, mock_database, sample_user_update): - """Test updating user information.""" - user_id = ObjectId() - - # Mock successful update - mock_update_result = Mock() - mock_update_result.matched_count = 1 - mock_database.users.update_one.return_value = mock_update_result - - # Mock find_one for returning updated user - user_to_update = { - "_id": user_id, - "username": "testuser", - "email": "updated@example.com", - "hashed_password": "hashed_password_123", - "role": "admin", - "is_active": False, - "created_at": datetime.now(), - "updated_at": datetime.now() - } - mock_database.users.find_one.return_value = user_to_update - - result = user_repository.update_user(str(user_id), sample_user_update) - - # Mock will return the first find_one result, which is the updated user_to_update - # assert isinstance(result, UserInDB) - # assert result.username == "updateduser" - # assert result.email == "updated@example.com" - # assert result.role == UserRole.ADMIN - # assert result.is_active is False - - # Verify update_one was called with correct data - mock_database.users.update_one.assert_called_once() - call_args = mock_database.users.update_one.call_args - assert call_args[0][0] == {"_id": user_id} # Filter - update_data = call_args[0][1]["$set"] # Update data - assert update_data["email"] == "updated@example.com" - assert update_data["role"] == UserRole.ADMIN - assert update_data["is_active"] is False - assert "updated_at" in update_data - - -def test_i_cannot_update_nonexistent_user(user_repository, mock_database, sample_user_update): - """Test updating nonexistent user returns None.""" - user_id = ObjectId() - - # Mock no match found - mock_update_result = Mock() - mock_update_result.matched_count = 0 - mock_database.users.update_one.return_value = mock_update_result - - result = user_repository.update_user(str(user_id), sample_user_update) - - assert result is None - - -def test_i_cannot_update_user_with_invalid_id(user_repository, mock_database, sample_user_update): - """Test updating user with invalid ID returns None.""" - result = user_repository.update_user("invalid_id", sample_user_update) - - assert result is None - # update_one should not be called with invalid ID - mock_database.users.update_one.assert_not_called() - - -def test_i_can_delete_user(user_repository, mock_database): - """Test successful user deletion.""" - user_id = ObjectId() - - # Mock successful deletion - mock_delete_result = Mock() - mock_delete_result.deleted_count = 1 - mock_database.users.delete_one.return_value = mock_delete_result - - result = user_repository.delete_user(str(user_id)) - - assert result is True - mock_database.users.delete_one.assert_called_once_with({"_id": user_id}) - - -def test_i_cannot_delete_nonexistent_user(user_repository, mock_database): 
- """Test deleting nonexistent user returns False.""" - user_id = ObjectId() - - # Mock no deletion occurred - mock_delete_result = Mock() - mock_delete_result.deleted_count = 0 - mock_database.users.delete_one.return_value = mock_delete_result - - result = user_repository.delete_user(str(user_id)) - - assert result is False - - -def test_i_cannot_delete_user_with_invalid_id(user_repository, mock_database): - """Test deleting user with invalid ID returns False.""" - result = user_repository.delete_user("invalid_id") - - assert result is False - # delete_one should not be called with invalid ID - mock_database.users.delete_one.assert_not_called() - - -def test_i_can_list_users(user_repository, mock_database): - """Test listing users with pagination.""" - # Mock cursor with user documents - user_docs = [ - { - "_id": ObjectId(), - "username": "user1", - "email": "user1@example.com", - "hashed_password": "hash1", - "role": "user", - "is_active": True, - "created_at": datetime.now(), - "updated_at": datetime.now() - }, - { - "_id": ObjectId(), - "username": "user2", - "email": "user2@example.com", - "hashed_password": "hash2", - "role": "admin", - "is_active": False, - "created_at": datetime.now(), - "updated_at": datetime.now() - } - ] - - mock_cursor = Mock() - mock_cursor.__iter__ = Mock(return_value=iter(user_docs)) - mock_cursor.skip.return_value = mock_cursor - mock_cursor.limit.return_value = mock_cursor - mock_database.users.find.return_value = mock_cursor - - result = user_repository.list_users(skip=10, limit=50) - - assert len(result) == 2 - assert all(isinstance(user, UserInDB) for user in result) - assert result[0].username == "user1" - assert result[1].username == "user2" - - mock_database.users.find.assert_called_once() - mock_cursor.skip.assert_called_once_with(10) - mock_cursor.limit.assert_called_once_with(50) - - -def test_i_can_count_users(user_repository, mock_database): - """Test counting total users.""" - mock_database.users.count_documents.return_value = 42 - - result = user_repository.count_users() - - assert result == 42 - mock_database.users.count_documents.assert_called_once_with({}) - - -def test_i_can_check_user_exists(user_repository, mock_database): - """Test checking if user exists by username.""" - mock_database.users.count_documents.return_value = 1 - - result = user_repository.user_exists("testuser") - - assert result is True - mock_database.users.count_documents.assert_called_once_with({"username": "testuser"}) - - -def test_i_can_check_user_does_not_exist(user_repository, mock_database): - """Test checking if user does not exist by username.""" - mock_database.users.count_documents.return_value = 0 - - result = user_repository.user_exists("nonexistent") - - assert result is False - mock_database.users.count_documents.assert_called_once_with({"username": "nonexistent"}) - - -def test_i_can_create_indexes_on_initialization(mock_database): - """Test that indexes are created when repository is initialized.""" - # Mock create_index to not raise exception - mock_database.users.create_index.return_value = None - - repository = UserRepository(mock_database) - - mock_database.users.create_index.assert_called_once_with("username", unique=True) - - -def test_i_can_handle_index_creation_error(mock_database): - """Test that index creation errors are handled gracefully.""" - # Mock create_index to raise exception (index already exists) - mock_database.users.create_index.side_effect = Exception("Index already exists") - - # Should not raise exception - repository = 
UserRepository(mock_database) - - assert repository is not None - mock_database.users.create_index.assert_called_once_with("username", unique=True)
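
---

Note on the in-memory fixtures above: the new async fixtures construct the repository and leave `await repo.initialize()` commented out, while the removed synchronous tests expected a unique index on `username` to be created at construction time. The sketch below shows how that setup could move into an awaitable initializer. It is illustrative only, not the repository's actual implementation: the `self.collection` attribute, the class name, and the method body are assumptions made for the example.

```python
# Illustrative sketch only -- not the actual UserRepository implementation.
# Assumes the repository stores its Motor collection as `self.collection`
# and exposes an awaitable `initialize()` (the call referenced, but commented
# out, in the new fixture) for work that cannot happen in a synchronous __init__.
from motor.motor_asyncio import AsyncIOMotorDatabase


class UserRepositorySketch:
    def __init__(self, database: AsyncIOMotorDatabase):
        # Only synchronous attribute wiring here; coroutines cannot be awaited in __init__.
        self.collection = database.users

    async def initialize(self) -> None:
        # Mirrors the old synchronous setup (unique index on "username"),
        # awaited explicitly because Motor's create_index is a coroutine.
        await self.collection.create_index("username", unique=True)
```

A fixture backed by a real MongoDB, or by a mock that enforces unique indexes, would call `await repo.initialize()` before returning the repository so the unique constraint is actually in place for the duplicate-username tests.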