diff --git a/.gitignore b/.gitignore index 70054a0..6896de6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +volumes + # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] diff --git a/Readme.md b/Readme.md index fc03512..2625f99 100644 --- a/Readme.md +++ b/Readme.md @@ -13,7 +13,7 @@ architecture with Redis for task queuing and MongoDB for data persistence. - **Backend API**: FastAPI (Python 3.12) - **Task Processing**: Celery with Redis broker - **Document Processing**: EasyOCR, PyMuPDF, python-docx, pdfplumber -- **Database**: MongoDB +- **Database**: MongoDB (pymongo) - **Frontend**: React - **Containerization**: Docker & Docker Compose - **File Monitoring**: Python watchdog library @@ -95,25 +95,32 @@ MyDocManager/ │ │ ├── requirements.txt │ │ ├── app/ │ │ │ ├── main.py -│ │ │ ├── file_watcher.py -│ │ │ ├── celery_app.py +│ │ │ ├── file_watcher.py # FileWatcher class with observer thread +│ │ │ ├── celery_app.py # Celery Configuration │ │ │ ├── config/ │ │ │ │ ├── __init__.py │ │ │ │ └── settings.py # JWT, MongoDB config │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── user.py # User Pydantic models -│ │ │ │ └── auth.py # Auth Pydantic models +│ │ │ │ ├── auth.py # Auth Pydantic models +│ │ │ │ ├── document.py # Document Pydantic models +│ │ │ │ ├── job.py # Job Processing Pydantic models +│ │ │ │ └── types.py # PyObjectId and other useful types │ │ │ ├── database/ │ │ │ │ ├── __init__.py -│ │ │ │ ├── connection.py # MongoDB connection +│ │ │ │ ├── connection.py # MongoDB connection (pymongo) │ │ │ │ └── repositories/ │ │ │ │ ├── __init__.py -│ │ │ │ └── user_repository.py # User CRUD operations +│ │ │ │ ├── user_repository.py # User CRUD operations (synchronous) +│ │ │ │ ├── document_repository.py # Document CRUD operations (synchronous) +│ │ │ │ └── job_repository.py # Job CRUD operations (synchronous) │ │ │ ├── services/ │ │ │ │ ├── __init__.py -│ │ │ │ ├── auth_service.py # JWT & password logic -│ │ │ │ ├── user_service.py # User business logic +│ │ │ │ ├── auth_service.py # JWT & password logic (synchronous) +│ │ │ │ ├── user_service.py # User business logic (synchronous) +│ │ │ │ ├── document_service.py # Document business logic (synchronous) +│ │ │ │ ├── job_service.py # Job processing logic (synchronous) │ │ │ │ └── init_service.py # Admin creation at startup │ │ │ ├── api/ │ │ │ │ ├── __init__.py @@ -125,7 +132,7 @@ MyDocManager/ │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── security.py # Password utilities -│ │ │ └── exceptions.py # Custom exceptions +│ │ │ └── document_matching.py # Fuzzy matching Algorithms │ ├── worker/ │ │ ├── Dockerfile │ │ ├── requirements.txt @@ -133,7 +140,13 @@ MyDocManager/ │ └── frontend/ │ ├── Dockerfile │ ├── package.json +│ ├── index.html │ └── src/ +│ ├── assets/ +│ ├── App.css +│ ├── App.jsx +│ ├── main.css +│ └── main.jsx ├── tests/ │ ├── file-processor/ │ │ ├── test_auth/ @@ -224,78 +237,76 @@ On first startup, the application automatically creates a default admin user: #### Files Collection -Stores file metadata and extracted content: +Stores file metadata and extracted content using Pydantic models: -```json -{ - "_id": "ObjectId", - "filename": "document.pdf", - "filepath": "/watched_files/document.pdf", - "file_type": "pdf", - "extraction_method": "direct_text", // direct_text, ocr, hybrid - "metadata": { - "page_count": 15, // for PDFs - "word_count": 250, // for text files - "image_dimensions": { // for images - "width": 1920, - "height": 1080 - } - }, - "detected_at": "2024-01-15T10:29:00Z", - 
"file_hash": "sha256_hash_value" -} -``` -#### Document Contents Collection +```python +class FileDocument(BaseModel): + """ + Model for file documents stored in the 'files' collection. -Stores actual file content and technical metadata: -```json -{ - "_id": "ObjectId", - "file_hash": "sha256_hash_value", - "content": "extracted text content...", - "encoding": "utf-8", - "file_size": 2048576, - "mime_type": "application/pdf" -} + Represents a file detected in the watched directory with its + metadata and extracted content. + """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + filename: str = Field(..., description="Original filename") + filepath: str = Field(..., description="Full path to the file") + file_type: FileType = Field(..., description="Type of the file") + extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content") + metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") + detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") + file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") + encoding: str = Field(default="utf-8", description="Character encoding for text files") + file_size: int = Field(..., ge=0, description="File size in bytes") + mime_type: str = Field(..., description="MIME type detected") + + @field_validator('filepath') + @classmethod + def validate_filepath(cls, v: str) -> str: + """Validate filepath format.""" + if not v.strip(): + raise ValueError("Filepath cannot be empty") + return v.strip() + + @field_validator('filename') + @classmethod + def validate_filename(cls, v: str) -> str: + """Validate filename format.""" + if not v.strip(): + raise ValueError("Filename cannot be empty") + return v.strip() ``` #### Processing Jobs Collection Tracks processing status and lifecycle: -```json -{ - "_id": "ObjectId", - "file_id": "reference_to_files_collection", - "status": "completed", - // pending, processing, completed, failed - "task_id": "celery_task_uuid", - "created_at": "2024-01-15T10:29:00Z", - "started_at": "2024-01-15T10:29:30Z", - "completed_at": "2024-01-15T10:30:00Z", - "error_message": null -} +```python +class ProcessingJob(BaseModel): + """ + Model for processing jobs stored in the 'processing_jobs' collection. + + Tracks the lifecycle and status of document processing tasks. 
+ """ + + id: Optional[PyObjectId] = Field(default=None, alias="_id") + file_id: PyObjectId = Field(..., description="Reference to file document") + status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status") + task_id: Optional[str] = Field(default=None, description="Celery task UUID") + created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created") + started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started") + completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed") + error_message: Optional[str] = Field(default=None, description="Error message if processing failed") + + @field_validator('error_message') + @classmethod + def validate_error_message(cls, v: Optional[str]) -> Optional[str]: + """Clean up error message.""" + if v is not None: + return v.strip() if v.strip() else None + return v ``` -### Data Storage Strategy - -- **Choice**: Three separate collections for files, content, and processing status -- **Rationale**: Normalization prevents content duplication when multiple files have identical content -- **Benefits**: - - Content deduplication via SHA256 hash - - Better query performance for metadata vs content searches - - Clear separation of concerns between file metadata, content, and processing lifecycle - - Multiple files can reference the same content (e.g., identical copies in different locations) - -### Content Storage Location - -- **Choice**: Store extracted content in separate `document_contents` collection -- **Rationale**: Content normalization and deduplication -- **Benefits**: - - Single content storage per unique file hash - - Multiple file entries can reference same content - - Efficient storage for duplicate files - ### Supported File Types (Initial Implementation) - **Text Files** (`.txt`): Direct content reading @@ -306,7 +317,7 @@ Tracks processing status and lifecycle: #### Watchdog Implementation -- **Choice**: Dedicated observer thread (Option A) +- **Choice**: Dedicated observer thread - **Rationale**: Standard approach, clean separation of concerns - **Implementation**: Watchdog observer runs in separate thread from FastAPI @@ -327,17 +338,94 @@ Tracks processing status and lifecycle: #### Content Storage Location -- **Choice**: Store extracted content in `files` collection -- **Rationale**: Content is intrinsic property of the file -- **Benefits**: Single query to get file + content, simpler data model +- **Choice**: Store files in the file system, using the SHA256 hash as filename +- **Rationale**: MongoDB is not meant for large files, better performance. Files remain in the file system for easy + access. -### Implementation Order +#### Repository and Services Implementation + +- **Choice**: Synchronous implementation using pymongo +- **Rationale**: Full compatibility with Celery workers and simplified workflow +- **Implementation**: All repositories and services operate synchronously for seamless integration + +### Implementation Status 1. ✅ Pydantic models for MongoDB collections -2. ✅ Repository layer for data access (files + processing_jobs) -3. ✅ Celery tasks for document processing -4. ✅ Watchdog file monitoring implementation -5. ✅ FastAPI integration and startup coordination +2. ✅ Repository layer for data access (files + processing_jobs + users + documents) - synchronous +3. ✅ Service layer for business logic (auth, user, document, job) - synchronous +4. 
✅ Celery tasks for document processing
+5. ✅ Watchdog file monitoring implementation
+6. ✅ FastAPI integration and startup coordination
+
+## Job Management Layer
+
+### Repository Pattern Implementation
+
+The job management system follows the repository pattern for a clean separation between data access and business logic.
+
+#### JobRepository
+
+Handles direct MongoDB operations for processing jobs using synchronous pymongo:
+
+**CRUD Operations:**
+- `create_job()` - Create new processing job with automatic `created_at` timestamp
+- `find_job_by_id()` - Retrieve job by ObjectId
+- `update_job_status()` - Update job status with automatic timestamp management
+- `delete_job()` - Remove job from database
+- `find_jobs_by_document_id()` - Get all jobs for a specific document
+- `get_jobs_by_status()` - Get jobs filtered by processing status
+
+**Automatic Timestamp Management:**
+- `created_at`: Set automatically during job creation
+- `started_at`: Set automatically when status changes to PROCESSING
+- `completed_at`: Set automatically when status changes to COMPLETED or FAILED
+
+#### JobService
+
+Provides a synchronous business logic layer with strict status transition validation:
+
+**Status Transition Methods:**
+- `mark_job_as_started()` - PENDING → PROCESSING
+- `mark_job_as_completed()` - PROCESSING → COMPLETED
+- `mark_job_as_failed()` - PROCESSING → FAILED
+
+**Validation Rules:**
+- Strict status transitions (invalid transitions raise exceptions)
+- Job existence verification before any operation
+- Automatic timestamp management through repository layer
+
+#### Custom Exceptions
+
+- **InvalidStatusTransitionError**: Raised for invalid status transitions
+- **JobRepositoryError**: Raised for MongoDB operation failures
+
+#### Valid Status Transitions
+
+```
+PENDING → PROCESSING (via mark_job_as_started)
+PROCESSING → COMPLETED (via mark_job_as_completed)
+PROCESSING → FAILED (via mark_job_as_failed)
+```
+
+All other transitions are forbidden and raise `InvalidStatusTransitionError`.
+
+### File Structure
+
+```
+src/file-processor/app/
+├── database/repositories/
+│   ├── job_repository.py          # JobRepository class (synchronous)
+│   ├── user_repository.py         # UserRepository class (synchronous)
+│   └── document_repository.py     # FileDocumentRepository class (synchronous)
+├── services/
+│   ├── job_service.py             # JobService class (synchronous)
+│   ├── auth_service.py            # AuthService class (synchronous)
+│   ├── user_service.py            # UserService class (synchronous)
+│   └── document_service.py        # DocumentService class (synchronous)
+└── exceptions/
+    └── job_exceptions.py          # Custom exceptions
+```

### Processing Pipeline Features

@@ -346,87 +434,7 @@ Tracks processing status and lifecycle:
- **Duplicate Detection**: SHA256 hash prevents reprocessing identical files
- **Error Handling**: Graceful failure handling with status updates
- **Status Tracking**: Real-time processing status via `processing_jobs` collection
- **Extensible Metadata**: Flexible metadata storage per file type
- **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches
-
-## Document Service Architecture
-
-### Service Overview
-
-The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories.
- -### Service Design - -- **Architecture Pattern**: Service orchestration with separate repositories -- **Transaction Support**: MongoDB ACID transactions for data consistency -- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash -- **Error Handling**: MongoDB standard exceptions with transaction rollback - -### Document Service (`document_service.py`) - -Orchestrates operations between file and content repositories while maintaining data consistency. - -#### Core Functionality - -##### `create_document(file_path: str, file_bytes: bytes, encoding: str)` - -Creates a new document with automatic attribute calculation and content deduplication. - -**Automatic Calculations:** -- `file_hash`: SHA256 hash of file bytes -- `file_type`: Detection based on file extension -- `mime_type`: Detection via `python-magic` library -- `file_size`: Length of provided bytes -- `detected_at`: Current timestamp -- `metadata`: Empty dictionary (reserved for future extension) - -**Deduplication Logic:** -1. Calculate SHA256 hash of file content -2. Check if `DocumentContent` with this hash already exists -3. If EXISTS: Create only `FileDocument` referencing existing content -4. If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction - -**Transaction Flow:** -``` -BEGIN TRANSACTION - IF content_exists(file_hash): - CREATE FileDocument with content reference - ELSE: - CREATE DocumentContent - CREATE FileDocument with content reference -COMMIT TRANSACTION -``` - -#### Available Methods - -- `create_document(file_path, file_bytes, encoding)`: Create with deduplication -- `get_document_by_id(document_id)`: Retrieve by document ID -- `get_document_by_hash(file_hash)`: Retrieve by file hash -- `get_document_by_filepath(filepath)`: Retrieve by file path -- `list_documents(skip, limit)`: Paginated document listing -- `count_documents()`: Total document count -- `update_document(document_id, update_data)`: Update document metadata -- `delete_document(document_id)`: Remove document and orphaned content - -### Repository Dependencies - -The document service coordinates two existing repositories: - -#### File Repository (`file_repository.py`) -- `create_document()`, `find_document_by_id()`, `find_document_by_hash()` -- `find_document_by_filepath()`, `find_document_by_name()` -- `list_documents()`, `count_documents()` -- `update_document()`, `delete_document()` - -#### Document Content Repository (`document_content_repository.py`) -- `create_document_content()`, `find_document_content_by_id()` -- `find_document_content_by_file_hash()`, `content_exists()` -- `update_document_content()`, `delete_document_content()` -- `list_document_contents()`, `count_document_contents()` - -### Dependencies - -- `python-magic`: MIME type detection -- `hashlib`: SHA256 hashing (standard library) -- `pymongo`: MongoDB transactions support +- **Synchronous Operations**: All database operations use pymongo for Celery compatibility ## Key Implementation Notes @@ -449,6 +457,7 @@ The document service coordinates two existing repositories: - **Package Manager**: pip (standard) - **External Dependencies**: Listed in each service's requirements.txt - **Standard Library First**: Prefer standard library when possible +- **Database Driver**: pymongo for synchronous MongoDB operations ### Testing Strategy @@ -473,6 +482,7 @@ The document service coordinates two existing repositories: 12. **Content in Files Collection**: Extracted content stored with file metadata 13. 
**Direct Task Dispatch**: File watcher directly creates Celery tasks 14. **SHA256 Duplicate Detection**: Prevents reprocessing identical files +15. **Synchronous Implementation**: All repositories and services use pymongo for Celery compatibility ### Development Process Requirements @@ -483,21 +493,15 @@ The document service coordinates two existing repositories: ### Next Implementation Steps -1. ✅ Create docker-compose.yml with all services => Done -2. ✅ Define user management and authentication architecture => Done -3. ✅ Implement user models and authentication services => - 1. models/user.py => Done - 2. models/auth.py => Done - 3. database/repositories/user_repository.py => Done -4. ✅ Add automatic admin user creation if it does not exists => Done -5. **IN PROGRESS**: Implement file processing pipeline => - 1. Create Pydantic models for files and processing_jobs collections - 2. Implement repository layer for file and processing job data access - 3. Create Celery tasks for document processing (.txt, .pdf, .docx) - 4. Implement Watchdog file monitoring with dedicated observer - 5. Integrate file watcher with FastAPI startup -6. Create protected API routes for user management -7. Build React monitoring interface with authentication +1. **TODO**: Complete file processing pipeline => + 1. ✅ Create Pydantic models for files and processing_jobs collections + 2. ✅ Implement repository layer for file and processing job data access (synchronous) + 3. ✅ Implement service layer for business logic (synchronous) + 4. ✅ Create Celery tasks for document processing (.txt, .pdf, .docx) + 5. ✅ Implement Watchdog file monitoring with dedicated observer + 6. ✅ Integrate file watcher with FastAPI startup +2. Create protected API routes for user management +3. Build React monitoring interface with authentication ## Annexes @@ -586,4 +590,4 @@ docker-compose up --scale worker=3 - **file-processor**: Hot-reload enabled via `--reload` flag - Code changes in `src/file-processor/app/` automatically restart FastAPI - **worker**: No hot-reload (manual restart required for stability) - - Code changes in `src/worker/tasks/` require: `docker-compose restart worker` + - Code changes in `src/worker/tasks/` require: `docker-compose restart worker` \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 57b85e6..109df33 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,7 +19,7 @@ services: MONGO_INITDB_ROOT_PASSWORD: password123 MONGO_INITDB_DATABASE: mydocmanager volumes: - - mongodb-data:/data/db + - ./volumes/db:/data/db networks: - mydocmanager-network @@ -34,10 +34,12 @@ services: environment: - REDIS_URL=redis://redis:6379/0 - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin - - PYTHONPATH=/app + - PYTHONPATH=/app:/tasks # Added /tasks to Python path volumes: - ./src/file-processor:/app + - ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks - ./volumes/watched_files:/watched_files + - ./volumes/objects:/objects depends_on: - redis - mongodb @@ -56,14 +58,29 @@ services: - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin - PYTHONPATH=/app volumes: - - ./src/worker/tasks:/app + - ./src/worker:/app + - ./src/file-processor/app:/app/app # <- Added: shared access file-processor app - ./volumes/watched_files:/watched_files depends_on: - redis - mongodb networks: - mydocmanager-network - command: celery -A main worker --loglevel=info + command: celery -A tasks.main worker --loglevel=info 
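+    # NOTE: worker and file-processor cross-mount each other's code (worker gets
+    # /app/app from file-processor, file-processor gets /app/tasks from worker),
+    # so both containers can import the shared repositories, services and tasks.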
+ + # Frontend - React application with Vite + frontend: + build: + context: ./src/frontend + dockerfile: Dockerfile + container_name: mydocmanager-frontend + ports: + - "5173:5173" + volumes: + - ./src/frontend:/app + - /app/node_modules # Anonymous volume to prevent node_modules override + networks: + - mydocmanager-network volumes: mongodb-data: diff --git a/requirements.txt b/requirements.txt index 7df4d86..9b882a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,30 @@ amqp==5.3.1 annotated-types==0.7.0 anyio==4.10.0 +asgiref==3.9.1 bcrypt==4.3.0 billiard==4.2.1 celery==5.5.3 +certifi==2025.8.3 +cffi==2.0.0 click==8.2.1 click-didyoumean==0.3.1 click-plugins==1.1.1.2 click-repl==0.3.0 +cryptography==46.0.1 dnspython==2.8.0 +ecdsa==0.19.1 email-validator==2.3.0 fastapi==0.116.1 h11==0.16.0 +hiredis==3.2.1 +httpcore==1.0.9 httptools==0.6.4 +httpx==0.28.1 idna==3.10 +importlib_metadata==8.7.0 iniconfig==2.1.0 +izulu==0.50.0 kombu==5.5.4 mongomock==4.3.0 mongomock-motor==0.0.36 @@ -23,9 +33,13 @@ packaging==25.0 pipdeptree==2.28.0 pluggy==1.6.0 prompt_toolkit==3.0.52 +pyasn1==0.6.1 +pycparser==2.23 +pycron==3.2.0 pydantic==2.11.9 pydantic_core==2.33.2 Pygments==2.19.2 +PyJWT==2.10.1 pymongo==4.15.1 pytest==8.4.2 pytest-asyncio==1.2.0 @@ -35,6 +49,8 @@ python-dotenv==1.1.1 python-magic==0.4.27 pytz==2025.2 PyYAML==6.0.2 +redis==6.4.0 +rsa==4.9.1 sentinels==1.1.1 six==1.17.0 sniffio==1.3.1 @@ -45,6 +61,8 @@ tzdata==2025.2 uvicorn==0.35.0 uvloop==0.21.0 vine==5.1.0 +watchdog==6.0.0 watchfiles==1.1.0 wcwidth==0.2.13 websockets==15.0.1 +zipp==3.23.0 diff --git a/src/file-processor/Dockerfile b/src/file-processor/Dockerfile index 62477fd..434b3cf 100644 --- a/src/file-processor/Dockerfile +++ b/src/file-processor/Dockerfile @@ -3,6 +3,12 @@ FROM python:3.12-slim # Set working directory WORKDIR /app +# Install libmagic +RUN apt-get update && apt-get install -y --no-install-recommends \ + libmagic1 \ + file \ + && rm -rf /var/lib/apt/lists/* + # Copy requirements and install dependencies COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt diff --git a/src/file-processor/app/api/__init__.py b/src/file-processor/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/file-processor/app/api/dependencies.py b/src/file-processor/app/api/dependencies.py new file mode 100644 index 0000000..e52a3de --- /dev/null +++ b/src/file-processor/app/api/dependencies.py @@ -0,0 +1,100 @@ +import jwt +from fastapi import Depends, HTTPException +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from jwt import InvalidTokenError +from starlette import status + +from app.config import settings +from app.database.connection import get_database +from app.models.auth import UserRole +from app.models.user import UserInDB +from app.services.auth_service import AuthService +from app.services.user_service import UserService + +security = HTTPBearer() + + +def get_auth_service() -> AuthService: + """Dependency to get AuthService instance.""" + return AuthService() + + +def get_user_service() -> UserService: + """Dependency to get UserService instance.""" + database = get_database() + return UserService(database) + + +def get_current_user( + credentials: HTTPAuthorizationCredentials = Depends(security), + user_service: UserService = Depends(get_user_service) +) -> UserInDB: + """ + Dependency to get current authenticated user from JWT token. 
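+
+    Decodes the bearer token with the configured JWT secret, reads the "sub"
+    claim as the username, then loads the user and checks that it is active.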
+ + Args: + credentials: HTTP Bearer credentials + user_service: Auth service instance + + Returns: + User: Current authenticated user + + Raises: + HTTPException: If token is invalid or user not found + """ + try: + payload = jwt.decode( + credentials.credentials, + settings.get_jwt_secret_key(), + algorithms=[settings.get_jwt_algorithm()] + ) + username: str = payload.get("sub") + if username is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + except InvalidTokenError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + + user = user_service.get_user_by_username(username) + if user is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + + if not user.is_active: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Inactive user" + ) + + return user + + +def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB: + """ + Dependency to ensure current user has admin role. + + Args: + current_user: Current authenticated user + + Returns: + User: Current user if admin + + Raises: + HTTPException: If user is not admin + """ + if current_user.role != UserRole.ADMIN: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Not enough permissions" + ) + return current_user diff --git a/src/file-processor/app/api/routes/__init__.py b/src/file-processor/app/api/routes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/file-processor/app/api/routes/auth.py b/src/file-processor/app/api/routes/auth.py new file mode 100644 index 0000000..a15b8a4 --- /dev/null +++ b/src/file-processor/app/api/routes/auth.py @@ -0,0 +1,80 @@ +from fastapi import APIRouter, Depends, HTTPException, status +from fastapi.security import OAuth2PasswordRequestForm + +from app.api.dependencies import get_auth_service, get_current_user, get_user_service +from app.models.auth import LoginResponse, UserResponse +from app.models.user import UserInDB +from app.services.auth_service import AuthService +from app.services.user_service import UserService + +router = APIRouter(tags=["authentication"]) + + +@router.post("/login", response_model=LoginResponse) +def login( + form_data: OAuth2PasswordRequestForm = Depends(), + auth_service: AuthService = Depends(get_auth_service), + user_service: UserService = Depends(get_user_service) +): + """ + Authenticate user and return JWT token. 
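+
+    Verifies the submitted credentials against the stored password hash and,
+    on success, issues a JWT access token whose "sub" claim is the username.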
+
+    Args:
+        form_data: OAuth2 password form data
+        auth_service: Auth service instance
+        user_service: User service instance
+
+    Returns:
+        LoginResponse: JWT token and user info
+
+    Raises:
+        HTTPException: If authentication fails
+    """
+    incorrect_username_or_pwd = HTTPException(
+        status_code=status.HTTP_401_UNAUTHORIZED,
+        detail="Incorrect username or password",
+        headers={"WWW-Authenticate": "Bearer"},
+    )
+
+    user = user_service.get_user_by_username(form_data.username)
+    if (not user or
+            not user.is_active or
+            not auth_service.verify_user_password(form_data.password, user.hashed_password)):
+        raise incorrect_username_or_pwd
+
+    access_token = auth_service.create_access_token(data={"sub": user.username})
+
+    return LoginResponse(
+        access_token=access_token,
+        user=UserResponse(
+            _id=user.id,
+            username=user.username,
+            email=user.email,
+            role=user.role,
+            is_active=user.is_active,
+            created_at=user.created_at,
+            updated_at=user.updated_at
+        )
+    )
+
+
+@router.get("/me", response_model=UserResponse)
+def get_current_user_profile(current_user: UserInDB = Depends(get_current_user)):
+    """
+    Get current user profile.
+
+    Args:
+        current_user: Current authenticated user
+
+    Returns:
+        UserResponse: Current user profile without sensitive data
+    """
+    return UserResponse(
+        _id=current_user.id,
+        username=current_user.username,
+        email=current_user.email,
+        role=current_user.role,
+        is_active=current_user.is_active,
+        created_at=current_user.created_at,
+        updated_at=current_user.updated_at
+    )
diff --git a/src/file-processor/app/api/routes/users.py b/src/file-processor/app/api/routes/users.py
new file mode 100644
index 0000000..9d4a01a
--- /dev/null
+++ b/src/file-processor/app/api/routes/users.py
@@ -0,0 +1,172 @@
+from fastapi import APIRouter, Depends, HTTPException
+from starlette import status
+
+from app.api.dependencies import get_admin_user, get_user_service
+from app.models.auth import UserResponse, MessageResponse
+from app.models.types import PyObjectId
+from app.models.user import UserInDB, UserCreate, UserUpdate
+from app.services.user_service import UserService
+
+router = APIRouter(tags=["users"])
+
+
+@router.get("", response_model=list[UserResponse])
+def list_users(
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    List all users (admin only).
+
+    Args:
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        List[UserResponse]: List of all users without sensitive data
+    """
+    return user_service.list_users()
+
+
+@router.get("/{user_id}", response_model=UserResponse)
+def get_user_by_id(
+    user_id: PyObjectId,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Get specific user by ID (admin only).
+
+    Args:
+        user_id: User ID to retrieve
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        UserResponse: User information without sensitive data
+
+    Raises:
+        HTTPException: If user not found
+    """
+    user = user_service.get_user_by_id(str(user_id))
+    if not user:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="User not found"
+        )
+
+    return user
+
+
+@router.post("", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
+def create_user(
+    user_data: UserCreate,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Create new user (admin only).
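+
+    Delegates validation to UserService; any ValueError raised there is
+    translated into a 400 Bad Request response.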
+ + Args: + user_data: User creation data + admin_user: Current admin user + user_service: User service instance + + Returns: + UserResponse: Created user information without sensitive data + + Raises: + HTTPException: If user creation fails + """ + try: + user = user_service.create_user(user_data) + return UserResponse( + _id=user.id, + username=user.username, + email=user.email, + role=user.role, + is_active=user.is_active, + created_at=user.created_at, + updated_at=user.updated_at + ) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e) + ) + + +@router.put("/{user_id}", response_model=UserResponse) +def update_user( + user_id: PyObjectId, + user_data: UserUpdate, + admin_user: UserInDB = Depends(get_admin_user), + user_service: UserService = Depends(get_user_service) +): + """ + Update existing user (admin only). + + Args: + user_id: User ID to update + user_data: User update data + admin_user: Current admin user + user_service: User service instance + + Returns: + UserResponse: Updated user information without sensitive data + + Raises: + HTTPException: If user not found or update fails + """ + try: + user = user_service.update_user(str(user_id), user_data) + if not user: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="User not found" + ) + + return UserResponse( + _id=user.id, + username=user.username, + email=user.email, + role=user.role, + is_active=user.is_active, + created_at=user.created_at, + updated_at=user.updated_at + ) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(e) + ) + + +@router.delete("/{user_id}", response_model=MessageResponse) +def delete_user( + user_id: PyObjectId, + admin_user: UserInDB = Depends(get_admin_user), + user_service: UserService = Depends(get_user_service) +): + """ + Delete user by ID (admin only). + + Args: + user_id: User ID to delete + admin_user: Current admin user + user_service: User service instance + + Returns: + MessageResponse: Success message + + Raises: + HTTPException: If user not found or deletion fails + """ + success = user_service.delete_user(str(user_id)) + if not success: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="User not found" + ) + + return MessageResponse(message="User successfully deleted") diff --git a/src/file-processor/app/config/settings.py b/src/file-processor/app/config/settings.py index 68720c1..81fdb1c 100644 --- a/src/file-processor/app/config/settings.py +++ b/src/file-processor/app/config/settings.py @@ -6,7 +6,6 @@ using simple os.getenv() approach without external validation libraries. """ import os -from typing import Optional def get_mongodb_url() -> str: @@ -31,6 +30,26 @@ def get_mongodb_database_name() -> str: return os.getenv("MONGODB_DATABASE", "mydocmanager") +def get_redis_url() -> str: + return os.getenv("REDIS_URL", "redis://localhost:6379/0") + + +# def get_redis_host() -> str: +# redis_url = get_redis_url() +# if redis_url.startswith("redis://"): +# return redis_url.split("redis://")[1].split("/")[0] +# else: +# return redis_url +# +# +# def get_redis_port() -> int: +# redis_url = get_redis_url() +# if redis_url.startswith("redis://"): +# return int(redis_url.split("redis://")[1].split("/")[0].split(":")[1]) +# else: +# return int(redis_url.split(":")[1]) + + def get_jwt_secret_key() -> str: """ Get JWT secret key from environment variables. 
@@ -82,4 +101,19 @@ def is_development_environment() -> bool: Returns: bool: True if development environment """ - return os.getenv("ENVIRONMENT", "development").lower() == "development" \ No newline at end of file + return os.getenv("ENVIRONMENT", "development").lower() == "development" + + +def get_objects_folder() -> str: + """ + Get Vault path from environment variables. + + Returns: + str: Vault path + """ + return os.getenv("OBJECTS_FOLDER", "/objects") + + +def watch_directory() -> str: + """Directory to monitor for new files""" + return os.getenv("WATCH_DIRECTORY", "/watched_files") diff --git a/src/file-processor/app/database/connection.py b/src/file-processor/app/database/connection.py index bba8f82..48295cb 100644 --- a/src/file-processor/app/database/connection.py +++ b/src/file-processor/app/database/connection.py @@ -7,6 +7,7 @@ The application will terminate if MongoDB is not accessible at startup. import sys from typing import Optional + from pymongo import MongoClient from pymongo.database import Database from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError @@ -107,6 +108,15 @@ def get_mongodb_client() -> Optional[MongoClient]: return _client +def get_extra_args(session): + # Build kwargs only if session is provided + kwargs = {} + if session is not None: + kwargs["session"] = session + + return kwargs + + def test_database_connection() -> bool: """ Test if database connection is working. @@ -122,4 +132,4 @@ def test_database_connection() -> bool: db.command('ping') return True except Exception: - return False \ No newline at end of file + return False diff --git a/src/file-processor/app/database/repositories/document_content_repository.py b/src/file-processor/app/database/repositories/document_content_repository.py deleted file mode 100644 index 16b2bd7..0000000 --- a/src/file-processor/app/database/repositories/document_content_repository.py +++ /dev/null @@ -1,214 +0,0 @@ -from typing import List, Optional -from datetime import datetime -from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection -from pymongo.errors import DuplicateKeyError, PyMongoError -from bson import ObjectId - -from app.models.document import DocumentContent - - -class DocumentContentRepository: - """ - Repository class for document content CRUD operations in MongoDB. - - This class handles all database operations related to document content, - following the repository pattern with dependency injection and async/await. - """ - - def __init__(self, database: AsyncIOMotorDatabase): - """ - Initialize repository with database dependency. - - Args: - database (AsyncIOMotorDatabase): MongoDB database instance - """ - self.db = database - self.collection: AsyncIOMotorCollection = database.document_contents - self._ensure_indexes() - - async def initialize(self): - """ - Initialize repository by ensuring required indexes exist. - - Should be called after repository instantiation to setup database indexes. - """ - await self._ensure_indexes() - - async def _ensure_indexes(self): - """ - Ensure required database indexes exist. - - Creates unique index on file_hash field to prevent duplicates. - """ - try: - await self.collection.create_index("file_hash", unique=True) - except PyMongoError: - # Index might already exist, ignore error - pass - - async def create_document_content(self, document_content: DocumentContent) -> DocumentContent: - """ - Create a new document content in the database. 
- - Args: - document_content (DocumentContent): Document content data - - Returns: - DocumentContent: Created document content with database ID - - Raises: - DuplicateKeyError: If file_hash already exists - ValueError: If document content creation fails due to validation - """ - document_dict = document_content.model_dump(by_alias=True, exclude_unset=True) - - # Remove _id if it's None to let MongoDB generate it - if document_dict.get("_id") is None: - document_dict.pop("_id", None) - - try: - result = await self.collection.insert_one(document_dict) - document_dict["_id"] = result.inserted_id - return DocumentContent(**document_dict) - except DuplicateKeyError as e: - raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}") - except PyMongoError as e: - raise ValueError(f"Failed to create document content: {e}") - - async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]: - """ - Find document content by ID. - - Args: - document_id (str): Document content ID to search for - - Returns: - DocumentContent or None: Document content if found, None otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return None - - document_doc = await self.collection.find_one({"_id": ObjectId(document_id)}) - if document_doc: - return DocumentContent(**document_doc) - return None - except PyMongoError: - return None - - async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]: - """ - Find document content by file hash. - - Args: - file_hash (str): File hash to search for - - Returns: - DocumentContent or None: Document content if found, None otherwise - """ - try: - document_doc = await self.collection.find_one({"file_hash": file_hash}) - if document_doc: - return DocumentContent(**document_doc) - return None - except PyMongoError: - return None - - async def content_exists(self, file_hash: str) -> bool: - """ - Check if document content exists by file hash. - - Args: - file_hash (str): File hash to check - - Returns: - bool: True if document content exists, False otherwise - """ - try: - count = await self.collection.count_documents({"file_hash": file_hash}) - return count > 0 - except PyMongoError: - return False - - async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]: - """ - Update document content information. - - Args: - document_id (str): Document content ID to update - update_data (dict): Updated document content data - - Returns: - DocumentContent or None: Updated document content if found, None otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return None - - # Remove None values and _id from update data - clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"} - - if not clean_update_data: - return await self.find_document_content_by_id(document_id) - - result = await self.collection.find_one_and_update( - {"_id": ObjectId(document_id)}, - {"$set": clean_update_data}, - return_document=True - ) - - if result: - return DocumentContent(**result) - return None - - except PyMongoError: - return None - - async def delete_document_content(self, document_id: str) -> bool: - """ - Delete document content from database. 
- - Args: - document_id (str): Document content ID to delete - - Returns: - bool: True if document content was deleted, False otherwise - """ - try: - if not ObjectId.is_valid(document_id): - return False - - result = await self.collection.delete_one({"_id": ObjectId(document_id)}) - return result.deleted_count > 0 - except PyMongoError: - return False - - async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]: - """ - List document contents with pagination. - - Args: - skip (int): Number of document contents to skip (default: 0) - limit (int): Maximum number of document contents to return (default: 100) - - Returns: - List[DocumentContent]: List of document contents - """ - try: - cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1) - document_docs = await cursor.to_list(length=limit) - return [DocumentContent(**document_doc) for document_doc in document_docs] - except PyMongoError: - return [] - - async def count_document_contents(self) -> int: - """ - Count total number of document contents. - - Returns: - int: Total number of document contents in database - """ - try: - return await self.collection.count_documents({}) - except PyMongoError: - return 0 diff --git a/src/file-processor/app/database/repositories/document_repository.py b/src/file-processor/app/database/repositories/document_repository.py index 74552db..0d8b6dc 100644 --- a/src/file-processor/app/database/repositories/document_repository.py +++ b/src/file-processor/app/database/repositories/document_repository.py @@ -6,9 +6,13 @@ in MongoDB with proper error handling and type safety. """ from typing import Optional, List + from bson import ObjectId +from pymongo.collection import Collection +from pymongo.database import Database from pymongo.errors import DuplicateKeyError, PyMongoError -from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase + +from app.database.connection import get_extra_args from app.models.document import FileDocument from app.utils.document_matching import fuzzy_matching, subsequence_matching @@ -34,52 +38,49 @@ class FileDocumentRepository: with proper error handling and data validation. """ - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): """Initialize file repository with database connection.""" self.db = database - self.collection: AsyncIOMotorCollection = self.db.files - self._ensure_indexes() + self.collection: Collection = self.db.documents - async def initialize(self): + def initialize(self): """ Initialize repository by ensuring required indexes exist. Should be called after repository instantiation to setup database indexes. """ - await self._ensure_indexes() + self._ensure_indexes() + return self - async def _ensure_indexes(self): + def _ensure_indexes(self): """ Ensure required database indexes exist. Creates unique index on username field to prevent duplicates. """ - try: - await self.collection.create_index("filepath", unique=True) - except PyMongoError: - # Index might already exist, ignore error - pass + pass - async def create_document(self, file_data: FileDocument) -> FileDocument: + def create_document(self, file_data: FileDocument, session=None) -> FileDocument: """ Create a new file document in database. 
Args: file_data (FileDocument): File document data to create + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: - FileDocument: Created file document with database ID + FileDocument: Created document with database ID Raises: ValueError: If file creation fails due to validation - DuplicateKeyError: If file with same hash already exists + DuplicateKeyError: If a document with same hash already exists """ try: file_dict = file_data.model_dump(by_alias=True, exclude_unset=True) if "_id" in file_dict and file_dict["_id"] is None: del file_dict["_id"] - result = await self.collection.insert_one(file_dict) + result = self.collection.insert_one(file_dict, **get_extra_args(session)) file_data.id = result.inserted_id return file_data @@ -88,7 +89,7 @@ class FileDocumentRepository: except PyMongoError as e: raise ValueError(f"Failed to create file document: {e}") - async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]: + def find_document_by_id(self, file_id: str) -> Optional[FileDocument]: """ Find file document by ID. @@ -102,7 +103,7 @@ class FileDocumentRepository: if not ObjectId.is_valid(file_id): return None - file_doc = await self.collection.find_one({"_id": ObjectId(file_id)}) + file_doc = self.collection.find_one({"_id": ObjectId(file_id)}) if file_doc: return FileDocument(**file_doc) return None @@ -110,7 +111,7 @@ class FileDocumentRepository: except PyMongoError: return None - async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: + def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: """ Find file document by file hash to detect duplicates. @@ -121,7 +122,7 @@ class FileDocumentRepository: FileDocument or None: File document if found, None otherwise """ try: - file_doc = await self.collection.find_one({"file_hash": file_hash}) + file_doc = self.collection.find_one({"file_hash": file_hash}) if file_doc: return FileDocument(**file_doc) return None @@ -129,7 +130,7 @@ class FileDocumentRepository: except PyMongoError: return None - async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: + def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: """ Find file document by exact filepath. @@ -140,7 +141,7 @@ class FileDocumentRepository: FileDocument or None: File document if found, None otherwise """ try: - file_doc = await self.collection.find_one({"filepath": filepath}) + file_doc = self.collection.find_one({"filepath": filepath}) if file_doc: return FileDocument(**file_doc) return None @@ -148,7 +149,7 @@ class FileDocumentRepository: except PyMongoError: return None - async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]: + def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]: """ Find file documents by filename using fuzzy matching. 
@@ -162,8 +163,7 @@ class FileDocumentRepository: try: # Get all files from database cursor = self.collection.find({}) - all_files = await cursor.to_list(length=None) - all_documents = [FileDocument(**file_doc) for file_doc in all_files] + all_documents = [FileDocument(**file_doc) for file_doc in cursor] if isinstance(matching_method, FuzzyMatching): return fuzzy_matching(filename, all_documents, matching_method.threshold) @@ -173,7 +173,7 @@ class FileDocumentRepository: except PyMongoError: return [] - async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]: + def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]: """ List file documents with pagination. @@ -186,13 +186,12 @@ class FileDocumentRepository: """ try: cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1) - file_docs = await cursor.to_list(length=limit) - return [FileDocument(**doc) for doc in file_docs] + return [FileDocument(**doc) for doc in cursor] except PyMongoError: return [] - async def count_documents(self) -> int: + def count_documents(self) -> int: """ Count total number of file documents. @@ -200,17 +199,18 @@ class FileDocumentRepository: int: Total number of file documents in collection """ try: - return await self.collection.count_documents({}) + return self.collection.count_documents({}) except PyMongoError: return 0 - async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]: + def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]: """ Update file document with new data. Args: file_id (str): File document ID to update update_data (dict): Fields to update + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: FileDocument or None: Updated file document if successful, None otherwise @@ -223,12 +223,13 @@ class FileDocumentRepository: clean_update_data = {k: v for k, v in update_data.items() if v is not None} if not clean_update_data: - return await self.find_document_by_id(file_id) + return self.find_document_by_id(file_id) - result = await self.collection.find_one_and_update( + result = self.collection.find_one_and_update( {"_id": ObjectId(file_id)}, {"$set": clean_update_data}, - return_document=True + return_document=True, + **get_extra_args(session) ) if result: @@ -238,12 +239,13 @@ class FileDocumentRepository: except PyMongoError: return None - async def delete_document(self, file_id: str) -> bool: + def delete_document(self, file_id: str, session=None) -> bool: """ Delete file document from database. Args: file_id (str): File document ID to delete + session (AsyncIOMotorClientSession, optional): MongoDB session Returns: bool: True if file was deleted, False otherwise @@ -252,7 +254,7 @@ class FileDocumentRepository: if not ObjectId.is_valid(file_id): return False - result = await self.collection.delete_one({"_id": ObjectId(file_id)}) + result = self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session)) return result.deleted_count > 0 except PyMongoError: diff --git a/src/file-processor/app/database/repositories/job_repository.py b/src/file-processor/app/database/repositories/job_repository.py new file mode 100644 index 0000000..d939b05 --- /dev/null +++ b/src/file-processor/app/database/repositories/job_repository.py @@ -0,0 +1,230 @@ +""" +Repository for managing processing jobs in MongoDB. 
+
+This module provides the data access layer for ProcessingJob operations
+with automatic timestamp management and error handling.
+"""
+
+from datetime import datetime
+from typing import List, Optional
+
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.errors import PyMongoError
+
+from app.exceptions.job_exceptions import JobRepositoryError
+from app.models.job import ProcessingJob, ProcessingStatus
+from app.models.types import PyObjectId
+
+
+class JobRepository:
+    """
+    Repository for processing job data access operations.
+
+    Provides CRUD operations for ProcessingJob documents with automatic
+    timestamp management and proper error handling.
+    """
+
+    def __init__(self, database: Database):
+        """Initialize repository with MongoDB collection reference."""
+        self.db = database
+        self.collection: Collection = self.db.processing_jobs
+
+    def _ensure_indexes(self):
+        """
+        Ensure required database indexes exist.
+
+        Creates a unique index on the document_id field.
+        """
+        try:
+            self.collection.create_index("document_id", unique=True)
+        except PyMongoError:
+            # Index might already exist, ignore error
+            pass
+
+    def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to set up database indexes.
+        """
+        self._ensure_indexes()
+        return self
+
+    def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
+        """
+        Create a new processing job.
+
+        Args:
+            document_id: Reference to the file document
+            task_id: Optional Celery task UUID
+
+        Returns:
+            The created ProcessingJob
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            job_data = {
+                "document_id": document_id,
+                "status": ProcessingStatus.PENDING,
+                "task_id": task_id,
+                "created_at": datetime.now(),
+                "started_at": None,
+                "completed_at": None,
+                "error_message": None
+            }
+
+            result = self.collection.insert_one(job_data)
+            job_data["_id"] = result.inserted_id
+
+            return ProcessingJob(**job_data)
+
+        except PyMongoError as e:
+            raise JobRepositoryError("create_job", e)
+
+    def find_job_by_id(self, job_id: PyObjectId) -> Optional[ProcessingJob]:
+        """
+        Retrieve a job by its ID.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The ProcessingJob if found, None otherwise
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            job_data = self.collection.find_one({"_id": job_id})
+            if job_data:
+                return ProcessingJob(**job_data)
+
+            return None
+
+        except PyMongoError as e:
+            raise JobRepositoryError("find_job_by_id", e)
+
+    def update_job_status(
+        self,
+        job_id: PyObjectId,
+        status: ProcessingStatus,
+        error_message: Optional[str] = None
+    ) -> Optional[ProcessingJob]:
+        """
+        Update job status with automatic timestamp management.
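+
+        Sets started_at when the status becomes PROCESSING, and completed_at
+        when it becomes COMPLETED or FAILED.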
+
+        Args:
+            job_id: The job ObjectId
+            status: New processing status
+            error_message: Optional error message for failed jobs
+
+        Returns:
+            The updated ProcessingJob if found, None otherwise
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            # Prepare update data
+            update_data = {"status": status}
+
+            # Set appropriate timestamp based on status
+            current_time = datetime.now()
+            if status == ProcessingStatus.PROCESSING:
+                update_data["started_at"] = current_time
+            elif status in (ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
+                update_data["completed_at"] = current_time
+
+            # Add error message if provided
+            if error_message is not None:
+                update_data["error_message"] = error_message
+
+            result = self.collection.find_one_and_update(
+                {"_id": job_id},
+                {"$set": update_data},
+                return_document=True
+            )
+
+            if result:
+                return ProcessingJob(**result)
+
+            return None
+
+        except PyMongoError as e:
+            raise JobRepositoryError("update_job_status", e)
+
+    def delete_job(self, job_id: PyObjectId) -> bool:
+        """
+        Delete a job from the database.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            True if job was deleted, False if not found
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            result = self.collection.delete_one({"_id": job_id})
+
+            return result.deleted_count > 0
+
+        except PyMongoError as e:
+            raise JobRepositoryError("delete_job", e)
+
+    def find_jobs_by_document_id(self, document_id: PyObjectId) -> List[ProcessingJob]:
+        """
+        Retrieve all jobs for a specific file document.
+
+        Args:
+            document_id: The file document ObjectId
+
+        Returns:
+            List of ProcessingJob documents
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            cursor = self.collection.find({"document_id": document_id})
+
+            jobs = []
+            for job_data in cursor:
+                jobs.append(ProcessingJob(**job_data))
+
+            return jobs
+
+        except PyMongoError as e:
+            raise JobRepositoryError("find_jobs_by_document_id", e)
+
+    def get_jobs_by_status(self, status: ProcessingStatus) -> List[ProcessingJob]:
+        """
+        Retrieve all jobs with a specific status.
+
+        Args:
+            status: The processing status to filter by
+
+        Returns:
+            List of ProcessingJob documents
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            cursor = self.collection.find({"status": status})
+
+            jobs = []
+            for job_data in cursor:
+                jobs.append(ProcessingJob(**job_data))
+
+            return jobs
+
+        except PyMongoError as e:
+            raise JobRepositoryError("get_jobs_by_status", e)
diff --git a/src/file-processor/app/database/repositories/user_repository.py b/src/file-processor/app/database/repositories/user_repository.py
index c3b29e3..72365df 100644
--- a/src/file-processor/app/database/repositories/user_repository.py
+++ b/src/file-processor/app/database/repositories/user_repository.py
@@ -5,10 +5,12 @@ This module implements the repository pattern for user CRUD operations
-with dependency injection of the database connection using async/await.
+with dependency injection of the database connection using synchronous pymongo.
"""

-from typing import Optional, List
from datetime import datetime
+from typing import Optional, List
+
from bson import ObjectId
-from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
+from pymongo.collection import Collection
+from pymongo.database import Database
from pymongo.errors import DuplicateKeyError, PyMongoError

from app.models.user import UserCreate, UserInDB, UserUpdate
@@ -23,7 +25,7 @@ class UserRepository:
-    following the repository pattern with dependency injection and async/await.
+    following the repository pattern with dependency injection (synchronous pymongo).
""" - def __init__(self, database: AsyncIOMotorDatabase): + def __init__(self, database: Database): """ Initialize repository with database dependency. @@ -31,30 +33,30 @@ class UserRepository: database (AsyncIOMotorDatabase): MongoDB database instance """ self.db = database - self.collection: AsyncIOMotorCollection = database.users - self._ensure_indexes() + self.collection: Collection = database.users - async def initialize(self): + def initialize(self): """ Initialize repository by ensuring required indexes exist. Should be called after repository instantiation to setup database indexes. """ - await self._ensure_indexes() + self._ensure_indexes() + return self - async def _ensure_indexes(self): + def _ensure_indexes(self): """ Ensure required database indexes exist. Creates unique index on username field to prevent duplicates. """ try: - await self.collection.create_index("username", unique=True) + self.collection.create_index("username", unique=True) except PyMongoError: # Index might already exist, ignore error pass - async def create_user(self, user_data: UserCreate) -> UserInDB: + def create_user(self, user_data: UserCreate) -> UserInDB: """ Create a new user in the database. @@ -79,7 +81,7 @@ class UserRepository: } try: - result = await self.collection.insert_one(user_dict) + result = self.collection.insert_one(user_dict) user_dict["_id"] = result.inserted_id return UserInDB(**user_dict) except DuplicateKeyError as e: @@ -87,7 +89,7 @@ class UserRepository: except PyMongoError as e: raise ValueError(f"Failed to create user: {e}") - async def find_user_by_username(self, username: str) -> Optional[UserInDB]: + def find_user_by_username(self, username: str) -> Optional[UserInDB]: """ Find user by username. @@ -98,14 +100,14 @@ class UserRepository: UserInDB or None: User if found, None otherwise """ try: - user_doc = await self.collection.find_one({"username": username}) + user_doc = self.collection.find_one({"username": username}) if user_doc: return UserInDB(**user_doc) return None except PyMongoError: return None - async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]: + def find_user_by_id(self, user_id: str) -> Optional[UserInDB]: """ Find user by ID. @@ -119,14 +121,14 @@ class UserRepository: if not ObjectId.is_valid(user_id): return None - user_doc = await self.collection.find_one({"_id": ObjectId(user_id)}) + user_doc = self.collection.find_one({"_id": ObjectId(user_id)}) if user_doc: return UserInDB(**user_doc) return None except PyMongoError: return None - async def find_user_by_email(self, email: str) -> Optional[UserInDB]: + def find_user_by_email(self, email: str) -> Optional[UserInDB]: """ Find user by email address. @@ -137,14 +139,14 @@ class UserRepository: UserInDB or None: User if found, None otherwise """ try: - user_doc = await self.collection.find_one({"email": email}) + user_doc = self.collection.find_one({"email": email}) if user_doc: return UserInDB(**user_doc) return None except PyMongoError: return None - async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: + def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]: """ Update user information. 
@@ -177,9 +179,9 @@ class UserRepository: clean_update_data = {k: v for k, v in update_data.items() if v is not None} if not clean_update_data: - return await self.find_user_by_id(user_id) + return self.find_user_by_id(user_id) - result = await self.collection.find_one_and_update( + result = self.collection.find_one_and_update( {"_id": ObjectId(user_id)}, {"$set": clean_update_data}, return_document=True @@ -192,7 +194,7 @@ class UserRepository: except PyMongoError: return None - async def delete_user(self, user_id: str) -> bool: + def delete_user(self, user_id: str) -> bool: """ Delete user from database. @@ -206,12 +208,12 @@ class UserRepository: if not ObjectId.is_valid(user_id): return False - result = await self.collection.delete_one({"_id": ObjectId(user_id)}) + result = self.collection.delete_one({"_id": ObjectId(user_id)}) return result.deleted_count > 0 except PyMongoError: return False - async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: + def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]: """ List users with pagination. @@ -224,12 +226,12 @@ class UserRepository: """ try: cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1) - user_docs = await cursor.to_list(length=limit) + user_docs = cursor.to_list(length=limit) return [UserInDB(**user_doc) for user_doc in user_docs] except PyMongoError: return [] - async def count_users(self) -> int: + def count_users(self) -> int: """ Count total number of users. @@ -237,11 +239,11 @@ class UserRepository: int: Total number of users in database """ try: - return await self.collection.count_documents({}) + return self.collection.count_documents({}) except PyMongoError: return 0 - async def user_exists(self, username: str) -> bool: + def user_exists(self, username: str) -> bool: """ Check if user exists by username. @@ -252,7 +254,7 @@ class UserRepository: bool: True if user exists, False otherwise """ try: - count = await self.collection.count_documents({"username": username}) + count = self.collection.count_documents({"username": username}) return count > 0 except PyMongoError: return False diff --git a/src/file-processor/app/exceptions/__init__.py b/src/file-processor/app/exceptions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/file-processor/app/exceptions/job_exceptions.py b/src/file-processor/app/exceptions/job_exceptions.py new file mode 100644 index 0000000..65bdd17 --- /dev/null +++ b/src/file-processor/app/exceptions/job_exceptions.py @@ -0,0 +1,38 @@ +""" +Custom exceptions for job management operations. + +This module defines specific exceptions for job processing lifecycle +and repository operations to provide clear error handling. +""" + +from app.models.job import ProcessingStatus + + +class InvalidStatusTransitionError(Exception): + """ + Raised when an invalid status transition is attempted. + + This exception indicates that an attempt was made to change a job's + status to an invalid target status given the current status. + """ + + def __init__(self, current_status: ProcessingStatus, target_status: ProcessingStatus): + self.current_status = current_status + self.target_status = target_status + super().__init__( + f"Invalid status transition from '{current_status}' to '{target_status}'" + ) + + +class JobRepositoryError(Exception): + """ + Raised when a MongoDB operation fails in the job repository. + + This exception wraps database-related errors that occur during + job repository operations. 
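+
+    A usage sketch (mirrors how the repository methods raise it; the
+    operation name here is illustrative):
+
+        try:
+            self.collection.insert_one(job_dict)
+        except PyMongoError as e:
+            raise JobRepositoryError("create_job", e)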
+    """
+
+    def __init__(self, operation: str, original_error: Exception):
+        self.operation = operation
+        self.original_error = original_error
+        super().__init__(f"Repository operation '{operation}' failed: {str(original_error)}")
diff --git a/src/file-processor/app/file_watcher.py b/src/file-processor/app/file_watcher.py
new file mode 100644
index 0000000..4bc5c72
--- /dev/null
+++ b/src/file-processor/app/file_watcher.py
@@ -0,0 +1,243 @@
+"""
+File watcher implementation with Watchdog observer and ProcessingJob management.
+
+This module provides real-time file monitoring for document processing.
+When a file is created in the watched directory, a Celery task is
+dispatched; the task then:
+1. Creates a document record via DocumentService
+2. Creates a ProcessingJob to track the task lifecycle
+"""
+
+import logging
+import threading
+from pathlib import Path
+from typing import Optional
+
+from watchdog.events import FileSystemEventHandler, FileCreatedEvent
+from watchdog.observers import Observer
+
+from app.services.document_service import DocumentService
+from app.services.job_service import JobService
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentFileEventHandler(FileSystemEventHandler):
+    """
+    Event handler for document file creation events.
+
+    Processes newly created files by dispatching Celery processing tasks.
+    """
+
+    SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
+
+    def __init__(self, document_service: DocumentService, job_service: JobService):
+        """
+        Initialize the event handler.
+
+        Args:
+            document_service: Service for document management
+            job_service: Service for processing job management
+        """
+        super().__init__()
+        self.document_service = document_service
+        self.job_service = job_service
+
+    def on_created(self, event: FileCreatedEvent) -> None:
+        """
+        Handle file creation events.
+
+        Args:
+            event: File system event containing file path information
+        """
+        if event.is_directory:
+            return
+
+        filepath = event.src_path
+        file_extension = Path(filepath).suffix.lower()
+
+        if file_extension not in self.SUPPORTED_EXTENSIONS:
+            logger.info(f"Ignoring unsupported file type: {filepath}")
+            return
+
+        logger.info(f"Processing new file: {filepath}")
+
+        try:
+            # Dispatch processing to the Celery worker
+            from tasks.document_processing import process_document
+
+            task_result = process_document.delay(filepath)
+            logger.info(f"Dispatched Celery task with ID: {task_result.task_id}")
+
+        except Exception as e:
+            logger.error(f"Failed to process file {filepath}: {str(e)}")
+            # Note: we don't re-raise the exception, to keep the watcher running
+
+
+class FileWatcher:
+    """
+    File system watcher for automatic document processing.
+
+    Monitors a directory for new files and triggers the processing pipeline
+    using a dedicated observer thread.
+    """
+
+    def __init__(
+        self,
+        watch_directory: str,
+        document_service: DocumentService,
+        job_service: JobService,
+        recursive: bool = True
+    ):
+        """
+        Initialize the file watcher.
+ + Args: + watch_directory: Directory path to monitor + document_service: Service for document management + job_service: Service for processing job management + recursive: Whether to watch subdirectories recursively + """ + self.watch_directory = Path(watch_directory) + self.recursive = recursive + self.observer: Optional[Observer] = None + self._observer_thread: Optional[threading.Thread] = None + self._stop_event = threading.Event() + + # Validate watch directory + if not self.watch_directory.exists(): + raise ValueError(f"Watch directory does not exist: {watch_directory}") + + if not self.watch_directory.is_dir(): + raise ValueError(f"Watch path is not a directory: {watch_directory}") + + # Create event handler + self.event_handler = DocumentFileEventHandler( + document_service=document_service, + job_service=job_service + ) + + logger.info(f"FileWatcher initialized for directory: {self.watch_directory}") + + def start(self) -> None: + """ + Start the file watcher in a separate thread. + + Raises: + RuntimeError: If the watcher is already running + """ + if self.is_running(): + raise RuntimeError("FileWatcher is already running") + + self.observer = Observer() + self.observer.schedule( + self.event_handler, + str(self.watch_directory), + recursive=self.recursive + ) + + # Start observer in separate thread + self._observer_thread = threading.Thread( + target=self._run_observer, + name="FileWatcher-Observer" + ) + self._stop_event.clear() + self._observer_thread.start() + + logger.info("FileWatcher started successfully") + + def stop(self, timeout: float = 5.0) -> None: + """ + Stop the file watcher gracefully. + + Args: + timeout: Maximum time to wait for graceful shutdown + """ + if not self.is_running(): + logger.warning("FileWatcher is not running") + return + + logger.info("Stopping FileWatcher...") + + # Signal stop and wait for observer thread + self._stop_event.set() + + if self.observer: + self.observer.stop() + + if self._observer_thread and self._observer_thread.is_alive(): + self._observer_thread.join(timeout=timeout) + + if self._observer_thread.is_alive(): + logger.warning("FileWatcher thread did not stop gracefully within timeout") + else: + logger.info("FileWatcher stopped gracefully") + + # Clean up + self.observer = None + self._observer_thread = None + + def is_running(self) -> bool: + """ + Check if the file watcher is currently running. + + Returns: + True if the watcher is running, False otherwise + """ + return ( + self.observer is not None + and self._observer_thread is not None + and self._observer_thread.is_alive() + ) + + def _run_observer(self) -> None: + """ + Internal method to run the observer in a separate thread. + + This method should not be called directly. + """ + if not self.observer: + logger.error("Observer not initialized") + return + + try: + self.observer.start() + logger.info("Observer thread started") + + # Keep the observer running until stop is requested + while not self._stop_event.is_set(): + self._stop_event.wait(timeout=1.0) + + logger.info("Observer thread stopping...") + + except Exception as e: + logger.error(f"Observer thread error: {str(e)}") + finally: + if self.observer: + self.observer.join() + logger.info("Observer thread stopped") + + +def create_file_watcher( + watch_directory: str, + document_service: DocumentService, + job_service: JobService +) -> FileWatcher: + """ + Factory function to create a FileWatcher instance. 
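+
+    A minimal usage sketch (the directory path and service wiring are
+    illustrative):
+
+        watcher = create_file_watcher(
+            watch_directory="/watched_files",
+            document_service=document_service,
+            job_service=job_service,
+        )
+        watcher.start()
+        ...
+        watcher.stop()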
+ + Args: + watch_directory: Directory path to monitor + document_service: Service for document management + job_service: Service for processing job management + + Returns: + Configured FileWatcher instance + """ + return FileWatcher( + watch_directory=watch_directory, + document_service=document_service, + job_service=job_service + ) diff --git a/src/file-processor/app/main.py b/src/file-processor/app/main.py index b2247e4..5bc661b 100644 --- a/src/file-processor/app/main.py +++ b/src/file-processor/app/main.py @@ -1,203 +1,169 @@ """ -FastAPI application for MyDocManager file processor service. +FastAPI application with integrated FileWatcher for document processing. -This service provides API endpoints for health checks and task dispatching. +This module provides the main FastAPI application with: +- JWT authentication +- User management APIs +- Real-time file monitoring via FileWatcher +- Document processing via Celery tasks """ import logging -import os from contextlib import asynccontextmanager -from fastapi import FastAPI, HTTPException, Depends -from pydantic import BaseModel -import redis -from celery import Celery +from typing import AsyncGenerator -from app.database.connection import test_database_connection, get_database -from app.database.repositories.user_repository import UserRepository -from app.models.user import UserCreate +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from app.api.routes.auth import router as auth_router +from app.api.routes.users import router as users_router +from app.config import settings +from app.database.connection import get_database +from app.file_watcher import create_file_watcher, FileWatcher +from app.services.document_service import DocumentService from app.services.init_service import InitializationService +from app.services.job_service import JobService from app.services.user_service import UserService # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +# Global file watcher instance +file_watcher: FileWatcher = None + @asynccontextmanager -async def lifespan(app: FastAPI): +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: """ - Application lifespan manager for startup and shutdown tasks. - - Handles initialization tasks that need to run when the application starts, - including admin user creation and other setup procedures. + FastAPI lifespan context manager. 
+ + Handles application startup and shutdown events including: + - Database connection + - Default admin user creation + - FileWatcher startup/shutdown """ - # Startup tasks + global file_watcher + + # Startup logger.info("Starting MyDocManager application...") try: # Initialize database connection database = get_database() + logger.info("Database connection established") - # Initialize repositories and services - user_repository = UserRepository(database) - user_service = UserService(user_repository) + document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder()) + job_service = JobService(database=database) + user_service = UserService(database=database) + logger.info("Service created") + + # Create default admin user init_service = InitializationService(user_service) + init_service.initialize_application() + logger.info("Default admin user initialization completed") - # Run initialization tasks - initialization_result = init_service.initialize_application() + # Create and start file watcher + file_watcher = create_file_watcher( + watch_directory=settings.watch_directory(), + document_service=document_service, + job_service=job_service + ) + file_watcher.start() + logger.info(f"FileWatcher started for directory: {settings.watch_directory()}") - if initialization_result["initialization_success"]: - logger.info("Application startup completed successfully") - if initialization_result["admin_user_created"]: - logger.info("Default admin user was created during startup") - else: - logger.error("Application startup completed with errors:") - for error in initialization_result["errors"]: - logger.error(f" - {error}") + logger.info("Application startup completed successfully") + + yield except Exception as e: - logger.error(f"Critical error during application startup: {str(e)}") - # You might want to decide if the app should continue or exit here - # For now, we log the error but continue + logger.error(f"Application startup failed: {str(e)}") + raise - yield # Application is running - - # Shutdown tasks (if needed) - logger.info("Shutting down MyDocManager application...") + finally: + # Shutdown + logger.info("Shutting down MyDocManager application...") + + if file_watcher and file_watcher.is_running(): + file_watcher.stop() + logger.info("FileWatcher stopped") + + logger.info("Application shutdown completed") -# Initialize FastAPI app +# Create FastAPI application app = FastAPI( - title="MyDocManager File Processor", - description="File processing and task dispatch service", - version="1.0.0", + title="MyDocManager", + description="Real-time document processing application with authentication", + version="0.1.0", lifespan=lifespan ) -# Environment variables -REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0") -MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017") - -# Initialize Redis client -try: - redis_client = redis.from_url(REDIS_URL) -except Exception as e: - redis_client = None - print(f"Warning: Could not connect to Redis: {e}") - -# Initialize Celery -celery_app = Celery( - "file_processor", - broker=REDIS_URL, - backend=REDIS_URL +# Configure CORS +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:5173"], # React frontend + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], ) +# Include routers +app.include_router(auth_router, prefix="/auth", tags=["Authentication"]) +app.include_router(users_router, prefix="/users", tags=["User Management"]) +# 
app.include_router(documents_router, prefix="/documents", tags=["Documents"])
+# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])
 
-# Pydantic models
-class TestTaskRequest(BaseModel):
-    """Request model for test task."""
-    message: str
-
-
-def get_user_service() -> UserService:
-    """
-    Dependency to get user service instance.
-
-    This should be properly implemented with database connection management
-    in your actual application.
-    """
-    database = get_database()
-    user_repository = UserRepository(database)
-    return UserService(user_repository)
-
-
-# Your API routes would use the service like this:
-@app.post("/api/users")
-async def create_user(
-    user_data: UserCreate,
-    user_service: UserService = Depends(get_user_service)
-):
-    return user_service.create_user(user_data)
 
 
 @app.get("/health")
 async def health_check():
     """
     Health check endpoint.
-    
+
     Returns:
-        dict: Service health status with dependencies
+        Dictionary containing application health status
     """
-    health_status = {
+    return {
         "status": "healthy",
-        "service": "file-processor",
-        "dependencies": {
-            "redis": "unknown",
-            "mongodb": "unknown"
-        },
+        "service": "MyDocManager",
+        "version": "0.1.0",
+        "file_watcher_running": file_watcher.is_running() if file_watcher else False
     }
-    
-    # Check Redis connection
-    if redis_client:
-        try:
-            redis_client.ping()
-            health_status["dependencies"]["redis"] = "connected"
-        except Exception:
-            health_status["dependencies"]["redis"] = "disconnected"
-            health_status["status"] = "degraded"
-    
-    # check MongoDB connection
-    if test_database_connection():
-        health_status["dependencies"]["mongodb"] = "connected"
-    else:
-        health_status["dependencies"]["mongodb"] = "disconnected"
-    
-    return health_status
-
-
-@app.post("/test-task")
-async def dispatch_test_task(request: TestTaskRequest):
-    """
-    Dispatch a test task to Celery worker.
-    
-    Args:
-        request: Test task request containing message
-    
-    Returns:
-        dict: Task dispatch information
-    
-    Raises:
-        HTTPException: If task dispatch fails
-    """
-    try:
-        # Send task to worker
-        task = celery_app.send_task(
-            "main.test_task",
-            args=[request.message]
-        )
-        
-        return {
-            "status": "dispatched",
-            "task_id": task.id,
-            "message": f"Test task dispatched with message: {request.message}"
-        }
-        
-    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=f"Failed to dispatch task: {str(e)}"
-        )
 
 
 @app.get("/")
 async def root():
     """
-    Root endpoint.
-    
+    Root endpoint with basic application information.
+
     Returns:
-        dict: Basic service information
+        Dictionary containing welcome message and available endpoints
     """
     return {
-        "service": "MyDocManager File Processor",
-        "version": "1.0.0",
-        "status": "running"
+        "message": "Welcome to MyDocManager",
+        "description": "Real-time document processing application",
+        "docs": "/docs",
+        "health": "/health"
+    }
+
+
+@app.get("/watcher/status")
+async def watcher_status():
+    """
+    Get file watcher status.
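+
+    Example payload when the watcher is up (values are illustrative):
+
+        {
+            "status": "initialized",
+            "running": true,
+            "watch_directory": "/watched_files",
+            "recursive": true
+        }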
+ + Returns: + Dictionary containing file watcher status information + """ + if not file_watcher: + return { + "status": "not_initialized", + "running": False + } + + return { + "status": "initialized", + "running": file_watcher.is_running(), + "watch_directory": str(file_watcher.watch_directory), + "recursive": file_watcher.recursive } diff --git a/src/file-processor/app/models/auth.py b/src/file-processor/app/models/auth.py index e40644a..5d3b83a 100644 --- a/src/file-processor/app/models/auth.py +++ b/src/file-processor/app/models/auth.py @@ -3,12 +3,45 @@ Authentication models and enums for user management. Contains user roles enumeration and authentication-related Pydantic models. """ - +from datetime import datetime from enum import Enum +from pydantic import BaseModel, Field + +from app.models.types import PyObjectId + class UserRole(str, Enum): """User roles enumeration with string values.""" USER = "user" - ADMIN = "admin" \ No newline at end of file + ADMIN = "admin" + + +class UserResponse(BaseModel): + """Model for user data in API responses (excludes password_hash).""" + + id: PyObjectId = Field(alias="_id") + username: str + email: str + role: UserRole + is_active: bool + created_at: datetime + updated_at: datetime + + model_config = { + "populate_by_name": True, + "arbitrary_types_allowed": True, + } + + +class LoginResponse(BaseModel): + """Response model for successful login.""" + access_token: str + token_type: str = "bearer" + user: UserResponse + + +class MessageResponse(BaseModel): + """Generic message response.""" + message: str diff --git a/src/file-processor/app/models/document.py b/src/file-processor/app/models/document.py index 1c22ef2..19d9bfe 100644 --- a/src/file-processor/app/models/document.py +++ b/src/file-processor/app/models/document.py @@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum): HYBRID = "hybrid" -class ProcessingStatus(str, Enum): - """Status values for processing jobs.""" - - PENDING = "pending" - PROCESSING = "processing" - COMPLETED = "completed" - FAILED = "failed" - - class FileDocument(BaseModel): """ Model for file documents stored in the 'files' collection. 
@@ -58,6 +49,9 @@ class FileDocument(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") + encoding: str = Field(default="utf-8", description="Character encoding for text files") + file_size: int = Field(..., ge=0, description="File size in bytes") + mime_type: str = Field(..., description="MIME type detected") @field_validator('filepath') @classmethod @@ -74,69 +68,3 @@ class FileDocument(BaseModel): if not v.strip(): raise ValueError("Filename cannot be empty") return v.strip() - - class Config: - """Pydantic configuration.""" - populate_by_name = True - arbitrary_types_allowed = True - json_encoders = {ObjectId: str} - - -class DocumentContent(BaseModel): - """Model for document content.""" - - id: Optional[PyObjectId] = Field(default=None, alias="_id") - file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") - content: str = Field(..., description="File content") - encoding: str = Field(default="utf-8", description="Character encoding for text files") - file_size: int = Field(..., ge=0, description="File size in bytes") - mime_type: str = Field(..., description="MIME type detected") - - -class ProcessingJob(BaseModel): - """ - Model for processing jobs stored in the 'processing_jobs' collection. - - Tracks the lifecycle and status of document processing tasks. - """ - - id: Optional[PyObjectId] = Field(default=None, alias="_id") - file_id: PyObjectId = Field(..., description="Reference to file document") - status: ProcessingStatus = Field( - default=ProcessingStatus.PENDING, - description="Current processing status" - ) - task_id: Optional[str] = Field( - default=None, - description="Celery task UUID" - ) - created_at: Optional[datetime] = Field( - default=None, - description="Timestamp when job was created" - ) - started_at: Optional[datetime] = Field( - default=None, - description="Timestamp when processing started" - ) - completed_at: Optional[datetime] = Field( - default=None, - description="Timestamp when processing completed" - ) - error_message: Optional[str] = Field( - default=None, - description="Error message if processing failed" - ) - - @field_validator('error_message') - @classmethod - def validate_error_message(cls, v: Optional[str]) -> Optional[str]: - """Clean up error message.""" - if v is not None: - return v.strip() if v.strip() else None - return v - - class Config: - """Pydantic configuration.""" - populate_by_name = True - arbitrary_types_allowed = True - json_encoders = {ObjectId: str} diff --git a/src/file-processor/app/models/job.py b/src/file-processor/app/models/job.py index e69de29..d71109e 100644 --- a/src/file-processor/app/models/job.py +++ b/src/file-processor/app/models/job.py @@ -0,0 +1,42 @@ +from datetime import datetime +from enum import Enum +from typing import Optional + +from bson import ObjectId +from pydantic import BaseModel, Field, field_validator + +from app.models.types import PyObjectId + + +class ProcessingStatus(str, Enum): + """Status values for processing jobs.""" + + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +class ProcessingJob(BaseModel): + """ + Model for processing jobs stored in the 'processing_jobs' collection. + + Tracks the lifecycle and status of document processing tasks. 
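+
+    The status lifecycle (enforced by JobService) is:
+    PENDING -> PROCESSING -> COMPLETED or FAILED.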
+    """
+
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    document_id: PyObjectId = Field(..., description="Reference to file document")
+    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
+    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
+    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
+    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
+    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
+    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
+
+    @field_validator('error_message')
+    @classmethod
+    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
+        """Clean up error message."""
+        if v is not None:
+            return v.strip() if v.strip() else None
+        return v
\ No newline at end of file
diff --git a/src/file-processor/app/models/user.py b/src/file-processor/app/models/user.py
index 5759b04..4b54e87 100644
--- a/src/file-processor/app/models/user.py
+++ b/src/file-processor/app/models/user.py
@@ -7,10 +7,10 @@ and API responses with proper validation and type safety.
 
 import re
 from datetime import datetime
-from typing import Optional, Any
+from typing import Optional
+
 from bson import ObjectId
 from pydantic import BaseModel, Field, field_validator, EmailStr
-from pydantic_core import core_schema
 
 from app.models.auth import UserRole
 from app.models.types import PyObjectId
@@ -138,21 +138,3 @@ class UserInDB(BaseModel):
         "arbitrary_types_allowed": True,
         "json_encoders": {ObjectId: str}
     }
-
-
-class UserResponse(BaseModel):
-    """Model for user data in API responses (excludes password_hash)."""
-
-    id: PyObjectId = Field(alias="_id")
-    username: str
-    email: str
-    role: UserRole
-    is_active: bool
-    created_at: datetime
-    updated_at: datetime
-
-    model_config = {
-        "populate_by_name": True,
-        "arbitrary_types_allowed": True,
-        "json_encoders": {ObjectId: str}
-    }
diff --git a/src/file-processor/app/services/auth_service.py b/src/file-processor/app/services/auth_service.py
index a7037d3..53cf040 100644
--- a/src/file-processor/app/services/auth_service.py
+++ b/src/file-processor/app/services/auth_service.py
@@ -4,7 +4,11 @@ Authentication service for password hashing and verification.
 This module provides authentication-related functionality including
 password hashing, verification, and JWT token management.
 """
+from datetime import datetime, timedelta, timezone
+
+import jwt
+
+from app.config import settings
 from app.utils.security import hash_password, verify_password
 
 
@@ -55,4 +59,26 @@ class AuthService:
         >>> auth.verify_user_password("wrongpassword", hashed)
         False
         """
-        return verify_password(password, hashed_password)
\ No newline at end of file
+        return verify_password(password, hashed_password)
+
+    @staticmethod
+    def create_access_token(data: dict) -> str:
+        """
+        Create a JWT access token.
+
+        Args:
+            data (dict): Payload data to include in the token.
+
+        Returns:
+            str: Encoded JWT token.
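+
+        Example (the "sub" claim name is illustrative, not required by
+        this service):
+
+            >>> token = AuthService.create_access_token({"sub": "admin"})
+            >>> isinstance(token, str)
+            True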
+        """
+        # Copy data to avoid modifying the original dict
+        to_encode = data.copy()
+
+        # Add expiration time (the JWT "exp" claim is interpreted as UTC)
+        expire = datetime.now(timezone.utc) + timedelta(hours=settings.get_jwt_expire_hours())
+        to_encode.update({"exp": expire})
+
+        # Encode JWT
+        encoded_jwt = jwt.encode(to_encode, settings.get_jwt_secret_key(), algorithm=settings.get_jwt_algorithm())
+        return encoded_jwt
diff --git a/src/file-processor/app/services/document_service.py b/src/file-processor/app/services/document_service.py
index da58712..9b30c8e 100644
--- a/src/file-processor/app/services/document_service.py
+++ b/src/file-processor/app/services/document_service.py
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
 """
 
 import hashlib
-import magic
+import os
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List, Optional, Dict, Any
 
-from motor.motor_asyncio import AsyncIOMotorClientSession
+import magic
 from pymongo.errors import PyMongoError
 
-from app.database.connection import get_database
+from app.config.settings import get_objects_folder
 from app.database.repositories.document_repository import FileDocumentRepository
-from app.database.repositories.document_content_repository import DocumentContentRepository
 from app.models.document import (
     FileDocument,
-    DocumentContent,
     FileType,
-    ProcessingStatus
 )
 from app.models.types import PyObjectId
 
@@ -34,13 +31,25 @@ class DocumentService:
-    and their content while ensuring data consistency through transactions.
+    and their content, which is stored on disk keyed by content hash.
 
-    def __init__(self):
-        """Initialize the document service with repository dependencies."""
-        self.db = get_database()
-        self.file_repository = FileDocumentRepository(self.db)
-        self.content_repository = DocumentContentRepository(self.db)
+    def __init__(self, database, objects_folder: str = None):
+        """
+        Initialize the document service with repository dependencies.
+
+        Args:
+            database: Database instance
+            objects_folder: Folder in which to store files by their hash
+        """
+
+        self.db = database
+        self.document_repository = FileDocumentRepository(self.db)
+        self.objects_folder = objects_folder or get_objects_folder()
 
-    def _calculate_file_hash(self, file_bytes: bytes) -> str:
+    def initialize(self):
+        """Initialize the underlying repository; returns self for chaining."""
+        self.document_repository.initialize()
+        return self
+
+    @staticmethod
+    def _calculate_file_hash(file_bytes: bytes) -> str:
         """
         Calculate SHA256 hash of file content.
 
@@ -52,7 +61,8 @@ class DocumentService:
         """
         return hashlib.sha256(file_bytes).hexdigest()
 
-    def _detect_file_type(self, file_path: str) -> FileType:
+    @staticmethod
+    def _detect_file_type(file_path: str) -> FileType:
         """
         Detect file type from file extension.
 
@@ -72,7 +82,8 @@ class DocumentService:
         except ValueError:
             raise ValueError(f"Unsupported file type: {extension}")
 
-    def _detect_mime_type(self, file_bytes: bytes) -> str:
+    @staticmethod
+    def _detect_mime_type(file_bytes: bytes) -> str:
         """
         Detect MIME type from file content.
 
@@ -84,10 +95,51 @@ class DocumentService:
         """
         return magic.from_buffer(file_bytes, mime=True)
 
-    async def create_document(
+    @staticmethod
+    def _read_file_bytes(file_path: str | Path) -> bytes:
+        """
+        Read file content as bytes.
+
+        Args:
+            file_path (str | Path): Path of the file to read
+
+        Returns:
+            bytes: Content of the file
+
+        Raises:
+            FileNotFoundError: If the file does not exist
+            OSError: If any I/O error occurs
+        """
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        return path.read_bytes()
+
+    def _get_document_path(self, file_hash: str) -> str:
+        """
+        Build the on-disk path for a stored content blob.
+
+        Args:
+            file_hash: SHA256 hash of the file content
+
+        Returns:
+            Path under objects_folder, sharded by the first 24 hash characters
+        """
+        return os.path.join(self.objects_folder, file_hash[:24], file_hash)
+
+    def save_content_if_needed(self, file_hash: str, content: bytes):
+        """
+        Persist content to the objects folder unless it is already stored.
+
+        Args:
+            file_hash: SHA256 hash of the content
+            content: Raw file bytes to write
+        """
+        target_path = self._get_document_path(file_hash)
+        if os.path.exists(target_path):
+            return
+
+        # exist_ok avoids a race when watcher and workers write concurrently
+        os.makedirs(os.path.dirname(target_path), exist_ok=True)
+
+        with open(target_path, "wb") as f:
+            f.write(content)
+
+    def create_document(
         self,
         file_path: str,
-        file_bytes: bytes,
+        file_bytes: bytes | None = None,
         encoding: str = "utf-8"
     ) -> FileDocument:
         """
@@ -110,57 +162,40 @@ class DocumentService:
             PyMongoError: If database operation fails
         """
         # Calculate automatic attributes
+        file_bytes = file_bytes if file_bytes is not None else self._read_file_bytes(file_path)
         file_hash = self._calculate_file_hash(file_bytes)
         file_type = self._detect_file_type(file_path)
         mime_type = self._detect_mime_type(file_bytes)
         file_size = len(file_bytes)
         filename = Path(file_path).name
-        detected_at = datetime.utcnow()
+        detected_at = datetime.now()
 
-        # Start MongoDB transaction
-        async with await self.db.client.start_session() as session:
-            async with session.start_transaction():
-                try:
-                    # Check if content already exists
-                    existing_content = await self.content_repository.find_document_content_by_file_hash(
-                        file_hash, session=session
-                    )
-
-                    # Create DocumentContent if it doesn't exist
-                    if not existing_content:
-                        content_data = DocumentContent(
-                            file_hash=file_hash,
-                            content="",  # Will be populated by processing workers
-                            encoding=encoding,
-                            file_size=file_size,
-                            mime_type=mime_type
-                        )
-                        await self.content_repository.create_document_content(
-                            content_data, session=session
-                        )
-
-                    # Create FileDocument
-                    file_data = FileDocument(
-                        filename=filename,
-                        filepath=file_path,
-                        file_type=file_type,
-                        extraction_method=None,  # Will be set by processing workers
-                        metadata={},  # Empty for now
-                        detected_at=detected_at,
-                        file_hash=file_hash
-                    )
-
-                    created_file = await self.file_repository.create_document(
-                        file_data, session=session
-                    )
-
-                    return created_file
-
-                except Exception as e:
-                    # Transaction will automatically rollback
-                    raise PyMongoError(f"Failed to create document: {str(e)}")
+        try:
+            self.save_content_if_needed(file_hash, file_bytes)
+
+            # Create FileDocument
+            file_data = FileDocument(
+                filename=filename,
+                filepath=file_path,
+                file_type=file_type,
+                extraction_method=None,  # Will be set by processing workers
+                metadata={},  # Empty for now
+                detected_at=detected_at,
+                file_hash=file_hash,
+                encoding=encoding,
+                file_size=file_size,
+                mime_type=mime_type
+            )
+
+            created_file = self.document_repository.create_document(file_data)
+
+            return created_file
+
+        except Exception as e:
+            # No transaction is used here; surface any failure as PyMongoError
+            raise PyMongoError(f"Failed to create document: {str(e)}")
 
-    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
+    def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
         """
         Retrieve a document by its ID.
@@ -170,9 +205,9 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_id(document_id) + return self.document_repository.find_document_by_id(str(document_id)) - async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: + def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: """ Retrieve a document by its file hash. @@ -182,9 +217,9 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_hash(file_hash) + return self.document_repository.find_document_by_hash(file_hash) - async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: + def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: """ Retrieve a document by its file path. @@ -194,34 +229,17 @@ class DocumentService: Returns: FileDocument if found, None otherwise """ - return await self.file_repository.find_document_by_filepath(filepath) + return self.document_repository.find_document_by_filepath(filepath) - async def get_document_with_content( - self, - document_id: PyObjectId - ) -> Optional[Tuple[FileDocument, DocumentContent]]: - """ - Retrieve a document with its associated content. - - Args: - document_id: Document ObjectId - - Returns: - Tuple of (FileDocument, DocumentContent) if found, None otherwise - """ - document = await self.get_document_by_id(document_id) - if not document: + def get_document_content_by_hash(self, file_hash): + target_path = self._get_document_path(file_hash) + if not os.path.exists(target_path): return None - content = await self.content_repository.find_document_content_by_file_hash( - document.file_hash - ) - if not content: - return None - - return (document, content) + with open(target_path, "rb") as f: + return f.read() - async def list_documents( + def list_documents( self, skip: int = 0, limit: int = 100 @@ -236,18 +254,18 @@ class DocumentService: Returns: List of FileDocument instances """ - return await self.file_repository.list_documents(skip=skip, limit=limit) + return self.document_repository.list_documents(skip=skip, limit=limit) - async def count_documents(self) -> int: + def count_documents(self) -> int: """ Get total number of documents. Returns: Total document count """ - return await self.file_repository.count_documents() + return self.document_repository.count_documents() - async def update_document( + def update_document( self, document_id: PyObjectId, update_data: Dict[str, Any] @@ -262,9 +280,14 @@ class DocumentService: Returns: Updated FileDocument if found, None otherwise """ - return await self.file_repository.update_document(document_id, update_data) + if "file_bytes" in update_data: + file_hash = self._calculate_file_hash(update_data["file_bytes"]) + update_data["file_hash"] = file_hash + self.save_content_if_needed(file_hash, update_data["file_bytes"]) + + return self.document_repository.update_document(document_id, update_data) - async def delete_document(self, document_id: PyObjectId) -> bool: + def delete_document(self, document_id: PyObjectId) -> bool: """ Delete a document and its orphaned content. 
@@ -281,100 +304,31 @@ class DocumentService: Raises: PyMongoError: If database operation fails """ - # Start MongoDB transaction - async with await self.db.client.start_session() as session: - async with session.start_transaction(): + # Start transaction + + try: + # Get document to find its hash + document = self.document_repository.find_document_by_id(document_id) + if not document: + return False + + # Delete the document + deleted = self.document_repository.delete_document(document_id) + if not deleted: + return False + + # Check if content is orphaned + remaining_files = self.document_repository.find_document_by_hash(document.file_hash) + + # If no other files reference this content, delete it + if not remaining_files: try: - # Get document to find its hash - document = await self.file_repository.find_document_by_id( - document_id, session=session - ) - if not document: - return False - - # Delete the document - deleted = await self.file_repository.delete_document( - document_id, session=session - ) - if not deleted: - return False - - # Check if content is orphaned - remaining_files = await self.file_repository.find_document_by_hash( - document.file_hash, session=session - ) - - # If no other files reference this content, delete it - if not remaining_files: - content = await self.content_repository.find_document_content_by_file_hash( - document.file_hash, session=session - ) - if content: - await self.content_repository.delete_document_content( - content.id, session=session - ) - - return True - - except Exception as e: - # Transaction will automatically rollback - raise PyMongoError(f"Failed to delete document: {str(e)}") - - async def content_exists(self, file_hash: str) -> bool: - """ - Check if content with given hash exists. - - Args: - file_hash: SHA256 hash of file content - - Returns: - True if content exists, False otherwise - """ - return await self.content_repository.content_exists(file_hash) - - async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]: - """ - Retrieve content by file hash. - - Args: - file_hash: SHA256 hash of file content - - Returns: - DocumentContent if found, None otherwise - """ - return await self.content_repository.find_document_content_by_file_hash(file_hash) - - async def update_document_content( - self, - file_hash: str, - content: str, - encoding: str = "utf-8" - ) -> Optional[DocumentContent]: - """ - Update the extracted content for a document. - - This method is typically called by processing workers to store - the extracted text content. 
-
-        Args:
-            file_hash: SHA256 hash of file content
-            content: Extracted text content
-            encoding: Character encoding
-
-        Returns:
-            Updated DocumentContent if found, None otherwise
-        """
-        existing_content = await self.content_repository.find_document_content_by_file_hash(
-            file_hash
-        )
-        if not existing_content:
-            return None
+                    # Best-effort cleanup of the stored content blob
+                    os.remove(self._get_document_path(document.file_hash))
+                except Exception:
+                    pass
+
+            return True
 
-        update_data = {
-            "content": content,
-            "encoding": encoding
-        }
-
-        return await self.content_repository.update_document_content(
-            existing_content.id, update_data
-        )
\ No newline at end of file
+        except Exception as e:
+            # No transaction is used here; surface any failure as PyMongoError
+            raise PyMongoError(f"Failed to delete document: {str(e)}")
diff --git a/src/file-processor/app/services/init_service.py b/src/file-processor/app/services/init_service.py
index fd3464f..11de80d 100644
--- a/src/file-processor/app/services/init_service.py
+++ b/src/file-processor/app/services/init_service.py
@@ -8,8 +8,8 @@ creating default admin user if none exists.
 import logging
 from typing import Optional
 
-from app.models.user import UserCreate, UserInDB, UserCreateNoValidation
 from app.models.auth import UserRole
+from app.models.user import UserInDB, UserCreateNoValidation
 from app.services.user_service import UserService
 
 logger = logging.getLogger(__name__)
@@ -31,7 +31,6 @@ class InitializationService:
             user_service (UserService): Service for user operations
         """
         self.user_service = user_service
-        
 
     def ensure_admin_user_exists(self) -> Optional[UserInDB]:
         """
@@ -131,4 +130,23 @@ class InitializationService:
             logger.error(error_msg)
             initialization_summary["errors"].append(error_msg)
 
-        return initialization_summary
\ No newline at end of file
+        self.log_initialization_result(initialization_summary)
+
+        return initialization_summary
+
+    @staticmethod
+    def log_initialization_result(summary: dict) -> None:
+        """
+        Log the result of the initialization process.
+
+        Args:
+            summary (dict): Summary of initialization tasks performed
+        """
+        if summary["initialization_success"]:
+            logger.info("Application startup completed successfully")
+            if summary["admin_user_created"]:
+                logger.info("Default admin user was created during startup")
+        else:
+            logger.error("Application startup completed with errors:")
+            for error in summary["errors"]:
+                logger.error(f" - {error}")
diff --git a/src/file-processor/app/services/job_service.py b/src/file-processor/app/services/job_service.py
new file mode 100644
index 0000000..ff55c6d
--- /dev/null
+++ b/src/file-processor/app/services/job_service.py
@@ -0,0 +1,182 @@
+"""
+Service layer for job processing business logic.
+
+This module provides high-level operations for managing processing jobs
+with strict status transition validation and business rules enforcement.
+"""
+
+from typing import Optional
+
+from app.database.repositories.job_repository import JobRepository
+from app.exceptions.job_exceptions import InvalidStatusTransitionError
+from app.models.job import ProcessingJob, ProcessingStatus
+from app.models.types import PyObjectId
+
+
+class JobService:
+    """
+    Service for processing job business logic operations.
+
+    Provides high-level job management with strict status transition
+    validation and business rule enforcement.
+    """
+
+    def __init__(self, database):
+        """
+        Initialize service with job repository.
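+
+        Typical wiring (illustrative; assumes a pymongo Database instance):
+
+            job_service = JobService(database).initialize()
+            job = job_service.create_job(document_id)
+            job_service.mark_job_as_started(job.id)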
+
+        Args:
+            database: Database instance used to build the JobRepository
+        """
+        self.db = database
+        self.repository = JobRepository(database)
+
+    def initialize(self):
+        """Initialize the underlying repository; returns self for chaining."""
+        self.repository.initialize()
+        return self
+
+    def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
+        """
+        Create a new processing job.
+
+        Args:
+            document_id: Reference to the file document
+            task_id: Optional Celery task UUID
+
+        Returns:
+            The created ProcessingJob
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.create_job(document_id, task_id)
+
+    def get_job_by_id(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Retrieve a job by its ID.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The ProcessingJob document
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.find_job_by_id(job_id)
+
+    def mark_job_as_started(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Mark a job as started (PENDING → PROCESSING).
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PENDING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PENDING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.PROCESSING)
+
+        # Update status
+        return self.repository.update_job_status(job_id, ProcessingStatus.PROCESSING)
+
+    def mark_job_as_completed(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Mark a job as completed (PROCESSING → COMPLETED).
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PROCESSING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PROCESSING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
+
+        # Update status
+        return self.repository.update_job_status(job_id, ProcessingStatus.COMPLETED)
+
+    def mark_job_as_failed(
+        self,
+        job_id: PyObjectId,
+        error_message: Optional[str] = None
+    ) -> ProcessingJob:
+        """
+        Mark a job as failed (PROCESSING → FAILED).
+
+        Args:
+            job_id: The job ObjectId
+            error_message: Optional error description
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PROCESSING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PROCESSING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
+
+        # Update status with error message
+        return self.repository.update_job_status(
+            job_id,
+            ProcessingStatus.FAILED,
+            error_message
+        )
+
+    def delete_job(self, job_id: PyObjectId) -> bool:
+        """
+        Delete a job from the database.
+ + Args: + job_id: The job ObjectId + + Returns: + True if job was deleted, False if not found + + Raises: + JobRepositoryError: If database operation fails + """ + return self.repository.delete_job(job_id) + + def get_jobs_by_status(self, status: ProcessingStatus) -> list[ProcessingJob]: + """ + Retrieve all jobs with a specific status. + + Args: + status: The processing status to filter by + + Returns: + List of ProcessingJob documents + + Raises: + JobRepositoryError: If database operation fails + """ + return self.repository.get_jobs_by_status(status) diff --git a/src/file-processor/app/services/user_service.py b/src/file-processor/app/services/user_service.py index de9fcef..ffb93e5 100644 --- a/src/file-processor/app/services/user_service.py +++ b/src/file-processor/app/services/user_service.py @@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling. """ from typing import Optional, List + from pymongo.errors import DuplicateKeyError -from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation -from app.models.auth import UserRole from app.database.repositories.user_repository import UserRepository +from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation from app.services.auth_service import AuthService @@ -22,16 +22,21 @@ class UserService: authentication, and data management with proper validation. """ - def __init__(self, user_repository: UserRepository): + def __init__(self, database): """ Initialize user service with repository dependency. Args: user_repository (UserRepository): Repository for user data operations """ - self.user_repository = user_repository + self.db = database + self.user_repository = UserRepository(self.db) self.auth_service = AuthService() + def initialize(self): + self.user_repository.initialize() + return self + def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB: """ Create a new user with business logic validation. diff --git a/src/file-processor/requirements.txt b/src/file-processor/requirements.txt index 8b4b465..5198e6f 100644 --- a/src/file-processor/requirements.txt +++ b/src/file-processor/requirements.txt @@ -1,11 +1,14 @@ +asgiref==3.9.1 bcrypt==4.3.0 celery==5.5.3 email-validator==2.3.0 fastapi==0.116.1 httptools==0.6.4 motor==3.7.1 -pymongo==4.15.0 pydantic==2.11.9 +PyJWT==2.10.1 +pymongo==4.15.0 redis==6.4.0 uvicorn==0.35.0 -python-magic==0.4.27 \ No newline at end of file +python-magic==0.4.27 +watchdog==6.0.0 \ No newline at end of file diff --git a/src/frontend/.dockerignore b/src/frontend/.dockerignore new file mode 100644 index 0000000..dd262eb --- /dev/null +++ b/src/frontend/.dockerignore @@ -0,0 +1,41 @@ +# Dependencies +node_modules +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Build outputs +dist +build + +# Environment files +.env.local +.env.development.local +.env.test.local +.env.production.local + +# IDE files +.vscode +.idea +*.swp +*.swo + +# OS generated files +.DS_Store +.DS_Store? 
+._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Git +.git +.gitignore + +# Docker +Dockerfile +.dockerignore + +# Logs +*.log \ No newline at end of file diff --git a/src/frontend/Dockerfile b/src/frontend/Dockerfile new file mode 100644 index 0000000..95c9fb2 --- /dev/null +++ b/src/frontend/Dockerfile @@ -0,0 +1,20 @@ +# Use Node.js 20 Alpine for lightweight container +FROM node:20-alpine + +# Set working directory +WORKDIR /app + +# Copy package.json and package-lock.json (if available) +COPY package*.json ./ + +# Install dependencies +RUN npm install + +# Copy source code +COPY . . + +# Expose Vite default port +EXPOSE 5173 + +# Start development server with host 0.0.0.0 to accept external connections +CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"] \ No newline at end of file diff --git a/src/worker/Dockerfile b/src/worker/Dockerfile index 8723a3e..fe7d573 100644 --- a/src/worker/Dockerfile +++ b/src/worker/Dockerfile @@ -3,12 +3,18 @@ FROM python:3.12-slim # Set working directory WORKDIR /app +# Install libmagic +RUN apt-get update && apt-get install -y --no-install-recommends \ + libmagic1 \ + file \ + && rm -rf /var/lib/apt/lists/* + # Copy requirements and install dependencies COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Copy application code -COPY tasks/ . +COPY . . # Command will be overridden by docker-compose -CMD ["celery", "-A", "main", "worker", "--loglevel=info"] \ No newline at end of file +CMD ["celery", "-A", "main", "worker", "--loglevel=info"] diff --git a/src/worker/requirements.txt b/src/worker/requirements.txt index af2f3cd..b44281a 100644 --- a/src/worker/requirements.txt +++ b/src/worker/requirements.txt @@ -1,4 +1,13 @@ - +asgiref==3.9.1 +bcrypt==4.3.0 celery==5.5.3 +email-validator==2.3.0 +fastapi==0.116.1 +httptools==0.6.4 +motor==3.7.1 +pymongo==4.15.0 +pydantic==2.11.9 redis==6.4.0 -pymongo==4.15.0 \ No newline at end of file +uvicorn==0.35.0 +python-magic==0.4.27 +watchdog==6.0.0 \ No newline at end of file diff --git a/src/worker/tasks/document_processing.py b/src/worker/tasks/document_processing.py new file mode 100644 index 0000000..f71c3e3 --- /dev/null +++ b/src/worker/tasks/document_processing.py @@ -0,0 +1,85 @@ +""" +Celery tasks for document processing with ProcessingJob status management. + +This module contains Celery tasks that handle document content extraction +and update processing job statuses throughout the task lifecycle. +""" + +import logging +from typing import Any, Dict + +from app.config import settings +from app.database.connection import get_database +from app.services.document_service import DocumentService +from tasks.main import celery_app + +logger = logging.getLogger(__name__) + +@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) +def process_document(self, filepath: str) -> Dict[str, Any]: + """ + Process a document file and extract its content. + + This task: + 1. Updates the processing job status to PROCESSING + 2. Performs document content extraction + 3. 
Updates job status to COMPLETED or FAILED based on result
+
+    Args:
+        self: Celery task instance
+        filepath: Full path to the document file to process
+
+    Returns:
+        Dictionary containing processing results
+
+    Raises:
+        Exception: Any processing error (will trigger retry)
+    """
+    task_id = self.request.id
+    logger.info(f"Starting document processing task {task_id} for file: {filepath}")
+
+    database = get_database()
+    document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
+    from app.services.job_service import JobService
+    job_service = JobService(database=database)
+
+    job = None
+    try:
+        # Step 1: Insert the document in DB
+        document = document_service.create_document(filepath)
+        logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
+
+        # Step 2: Create a new job record for the document
+        job = job_service.create_job(task_id=task_id, document_id=document.id)
+
+        # Step 3: Mark job as started
+        job_service.mark_job_as_started(job_id=job.id)
+        logger.info(f"Job {task_id} marked as PROCESSING")
+
+        # NOTE: content extraction is not implemented yet; the job is marked
+        # completed as soon as the document record exists.
+
+        # Step 4: Mark job as completed
+        job_service.mark_job_as_completed(job_id=job.id)
+        logger.info(f"Job {task_id} marked as COMPLETED")
+
+        return {
+            "task_id": task_id,
+            "filepath": filepath,
+            "status": "completed",
+        }
+
+    except Exception as e:
+        error_message = f"Document processing failed: {str(e)}"
+        logger.error(f"Task {task_id} failed: {error_message}")
+
+        try:
+            # Mark job as failed
+            if job is not None:
+                job_service.mark_job_as_failed(job_id=job.id, error_message=error_message)
+                logger.info(f"Job {task_id} marked as FAILED")
+            else:
+                logger.error(f"Failed to process {filepath}. error = {str(e)}")
+        except Exception as job_error:
+            logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
+
+        # Re-raise the exception to trigger Celery retry mechanism
+        raise
diff --git a/src/worker/tasks/main.py b/src/worker/tasks/main.py
index 63b2c5d..f76c202 100644
--- a/src/worker/tasks/main.py
+++ b/src/worker/tasks/main.py
@@ -3,9 +3,8 @@ Celery worker for MyDocManager document processing tasks.
 
 This module contains all Celery tasks for processing documents.
 """
-
 import os
-import time
+
 from celery import Celery
 
 # Environment variables
@@ -13,101 +12,25 @@ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
 MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
 
 # Initialize Celery app
-app = Celery(
+celery_app = Celery(
     "mydocmanager_worker",
     broker=REDIS_URL,
-    backend=REDIS_URL
+    backend=REDIS_URL,
 )
 
+celery_app.autodiscover_tasks(["tasks.document_processing"])
+
 # Celery configuration
-app.conf.update(
+celery_app.conf.update(
     task_serializer="json",
     accept_content=["json"],
     result_serializer="json",
     timezone="UTC",
     enable_utc=True,
     task_track_started=True,
-    task_time_limit=300,  # 5 minutes
-    task_soft_time_limit=240,  # 4 minutes
+    task_time_limit=300,  # 5 minutes
+    task_soft_time_limit=240,  # 4 minutes
 )
-
-@app.task(bind=True)
-def test_task(self, message: str):
-    """
-    Test task for validating worker functionality.
- - Args: - message: Test message to process - - Returns: - dict: Task result with processing information - """ - try: - print(f"[WORKER] Starting test task with message: {message}") - - # Simulate some work - for i in range(5): - print(f"[WORKER] Processing step {i + 1}/5...") - time.sleep(1) - - # Update task progress - self.update_state( - state="PROGRESS", - meta={ - "current": i + 1, - "total": 5, - "message": f"Processing step {i + 1}" - } - ) - - result = { - "status": "completed", - "message": f"Successfully processed: {message}", - "processed_at": time.time(), - "worker_id": self.request.id - } - - print(f"[WORKER] Test task completed successfully: {result}") - return result - - except Exception as exc: - print(f"[WORKER] Test task failed: {str(exc)}") - raise self.retry(exc=exc, countdown=60, max_retries=3) - - -@app.task(bind=True) -def process_document_task(self, file_path: str): - """ - Placeholder task for document processing. - - Args: - file_path: Path to the document to process - - Returns: - dict: Processing result - """ - try: - print(f"[WORKER] Starting document processing for: {file_path}") - - # Placeholder for document processing logic - time.sleep(2) # Simulate processing time - - result = { - "status": "completed", - "file_path": file_path, - "processed_at": time.time(), - "content": f"Placeholder content for {file_path}", - "worker_id": self.request.id - } - - print(f"[WORKER] Document processing completed: {file_path}") - return result - - except Exception as exc: - print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}") - raise self.retry(exc=exc, countdown=60, max_retries=3) - - if __name__ == "__main__": - app.start() \ No newline at end of file + celery_app.start() diff --git a/tests/api/__init__.py b/tests/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/api/test_auth_routes.py b/tests/api/test_auth_routes.py new file mode 100644 index 0000000..7345161 --- /dev/null +++ b/tests/api/test_auth_routes.py @@ -0,0 +1,149 @@ +from datetime import datetime +from unittest.mock import MagicMock + +import pytest +from fastapi import status, HTTPException +from fastapi.testclient import TestClient +from mongomock.mongo_client import MongoClient + +from app.api.dependencies import get_auth_service, get_user_service, get_current_user +from app.main import app # Assuming you have FastAPI app defined in app/main.py +from app.models.auth import UserRole +from app.models.types import PyObjectId +from app.models.user import UserInDB +from app.services.auth_service import AuthService +from app.services.user_service import UserService + + +@pytest.fixture +def client(): + return TestClient(app) + + +@pytest.fixture +def fake_user(): + return UserInDB( + _id=PyObjectId(), + username="testuser", + email="test@example.com", + role=UserRole.USER, + is_active=True, + hashed_password="hashed-secret", + created_at=datetime(2025, 1, 1), + updated_at=datetime(2025, 1, 2), + ) + + +def override_auth_service(): + mock = MagicMock(spec=AuthService) + mock.verify_user_password.return_value = True + mock.create_access_token.return_value = "fake-jwt-token" + return mock + + +def override_user_service(fake_user): + mock = MagicMock(spec=UserService) + mock.get_user_by_username.return_value = fake_user + return mock + + +def override_get_current_user(fake_user): + def _override(): + return fake_user + + return _override + + +def override_get_database(): + def _override(): + client = MongoClient() + db = client.test_database + return db + + return 
_override


+# ---------------------- TESTS FOR /auth/login ----------------------
+class TestLogin:
+    def test_i_can_login_with_valid_credentials(self, client, fake_user):
+        auth_service = override_auth_service()
+        user_service = override_user_service(fake_user)
+
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert "access_token" in data
+        assert data["user"]["username"] == "testuser"
+
+    def test_i_cannot_login_with_invalid_username(self, client):
+        auth_service = override_auth_service()
+        user_service = MagicMock(spec=UserService)
+        user_service.get_user_by_username.return_value = None
+
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "unknown", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+    def test_i_cannot_login_with_inactive_user(self, client, fake_user):
+        fake_user.is_active = False
+        auth_service = override_auth_service()
+        user_service = override_user_service(fake_user)
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+    def test_i_cannot_login_with_wrong_password(self, client, fake_user):
+        auth_service = override_auth_service()
+        auth_service.verify_user_password.return_value = False
+        user_service = override_user_service(fake_user)
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "wrong"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+
+# ---------------------- TESTS FOR /auth/me ----------------------
+class TestMe:
+    def test_i_can_get_current_user_profile(self, client, fake_user):
+        client.app.dependency_overrides[get_current_user] = override_get_current_user(fake_user)
+
+        response = client.get("/auth/me")
+
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user.username
+        assert data["email"] == fake_user.email
+
+    def test_i_cannot_get_profile_without_authentication(self, client):
+        def raise_http_exception():
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)
+
+        client.app.dependency_overrides[get_current_user] = raise_http_exception
+
+        response = client.get("/auth/me")
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
diff --git a/tests/api/test_users.py b/tests/api/test_users.py
new file mode 100644
index 0000000..8e359bf
--- /dev/null
+++ b/tests/api/test_users.py
@@ -0,0 +1,167 @@
+# File: tests/api/test_users.py
+from datetime import datetime
+from unittest.mock import MagicMock
+
+import pytest
+from fastapi import status
+from fastapi.testclient import TestClient
+
+from app.api.dependencies import get_admin_user, get_user_service
+from app.main import app
+from app.models.auth import UserRole
+from app.models.types import PyObjectId
+from app.models.user import UserInDB, UserCreate
+from app.services.user_service import UserService
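+
+
+# NOTE: these tests swap real dependencies for mocks through FastAPI's
+# dependency_overrides mapping, keyed by the original dependency callable, e.g.:
+#
+#     client.app.dependency_overrides[get_user_service] = lambda: user_service_mock
+#
+# so no running MongoDB is needed; each route resolves the mock instead of the
+# real service for the duration of the test.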
+
+
+# -----------------------
+# Fixtures
+# -----------------------
+
+@pytest.fixture
+def fake_user_admin():
+    return UserInDB(
+        _id=PyObjectId(),
+        username="admin",
+        email="admin@example.com",
+        role=UserRole.ADMIN,
+        is_active=True,
+        hashed_password="hashed-secret",
+        created_at=datetime(2025, 1, 1),
+        updated_at=datetime(2025, 1, 2),
+    )
+
+
+@pytest.fixture
+def fake_user_response():
+    return UserInDB(
+        _id=PyObjectId(),
+        username="other",
+        email="other@example.com",
+        role=UserRole.USER,
+        is_active=True,
+        hashed_password="hashed-secret-2",
+        created_at=datetime(2025, 1, 1),
+        updated_at=datetime(2025, 1, 2),
+    )
+
+
+@pytest.fixture
+def client(fake_user_admin):
+    # Fake admin dependency
+    def get_admin_user_override():
+        return fake_user_admin
+
+    # Fake user service
+    user_service_mock = MagicMock(spec=UserService)
+
+    def get_user_service_override():
+        return user_service_mock
+
+    client = TestClient(app)
+    client.app.dependency_overrides = {
+        get_admin_user: get_admin_user_override,
+        get_user_service: get_user_service_override
+    }
+
+    client.user_service_mock = user_service_mock
+    return client
+
+
+# -----------------------
+# Tests
+# -----------------------
+
+class TestListUsers:
+
+    def test_i_can_list_users(self, client, fake_user_admin, fake_user_response):
+        client.user_service_mock.list_users.return_value = [fake_user_admin, fake_user_response]
+        response = client.get("/users")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert len(data) == 2
+        assert data[0]["username"] == "admin"
+
+    def test_i_can_list_users_when_empty(self, client):
+        client.user_service_mock.list_users.return_value = []
+        response = client.get("/users")
+        assert response.status_code == status.HTTP_200_OK
+        assert response.json() == []
+
+
+class TestGetUserById:
+
+    def test_i_can_get_user_by_id(self, client, fake_user_response):
+        client.user_service_mock.get_user_by_id.return_value = fake_user_response
+        response = client.get(f"/users/{fake_user_response.id}")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_get_user_by_id_not_found(self, client):
+        client.user_service_mock.get_user_by_id.return_value = None
+        response = client.get("/users/64f0c9f4b0d1c8b7b8e1f0a2")
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
+
+
+class TestCreateUser:
+
+    def test_i_can_create_user(self, client, fake_user_response):
+        user_data = UserCreate(username="newuser",
+                               email="new@example.com",
+                               password="#Passw0rd!",
+                               role=UserRole.USER)
+
+        client.user_service_mock.create_user.return_value = fake_user_response
+        response = client.post("/users", json=user_data.model_dump(mode="json"))
+        assert response.status_code == status.HTTP_201_CREATED
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_create_user_when_service_raises_value_error(self, client):
+        # The weak password most likely fails UserCreate request validation, so
+        # FastAPI returns 422 before the mocked service (and its side_effect)
+        # is ever reached.
+        user_data = {"username": "baduser", "email": "bad@example.com", "role": "user", "password": "password"}
+        client.user_service_mock.create_user.side_effect = ValueError("Invalid data")
+        response = client.post("/users", json=user_data)
+        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
+
+
+class TestUpdateUser:
+
+    def test_i_can_update_user(self, client, fake_user_response):
+        user_data = {"username": "updateduser", "email": "updated@example.com"}
+        client.user_service_mock.update_user.return_value = fake_user_response
+        response = client.put(f"/users/{fake_user_response.id}", json=user_data)
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_update_user_not_found(self, client):
+        client.user_service_mock.update_user.return_value = None
+        user_data = {"username": "updateduser"}
+        response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
+
+    def test_i_cannot_update_user_when_service_raises_value_error(self, client):
+        client.user_service_mock.update_user.side_effect = ValueError("Invalid update")
+        user_data = {"username": "badupdate"}
+        response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
+        assert response.status_code == status.HTTP_400_BAD_REQUEST
+        assert response.json()["detail"] == "Invalid update"
+
+
+class TestDeleteUser:
+
+    def test_i_can_delete_user(self, client):
+        client.user_service_mock.delete_user.return_value = True
+        response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a1")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["message"] == "User successfully deleted"
+
+    def test_i_cannot_delete_user_not_found(self, client):
+        client.user_service_mock.delete_user.return_value = False
+        response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a2")
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
diff --git a/tests/database/__init__.py b/tests/database/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_user_models.py b/tests/models/test_user_models.py
similarity index 95%
rename from tests/test_user_models.py
rename to tests/models/test_user_models.py
index a11ff1e..d199fa4 100644
--- a/tests/test_user_models.py
+++ b/tests/models/test_user_models.py
@@ -10,8 +10,8 @@ from pydantic import ValidationError
 from datetime import datetime
 from bson import ObjectId
 
-from app.models.user import UserCreate, UserUpdate, UserInDB, UserResponse
-from app.models.auth import UserRole
+from app.models.user import UserCreate, UserUpdate, UserInDB
+from app.models.auth import UserRole, UserResponse
 
 
 class TestUserCreateModel:
@@ -349,7 +349,7 @@ class TestUserResponseModel:
 
         # Convert to response model (excluding password_hash)
         user_response = UserResponse(
-            id=user_in_db.id,
+            _id=user_in_db.id,
             username=user_in_db.username,
             email=user_in_db.email,
             role=user_in_db.role,
diff --git a/tests/repositories/__init__.py b/tests/repositories/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/repositories/test_document_repository.py b/tests/repositories/test_document_repository.py
new file mode 100644
index 0000000..1eff3fd
--- /dev/null
+++ b/tests/repositories/test_document_repository.py
@@ -0,0 +1,611 @@
+"""
+Test suite for FileDocumentRepository with synchronous support.
+
+This module contains comprehensive tests for all FileDocumentRepository methods
+using mongomock for in-memory MongoDB testing.
+""" + +from datetime import datetime + +import pytest +from bson import ObjectId +from mongomock.mongo_client import MongoClient +from pymongo.errors import PyMongoError + +from app.database.repositories.document_repository import ( + FileDocumentRepository, + MatchMethodBase, + SubsequenceMatching, + FuzzyMatching +) +from app.models.document import FileDocument, FileType, ExtractionMethod + + +@pytest.fixture +def in_memory_repository(): + """Create an in-memory FileDocumentRepository for testing.""" + client = MongoClient() + db = client.test_database + repo = FileDocumentRepository(db) + repo.initialize() + return repo + + +@pytest.fixture +def sample_file_document(): + """Sample FileDocument data for testing.""" + return FileDocument( + filename="sample_document.pdf", + filepath="/home/user/documents/sample_document.pdf", + file_type=FileType.PDF, + extraction_method=ExtractionMethod.OCR, + metadata={"pages": 5, "language": "en", "author": "John Doe"}, + detected_at=datetime.now(), + file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", + encoding="utf-8", + file_size=1024000, + mime_type="application/pdf" + ) + + +@pytest.fixture +def sample_update_data(): + """Sample update data for testing.""" + return { + "extraction_method": ExtractionMethod.HYBRID, + "metadata": {"pages": 10, "language": "fr", "updated": True}, + "file_size": 2048000 + } + + +@pytest.fixture +def multiple_sample_files(): + """Multiple FileDocument objects for list/search testing.""" + base_time = datetime.now() + return [ + FileDocument( + filename="first_doc.txt", + filepath="/docs/first_doc.txt", + file_type=FileType.TXT, + extraction_method=ExtractionMethod.DIRECT_TEXT, + metadata={"words": 500}, + detected_at=base_time, + file_hash="hash1" + "0" * 58, + encoding="utf-8", + file_size=5000, + mime_type="text/plain" + ), + FileDocument( + filename="second_document.pdf", + filepath="/docs/second_document.pdf", + file_type=FileType.PDF, + extraction_method=ExtractionMethod.OCR, + metadata={"pages": 8}, + detected_at=base_time, + file_hash="hash2" + "0" * 58, + encoding="utf-8", + file_size=10000, + mime_type="application/pdf" + ), + FileDocument( + filename="third_file.docx", + filepath="/docs/third_file.docx", + file_type=FileType.DOCX, + extraction_method=ExtractionMethod.HYBRID, + metadata={"paragraphs": 15}, + detected_at=base_time, + file_hash="hash3" + "0" * 58, + encoding="utf-8", + file_size=15000, + mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + ] + + +class TestFileDocumentRepositoryInitialization: + """Tests for repository initialization.""" + + def test_i_can_initialize_repository(self): + """Test repository initialization.""" + # Arrange + client = MongoClient() + db = client.test_database + repo = FileDocumentRepository(db) + repo.initialize() + + # Act & Assert (should not raise any exception) + assert repo.db is not None + assert repo.collection is not None + # TODO : check that the indexes are created + + +class TestFileDocumentRepositoryCreation: + """Tests for file document creation functionality.""" + + def test_i_can_create_file_document(self, in_memory_repository, sample_file_document): + """Test successful file document creation.""" + # Act + created_file = in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_file is not None + assert created_file.filename == sample_file_document.filename + assert created_file.filepath == sample_file_document.filepath + assert created_file.file_type == 
sample_file_document.file_type + assert created_file.extraction_method == sample_file_document.extraction_method + assert created_file.metadata == sample_file_document.metadata + assert created_file.file_hash == sample_file_document.file_hash + assert created_file.file_size == sample_file_document.file_size + assert created_file.mime_type == sample_file_document.mime_type + assert created_file.id is not None + assert isinstance(created_file.id, ObjectId) + + def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document): + """Test creating file document with _id set to None (should be removed).""" + # Arrange + sample_file_document.id = None + + # Act + created_file = in_memory_repository.create_document(sample_file_document) + + # Assert + assert created_file is not None + assert created_file.id is not None + assert isinstance(created_file.id, ObjectId) + + def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository, + sample_file_document, mocker): + """Test handling of PyMongo errors during file document creation.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error")) + + # Act & Assert + with pytest.raises(ValueError) as exc_info: + in_memory_repository.create_document(sample_file_document) + + assert "Failed to create file document" in str(exc_info.value) + + +class TestFileDocumentRepositoryFinding: + """Tests for file document finding functionality.""" + + def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document): + """Test finding file document by valid ObjectId.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + + # Act + found_file = in_memory_repository.find_document_by_id(str(created_file.id)) + + # Assert + assert found_file is not None + assert found_file.id == created_file.id + assert found_file.filename == created_file.filename + assert found_file.filepath == created_file.filepath + + def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository): + """Test that invalid ObjectId returns None.""" + # Act + found_file = in_memory_repository.find_document_by_id("invalid_id") + + # Assert + assert found_file is None + + def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository): + """Test that nonexistent but valid ObjectId returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + found_file = in_memory_repository.find_document_by_id(nonexistent_id) + + # Assert + assert found_file is None + + def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document): + """Test finding file document by file hash.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + + # Act + found_file = in_memory_repository.find_document_by_hash(sample_file_document.file_hash) + + # Assert + assert found_file is not None + assert found_file.file_hash == created_file.file_hash + assert found_file.id == created_file.id + + def test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository): + """Test that nonexistent file hash returns None.""" + # Act + found_file = in_memory_repository.find_document_by_hash("nonexistent_hash") + + # Assert + assert found_file is None + + def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document): + """Test finding file document by filepath.""" + # Arrange + created_file = 
in_memory_repository.create_document(sample_file_document)
+
+        # Act
+        found_file = in_memory_repository.find_document_by_filepath(sample_file_document.filepath)
+
+        # Assert
+        assert found_file is not None
+        assert found_file.filepath == created_file.filepath
+        assert found_file.id == created_file.id
+
+    def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
+        """Test that nonexistent filepath returns None."""
+        # Act
+        found_file = in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf")
+
+        # Assert
+        assert found_file is None
+
+    def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during file document finding."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))
+
+        # Act
+        found_file = in_memory_repository.find_document_by_hash("test_hash")
+
+        # Assert
+        assert found_file is None
+
+
+# The matching strategies below are injected into find_document_by_name();
+# FuzzyMatching appears to rank filenames by a similarity ratio against its
+# threshold, while SubsequenceMatching appears to accept filenames containing
+# the query characters in order (both inferred from how the tests use them).
+class TestFileDocumentRepositoryNameMatching:
+    """Tests for file document name matching functionality."""
+
+    def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files):
+        """Test finding file documents by filename using fuzzy matching."""
+        # Arrange
+        for file_doc in multiple_sample_files:
+            in_memory_repository.create_document(file_doc)
+
+        # Act
+        fuzzy_method = FuzzyMatching(threshold=0.5)
+        found_files = in_memory_repository.find_document_by_name("document", fuzzy_method)
+
+        # Assert
+        assert len(found_files) >= 1
+        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
+        # Should find files with "document" in the name
+        found_filenames = [f.filename for f in found_files]
+        assert any("document" in fname.lower() for fname in found_filenames)
+
+    def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository,
+                                                                    multiple_sample_files):
+        """Test finding file documents by filename using subsequence matching."""
+        # Arrange
+        for file_doc in multiple_sample_files:
+            in_memory_repository.create_document(file_doc)
+
+        # Act
+        subsequence_method = SubsequenceMatching()
+        found_files = in_memory_repository.find_document_by_name("doc", subsequence_method)
+
+        # Assert
+        assert len(found_files) >= 1
+        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
+
+    def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files):
+        """Smoke test: the default matching method runs without raising."""
+        # Arrange
+        for file_doc in multiple_sample_files:
+            in_memory_repository.create_document(file_doc)
+
+        # Act
+        found_files = in_memory_repository.find_document_by_name("first")
+
+        # Assert (result size depends on the default method's threshold, so
+        # only the result types are checked here)
+        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
+
+    def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during document name matching."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
+
+        # Act
+        found_files = in_memory_repository.find_document_by_name("test")
+
+        # Assert
+        assert found_files == []
+
+
+class TestFileDocumentRepositoryListing:
+    """Tests for file document listing functionality."""
+
+    def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files):
+        """Test listing file documents with default 
pagination.""" + # Arrange + for file_doc in multiple_sample_files: + in_memory_repository.create_document(file_doc) + + # Act + files = in_memory_repository.list_documents() + + # Assert + assert len(files) == len(multiple_sample_files) + assert all(isinstance(file_doc, FileDocument) for file_doc in files) + + def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files): + """Test listing file documents with custom pagination.""" + # Arrange + for file_doc in multiple_sample_files: + in_memory_repository.create_document(file_doc) + + # Act + files_page1 = in_memory_repository.list_documents(skip=0, limit=2) + files_page2 = in_memory_repository.list_documents(skip=2, limit=2) + + # Assert + assert len(files_page1) == 2 + assert len(files_page2) == 1 # Only 3 total files + + # Ensure no overlap between pages + page1_ids = [file_doc.id for file_doc in files_page1] + page2_ids = [file_doc.id for file_doc in files_page2] + assert len(set(page1_ids).intersection(set(page2_ids))) == 0 + + def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document): + """Test that file documents are sorted by detected_at in descending order.""" + # Arrange + file1 = sample_file_document.model_copy() + file1.filepath = "/docs/file1.pdf" + file1.filename = "file1.pdf" + file1.file_hash = "hash1" + "0" * 58 + file1.detected_at = datetime(2024, 1, 1, 10, 0, 0) + + file2 = sample_file_document.model_copy() + file2.filepath = "/docs/file2.pdf" + file2.filename = "file2.pdf" + file2.file_hash = "hash2" + "0" * 58 + file2.detected_at = datetime(2024, 1, 2, 10, 0, 0) # Later date + + created_file1 = in_memory_repository.create_document(file1) + created_file2 = in_memory_repository.create_document(file2) + + # Act + files = in_memory_repository.list_documents() + + # Assert + assert len(files) == 2 + # Most recent (latest detected_at) should be first + assert files[0].id == created_file2.id + assert files[1].id == created_file1.id + + def test_i_can_list_empty_documents(self, in_memory_repository): + """Test listing file documents from empty collection.""" + # Act + files = in_memory_repository.list_documents() + + # Assert + assert files == [] + + def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker): + """Test handling of PyMongo errors during file document listing.""" + # Arrange + mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) + + # Act + files = in_memory_repository.list_documents() + + # Assert + assert files == [] + + +class TestFileDocumentRepositoryUpdate: + """Tests for file document update functionality.""" + + def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document, + sample_update_data): + """Test successful file document update.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + + # Act + updated_file = in_memory_repository.update_document(str(created_file.id), sample_update_data) + + # Assert + assert updated_file is not None + assert updated_file.extraction_method == sample_update_data["extraction_method"] + assert updated_file.metadata == sample_update_data["metadata"] + assert updated_file.file_size == sample_update_data["file_size"] + assert updated_file.id == created_file.id + assert updated_file.filename == created_file.filename # Unchanged fields remain + assert updated_file.filepath == created_file.filepath + + def 
test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document): + """Test updating file document with partial data.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + partial_update = {"file_size": 999999} + + # Act + updated_file = in_memory_repository.update_document(str(created_file.id), partial_update) + + # Assert + assert updated_file is not None + assert updated_file.file_size == 999999 + assert updated_file.filename == created_file.filename # Should remain unchanged + assert updated_file.metadata == created_file.metadata # Should remain unchanged + + def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document): + """Test that None values are filtered out from update data.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + update_with_none = {"file_size": 777777, "metadata": None} + + # Act + updated_file = in_memory_repository.update_document(str(created_file.id), update_with_none) + + # Assert + assert updated_file is not None + assert updated_file.file_size == 777777 + assert updated_file.metadata == created_file.metadata # Should remain unchanged (None filtered out) + + def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document): + """Test updating file document with empty data returns current document.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + empty_update = {} + + # Act + result = in_memory_repository.update_document(str(created_file.id), empty_update) + + # Assert + assert result is not None + assert result.filename == created_file.filename + assert result.filepath == created_file.filepath + assert result.metadata == created_file.metadata + + def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data): + """Test that updating with invalid ID returns None.""" + # Act + result = in_memory_repository.update_document("invalid_id", sample_update_data) + + # Assert + assert result is None + + def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data): + """Test that updating nonexistent file document returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = in_memory_repository.update_document(nonexistent_id, sample_update_data) + + # Assert + assert result is None + + def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document, + sample_update_data, mocker): + """Test handling of PyMongo errors during file document update.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', + side_effect=PyMongoError("Database error")) + + # Act + result = in_memory_repository.update_document(str(created_file.id), sample_update_data) + + # Assert + assert result is None + + +class TestFileDocumentRepositoryDeletion: + """Tests for file document deletion functionality.""" + + def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document): + """Test successful file document deletion.""" + # Arrange + created_file = in_memory_repository.create_document(sample_file_document) + + # Act + deletion_result = in_memory_repository.delete_document(str(created_file.id)) + + # Assert + assert deletion_result is True + + # Verify document is actually deleted + found_file = 
in_memory_repository.find_document_by_id(str(created_file.id))
+        assert found_file is None
+
+    def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
+        """Test that deleting with invalid ID returns False."""
+        # Act
+        result = in_memory_repository.delete_document("invalid_id")
+
+        # Assert
+        assert result is False
+
+    def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
+        """Test that deleting nonexistent file document returns False."""
+        # Arrange
+        nonexistent_id = str(ObjectId())
+
+        # Act
+        result = in_memory_repository.delete_document(nonexistent_id)
+
+        # Assert
+        assert result is False
+
+    def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
+        """Test handling of PyMongo errors during file document deletion."""
+        # Arrange
+        created_file = in_memory_repository.create_document(sample_file_document)
+        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))
+
+        # Act
+        result = in_memory_repository.delete_document(str(created_file.id))
+
+        # Assert
+        assert result is False
+
+
+class TestFileDocumentRepositoryUtilities:
+    """Tests for utility methods."""
+
+    def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
+        """Test counting file documents."""
+        # Arrange
+        initial_count = in_memory_repository.count_documents()
+        in_memory_repository.create_document(sample_file_document)
+
+        # Act
+        final_count = in_memory_repository.count_documents()
+
+        # Assert
+        assert final_count == initial_count + 1
+
+    def test_i_can_count_zero_documents(self, in_memory_repository):
+        """Test counting file documents in empty collection."""
+        # Act
+        count = in_memory_repository.count_documents()
+
+        # Assert
+        assert count == 0
+
+    def test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during file document counting."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'count_documents', side_effect=PyMongoError("Database error"))
+
+        # Act
+        count = in_memory_repository.count_documents()
+
+        # Assert
+        assert count == 0
+
+
+class TestMatchingMethods:
+    """Tests for matching method classes."""
+
+    def test_i_can_create_fuzzy_matching_with_default_threshold(self):
+        """Test creating FuzzyMatching with default threshold."""
+        # Act
+        fuzzy = FuzzyMatching()
+
+        # Assert
+        assert fuzzy.threshold == 0.6
+
+    def test_i_can_create_fuzzy_matching_with_custom_threshold(self):
+        """Test creating FuzzyMatching with custom threshold."""
+        # Act
+        fuzzy = FuzzyMatching(threshold=0.8)
+
+        # Assert
+        assert fuzzy.threshold == 0.8
+
+    def test_i_can_create_subsequence_matching(self):
+        """Test creating SubsequenceMatching."""
+        # Act
+        subsequence = SubsequenceMatching()
+
+        # Assert
+        assert isinstance(subsequence, MatchMethodBase)
+        assert isinstance(subsequence, SubsequenceMatching)
diff --git a/tests/repositories/test_job_repository.py b/tests/repositories/test_job_repository.py
new file mode 100644
index 0000000..0b4ba64
--- /dev/null
+++ b/tests/repositories/test_job_repository.py
@@ -0,0 +1,496 @@
+"""
+Test suite for JobRepository with synchronous support.
+
+This module contains comprehensive tests for all JobRepository methods
+using mongomock for in-memory MongoDB testing.
+"""
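+
+# NOTE (assumptions inferred from the tests below, not shown in this diff):
+# JobRepository presumably enforces one job per document via a unique index on
+# document_id, and wraps raw PyMongoError failures in JobRepositoryError tagged
+# with the failing operation name, roughly:
+#
+#     self.collection.create_index("document_id", unique=True)
+#     ...
+#     except PyMongoError as exc:
+#         raise JobRepositoryError("create_job") from exc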
+""" + +from datetime import datetime + +import pytest +from bson import ObjectId +from mongomock.mongo_client import MongoClient +from mongomock_motor import AsyncMongoMockClient +from pymongo.errors import PyMongoError + +from app.database.repositories.job_repository import JobRepository +from app.exceptions.job_exceptions import JobRepositoryError +from app.models.job import ProcessingJob, ProcessingStatus +from app.models.types import PyObjectId + + +@pytest.fixture +def in_memory_repository(): + """Create an in-memory JobRepository for testing.""" + client = MongoClient() + db = client.test_database + repo = JobRepository(db) + repo.initialize() + return repo + + +@pytest.fixture +def sample_document_id(): + """Sample document ObjectId for testing.""" + return PyObjectId() + + +@pytest.fixture +def sample_task_id(): + """Sample Celery task ID for testing.""" + return "celery-task-12345-abcde" + + +@pytest.fixture +def multiple_sample_jobs(): + """Multiple ProcessingJob objects for testing.""" + doc_id_1 = ObjectId() + doc_id_2 = ObjectId() + base_time = datetime.utcnow() + + return [ + ProcessingJob( + document_id=doc_id_1, + status=ProcessingStatus.PENDING, + task_id="task-1", + created_at=base_time, + started_at=None, + completed_at=None, + error_message=None + ), + ProcessingJob( + document_id=doc_id_2, + status=ProcessingStatus.PROCESSING, + task_id="task-2", + created_at=base_time, + started_at=base_time, + completed_at=None, + error_message=None + ), + ProcessingJob( + document_id=doc_id_1, + status=ProcessingStatus.COMPLETED, + task_id="task-3", + created_at=base_time, + started_at=base_time, + completed_at=base_time, + error_message=None + ) + ] + + +class TestJobRepositoryInitialization: + """Tests for repository initialization.""" + + def test_i_can_initialize_repository(self): + """Test repository initialization.""" + # Arrange + client = AsyncMongoMockClient() + db = client.test_database + repo = JobRepository(db) + + # Act + initialized_repo = repo.initialize() + + # Assert + assert initialized_repo is repo + assert repo.db is not None + assert repo.collection is not None + + +class TestJobRepositoryCreation: + """Tests for job creation functionality.""" + + def test_i_can_create_job_with_task_id(self, in_memory_repository, sample_document_id, sample_task_id): + """Test successful job creation with task ID.""" + # Act + created_job = in_memory_repository.create_job(sample_document_id, sample_task_id) + + # Assert + assert created_job is not None + assert created_job.document_id == sample_document_id + assert created_job.task_id == sample_task_id + assert created_job.status == ProcessingStatus.PENDING + assert created_job.created_at is not None + assert created_job.started_at is None + assert created_job.completed_at is None + assert created_job.error_message is None + assert created_job.id is not None + assert isinstance(created_job.id, ObjectId) + + def test_i_can_create_job_without_task_id(self, in_memory_repository, sample_document_id): + """Test successful job creation without task ID.""" + # Act + created_job = in_memory_repository.create_job(sample_document_id) + + # Assert + assert created_job is not None + assert created_job.document_id == sample_document_id + assert created_job.task_id is None + assert created_job.status == ProcessingStatus.PENDING + assert created_job.created_at is not None + assert created_job.started_at is None + assert created_job.completed_at is None + assert created_job.error_message is None + assert created_job.id is not None + assert 
+
+    def test_i_cannot_create_duplicate_job_for_document(self, in_memory_repository, sample_document_id,
+                                                        sample_task_id):
+        """Test that creating a job with a duplicate document_id raises JobRepositoryError."""
+        # Arrange
+        in_memory_repository.create_job(sample_document_id, sample_task_id)
+
+        # Act & Assert
+        with pytest.raises(JobRepositoryError) as exc_info:
+            in_memory_repository.create_job(sample_document_id, "different-task-id")
+
+        assert "create_job" in str(exc_info.value)
+
+    def test_i_cannot_create_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
+        """Test handling of PyMongo errors during job creation."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))
+
+        # Act & Assert
+        with pytest.raises(JobRepositoryError) as exc_info:
+            in_memory_repository.create_job(sample_document_id)
+
+        assert "create_job" in str(exc_info.value)
+
+
+class TestJobRepositoryFinding:
+    """Tests for job finding functionality."""
+
+    def test_i_can_find_job_by_valid_id(self, in_memory_repository, sample_document_id, sample_task_id):
+        """Test finding job by valid ObjectId."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)
+
+        # Act
+        found_job = in_memory_repository.find_job_by_id(created_job.id)
+
+        # Assert
+        assert found_job is not None
+        assert found_job.id == created_job.id
+        assert found_job.document_id == created_job.document_id
+        assert found_job.task_id == created_job.task_id
+        assert found_job.status == created_job.status
+
+    def test_i_cannot_find_job_by_nonexistent_id(self, in_memory_repository):
+        """Test that nonexistent ObjectId returns None."""
+        # Arrange
+        nonexistent_id = PyObjectId()
+
+        # Act
+        found_job = in_memory_repository.find_job_by_id(nonexistent_id)
+
+        # Assert
+        assert found_job is None
+
+    def test_i_cannot_find_job_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during job finding."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))
+
+        # Act & Assert
+        with pytest.raises(JobRepositoryError) as exc_info:
+            in_memory_repository.find_job_by_id(PyObjectId())
+
+        assert "get_job_by_id" in str(exc_info.value)
+
+    def test_i_can_find_jobs_by_document_id(self, in_memory_repository, sample_document_id, sample_task_id):
+        """Test finding jobs by document ID."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)
+
+        # Act
+        found_jobs = in_memory_repository.find_jobs_by_document_id(sample_document_id)
+
+        # Assert
+        assert len(found_jobs) == 1
+        assert found_jobs[0].id == created_job.id
+        assert found_jobs[0].document_id == sample_document_id
+
+    def test_i_can_find_empty_jobs_list_for_nonexistent_document(self, in_memory_repository):
+        """Test that nonexistent document ID returns empty list."""
+        # Arrange
+        nonexistent_id = ObjectId()
+
+        # Act
+        found_jobs = in_memory_repository.find_jobs_by_document_id(nonexistent_id)
+
+        # Assert
+        assert found_jobs == []
+
+    def test_i_cannot_find_jobs_by_document_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during finding jobs by document ID."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
+
+        # Act & Assert
+        with pytest.raises(JobRepositoryError) as exc_info:
+            in_memory_repository.find_jobs_by_document_id(PyObjectId())
+
+        assert "get_jobs_by_file_id" in str(exc_info.value)
+
+    @pytest.mark.parametrize("status", [
+        ProcessingStatus.PENDING,
+        ProcessingStatus.PROCESSING,
+        ProcessingStatus.COMPLETED
+    ])
+    def test_i_can_find_jobs_by_status(self, in_memory_repository, sample_document_id, status):
+        """Test finding jobs by each parametrized status."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id)
+        in_memory_repository.update_job_status(created_job.id, status)
+
+        # Act
+        found_jobs = in_memory_repository.get_jobs_by_status(status)
+
+        # Assert
+        assert len(found_jobs) == 1
+        assert found_jobs[0].id == created_job.id
+        assert found_jobs[0].status == status
+
+    def test_i_can_find_jobs_by_failed_status(self, in_memory_repository, sample_document_id):
+        """Test finding jobs by FAILED status."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id)
+        in_memory_repository.update_job_status(created_job.id, ProcessingStatus.FAILED, "Test error")
+
+        # Act
+        found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED)
+
+        # Assert
+        assert len(found_jobs) == 1
+        assert found_jobs[0].id == created_job.id
+        assert found_jobs[0].status == ProcessingStatus.FAILED
+        assert found_jobs[0].error_message == "Test error"
+
+    def test_i_can_find_empty_jobs_list_for_unused_status(self, in_memory_repository):
+        """Test that unused status returns empty list."""
+        # Act
+        found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED)
+
+        # Assert
+        assert found_jobs == []
+
+    def test_i_cannot_find_jobs_by_status_with_pymongo_error(self, in_memory_repository, mocker):
+        """Test handling of PyMongo errors during finding jobs by status."""
+        # Arrange
+        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
+
+        # Act & Assert
+        with pytest.raises(JobRepositoryError) as exc_info:
+            in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING)
+
+        assert "get_jobs_by_status" in str(exc_info.value)
+
+
+class TestJobRepositoryStatusUpdate:
+    """Tests for job status update functionality."""
+
+    def test_i_can_update_job_status_to_processing(self, in_memory_repository, sample_document_id):
+        """Test updating job status to PROCESSING with started_at timestamp."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id)
+
+        # Act
+        updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.PROCESSING)
+
+        # Assert
+        assert updated_job is not None
+        assert updated_job.id == created_job.id
+        assert updated_job.status == ProcessingStatus.PROCESSING
+        assert updated_job.started_at is not None
+        assert updated_job.completed_at is None
+        assert updated_job.error_message is None
+
+    def test_i_can_update_job_status_to_completed(self, in_memory_repository, sample_document_id):
+        """Test updating job status to COMPLETED with completed_at timestamp."""
+        # Arrange
+        created_job = in_memory_repository.create_job(sample_document_id)
+        in_memory_repository.update_job_status(created_job.id, ProcessingStatus.PROCESSING)
+
+        # Act
+        updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.COMPLETED)
+
+        # Assert
+        assert updated_job is not None
+        assert updated_job.id == created_job.id
+        assert updated_job.status == ProcessingStatus.COMPLETED
+        assert updated_job.started_at is not None
+        assert updated_job.completed_at is not None
+        assert updated_job.error_message is 
None + + def test_i_can_update_job_status_to_failed_with_error(self, in_memory_repository, sample_document_id): + """Test updating job status to FAILED with error message and completed_at timestamp.""" + # Arrange + created_job = in_memory_repository.create_job(sample_document_id) + error_message = "Processing failed due to invalid format" + + # Act + updated_job = in_memory_repository.update_job_status( + created_job.id, ProcessingStatus.FAILED, error_message + ) + + # Assert + assert updated_job is not None + assert updated_job.id == created_job.id + assert updated_job.status == ProcessingStatus.FAILED + assert updated_job.completed_at is not None + assert updated_job.error_message == error_message + + def test_i_can_update_job_status_to_failed_without_error(self, in_memory_repository, sample_document_id): + """Test updating job status to FAILED without error message.""" + # Arrange + created_job = in_memory_repository.create_job(sample_document_id) + + # Act + updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.FAILED) + + # Assert + assert updated_job is not None + assert updated_job.id == created_job.id + assert updated_job.status == ProcessingStatus.FAILED + assert updated_job.completed_at is not None + assert updated_job.error_message is None + + def test_i_cannot_update_nonexistent_job_status(self, in_memory_repository): + """Test that updating nonexistent job returns None.""" + # Arrange + nonexistent_id = ObjectId() + + # Act + result = in_memory_repository.update_job_status(nonexistent_id, ProcessingStatus.COMPLETED) + + # Assert + assert result is None + + def test_i_cannot_update_job_status_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker): + """Test handling of PyMongo errors during job status update.""" + # Arrange + created_job = in_memory_repository.create_job(sample_document_id) + mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', + side_effect=PyMongoError("Database error")) + + # Act & Assert + with pytest.raises(JobRepositoryError) as exc_info: + in_memory_repository.update_job_status(created_job.id, ProcessingStatus.COMPLETED) + + assert "update_job_status" in str(exc_info.value) + + +class TestJobRepositoryDeletion: + """Tests for job deletion functionality.""" + + def test_i_can_delete_existing_job(self, in_memory_repository, sample_document_id): + """Test successful job deletion.""" + # Arrange + created_job = in_memory_repository.create_job(sample_document_id) + + # Act + deletion_result = in_memory_repository.delete_job(created_job.id) + + # Assert + assert deletion_result is True + + # Verify job is actually deleted + found_job = in_memory_repository.find_job_by_id(created_job.id) + assert found_job is None + + def test_i_cannot_delete_nonexistent_job(self, in_memory_repository): + """Test that deleting nonexistent job returns False.""" + # Arrange + nonexistent_id = ObjectId() + + # Act + result = in_memory_repository.delete_job(nonexistent_id) + + # Assert + assert result is False + + def test_i_cannot_delete_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker): + """Test handling of PyMongo errors during job deletion.""" + # Arrange + created_job = in_memory_repository.create_job(sample_document_id) + mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error")) + + # Act & Assert + with pytest.raises(JobRepositoryError) as exc_info: + in_memory_repository.delete_job(created_job.id) + + assert 
"delete_job" in str(exc_info.value) + + +class TestJobRepositoryComplexScenarios: + """Tests for complex job repository scenarios.""" + + def test_i_can_handle_complete_job_lifecycle(self, in_memory_repository, sample_document_id, sample_task_id): + """Test complete job lifecycle from creation to completion.""" + # Create job + job = in_memory_repository.create_job(sample_document_id, sample_task_id) + assert job.status == ProcessingStatus.PENDING + assert job.started_at is None + assert job.completed_at is None + + # Start processing + job = in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING) + assert job.status == ProcessingStatus.PROCESSING + assert job.started_at is not None + assert job.completed_at is None + + # Complete job + job = in_memory_repository.update_job_status(job.id, ProcessingStatus.COMPLETED) + assert job.status == ProcessingStatus.COMPLETED + assert job.started_at is not None + assert job.completed_at is not None + assert job.error_message is None + + def test_i_can_handle_job_failure_scenario(self, in_memory_repository, sample_document_id, sample_task_id): + """Test job failure scenario with error message.""" + # Create and start job + job = in_memory_repository.create_job(sample_document_id, sample_task_id) + job = in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING) + + # Fail job with error + error_msg = "File format not supported" + job = in_memory_repository.update_job_status(job.id, ProcessingStatus.FAILED, error_msg) + + # Assert failure state + assert job.status == ProcessingStatus.FAILED + assert job.started_at is not None + assert job.completed_at is not None + assert job.error_message == error_msg + + def test_i_can_handle_multiple_documents_with_different_statuses(self, in_memory_repository): + """Test managing multiple jobs for different documents with various statuses.""" + # Create jobs for different documents + doc1 = PyObjectId() + doc2 = PyObjectId() + doc3 = PyObjectId() + + job1 = in_memory_repository.create_job(doc1, "task-1") + job2 = in_memory_repository.create_job(doc2, "task-2") + job3 = in_memory_repository.create_job(doc3, "task-3") + + # Update to different statuses + in_memory_repository.update_job_status(job1.id, ProcessingStatus.PROCESSING) + in_memory_repository.update_job_status(job2.id, ProcessingStatus.COMPLETED) + in_memory_repository.update_job_status(job3.id, ProcessingStatus.FAILED, "Error occurred") + + # Verify status queries + pending_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING) + processing_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.PROCESSING) + completed_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED) + failed_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED) + + assert len(pending_jobs) == 0 + assert len(processing_jobs) == 1 + assert len(completed_jobs) == 1 + assert len(failed_jobs) == 1 + + assert processing_jobs[0].id == job1.id + assert completed_jobs[0].id == job2.id + assert failed_jobs[0].id == job3.id diff --git a/tests/test_user_repository.py b/tests/repositories/test_user_repository.py similarity index 51% rename from tests/test_user_repository.py rename to tests/repositories/test_user_repository.py index 0d6d39b..a623802 100644 --- a/tests/test_user_repository.py +++ b/tests/repositories/test_user_repository.py @@ -1,29 +1,26 @@ """ -Test suite for UserRepository with async/await support. +Test suite for UserRepository with async/support. 
This module contains comprehensive tests for all UserRepository methods using mongomock-motor for in-memory MongoDB testing. """ import pytest -from datetime import datetime - -import pytest_asyncio from bson import ObjectId +from mongomock.mongo_client import MongoClient from pymongo.errors import DuplicateKeyError -from mongomock_motor import AsyncMongoMockClient from app.database.repositories.user_repository import UserRepository -from app.models.user import UserCreate, UserUpdate, UserInDB +from app.models.user import UserCreate, UserUpdate -@pytest_asyncio.fixture -async def in_memory_repository(): +@pytest.fixture +def in_memory_repository(): """Create an in-memory UserRepository for testing.""" - client = AsyncMongoMockClient() + client = MongoClient() db = client.test_database repo = UserRepository(db) - await repo.initialize() + repo.initialize() return repo @@ -51,11 +48,10 @@ def sample_user_update(): class TestUserRepositoryCreation: """Tests for user creation functionality.""" - @pytest.mark.asyncio - async def test_i_can_create_user(self, in_memory_repository, sample_user_create): + def test_i_can_create_user(self, in_memory_repository, sample_user_create): """Test successful user creation.""" # Act - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) # Assert assert created_user is not None @@ -68,15 +64,14 @@ class TestUserRepositoryCreation: assert created_user.updated_at is not None assert created_user.hashed_password != sample_user_create.password # Should be hashed - @pytest.mark.asyncio - async def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create): + def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create): """Test that creating user with duplicate username raises DuplicateKeyError.""" # Arrange - await in_memory_repository.create_user(sample_user_create) + in_memory_repository.create_user(sample_user_create) # Act & Assert with pytest.raises(DuplicateKeyError) as exc_info: - await in_memory_repository.create_user(sample_user_create) + in_memory_repository.create_user(sample_user_create) assert "already exists" in str(exc_info.value) @@ -84,14 +79,13 @@ class TestUserRepositoryCreation: class TestUserRepositoryFinding: """Tests for user finding functionality.""" - @pytest.mark.asyncio - async def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create): + def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create): """Test finding user by valid ID.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) # Act - found_user = await in_memory_repository.find_user_by_id(str(created_user.id)) + found_user = in_memory_repository.find_user_by_id(str(created_user.id)) # Assert assert found_user is not None @@ -99,69 +93,63 @@ class TestUserRepositoryFinding: assert found_user.username == created_user.username assert found_user.email == created_user.email - @pytest.mark.asyncio - async def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository): + def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository): """Test that invalid ObjectId returns None.""" # Act - found_user = await in_memory_repository.find_user_by_id("invalid_id") + found_user = in_memory_repository.find_user_by_id("invalid_id") # Assert assert found_user is None - 
@pytest.mark.asyncio - async def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository): + def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository): """Test that nonexistent but valid ObjectId returns None.""" # Arrange nonexistent_id = str(ObjectId()) # Act - found_user = await in_memory_repository.find_user_by_id(nonexistent_id) + found_user = in_memory_repository.find_user_by_id(nonexistent_id) # Assert assert found_user is None - @pytest.mark.asyncio - async def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create): + def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create): """Test finding user by username.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) # Act - found_user = await in_memory_repository.find_user_by_username(sample_user_create.username) + found_user = in_memory_repository.find_user_by_username(sample_user_create.username) # Assert assert found_user is not None assert found_user.username == created_user.username assert found_user.id == created_user.id - @pytest.mark.asyncio - async def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository): + def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository): """Test that nonexistent username returns None.""" # Act - found_user = await in_memory_repository.find_user_by_username("nonexistent") + found_user = in_memory_repository.find_user_by_username("nonexistent") # Assert assert found_user is None - @pytest.mark.asyncio - async def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create): + def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create): """Test finding user by email.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) # Act - found_user = await in_memory_repository.find_user_by_email(str(sample_user_create.email)) + found_user = in_memory_repository.find_user_by_email(str(sample_user_create.email)) # Assert assert found_user is not None assert found_user.email == created_user.email assert found_user.id == created_user.id - @pytest.mark.asyncio - async def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository): + def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository): """Test that nonexistent email returns None.""" # Act - found_user = await in_memory_repository.find_user_by_email("nonexistent@example.com") + found_user = in_memory_repository.find_user_by_email("nonexistent@example.com") # Assert assert found_user is None @@ -170,15 +158,14 @@ class TestUserRepositoryFinding: class TestUserRepositoryUpdate: """Tests for user update functionality.""" - @pytest.mark.asyncio - async def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update): + def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update): """Test successful user update.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) original_updated_at = created_user.updated_at # Act - updated_user = await in_memory_repository.update_user(str(created_user.id), sample_user_update) + updated_user = in_memory_repository.update_user(str(created_user.id), 
sample_user_update) # Assert assert updated_user is not None @@ -187,24 +174,22 @@ class TestUserRepositoryUpdate: assert updated_user.role == sample_user_update.role assert updated_user.id == created_user.id - @pytest.mark.asyncio - async def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update): + def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update): """Test that updating with invalid ID returns None.""" # Act - result = await in_memory_repository.update_user("invalid_id", sample_user_update) + result = in_memory_repository.update_user("invalid_id", sample_user_update) # Assert assert result is None - @pytest.mark.asyncio - async def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create): + def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create): """Test updating user with partial data.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) partial_update = UserUpdate(username="newusername") # Act - updated_user = await in_memory_repository.update_user(str(created_user.id), partial_update) + updated_user = in_memory_repository.update_user(str(created_user.id), partial_update) # Assert assert updated_user is not None @@ -212,15 +197,14 @@ class TestUserRepositoryUpdate: assert updated_user.email == created_user.email # Should remain unchanged assert updated_user.role == created_user.role # Should remain unchanged - @pytest.mark.asyncio - async def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create): + def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create): """Test updating user with empty data returns current user.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) empty_update = UserUpdate() # Act - result = await in_memory_repository.update_user(str(created_user.id), empty_update) + result = in_memory_repository.update_user(str(created_user.id), empty_update) # Assert assert result is not None @@ -231,39 +215,36 @@ class TestUserRepositoryUpdate: class TestUserRepositoryDeletion: """Tests for user deletion functionality.""" - @pytest.mark.asyncio - async def test_i_can_delete_user(self, in_memory_repository, sample_user_create): + def test_i_can_delete_user(self, in_memory_repository, sample_user_create): """Test successful user deletion.""" # Arrange - created_user = await in_memory_repository.create_user(sample_user_create) + created_user = in_memory_repository.create_user(sample_user_create) # Act - deletion_result = await in_memory_repository.delete_user(str(created_user.id)) + deletion_result = in_memory_repository.delete_user(str(created_user.id)) # Assert assert deletion_result is True # Verify user is actually deleted - found_user = await in_memory_repository.find_user_by_id(str(created_user.id)) + found_user = in_memory_repository.find_user_by_id(str(created_user.id)) assert found_user is None - @pytest.mark.asyncio - async def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository): + def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository): """Test that deleting with invalid ID returns False.""" # Act - result = await in_memory_repository.delete_user("invalid_id") + result = in_memory_repository.delete_user("invalid_id") # 
Assert assert result is False - @pytest.mark.asyncio - async def test_i_cannot_delete_nonexistent_user(self, in_memory_repository): + def test_i_cannot_delete_nonexistent_user(self, in_memory_repository): """Test that deleting nonexistent user returns False.""" # Arrange nonexistent_id = str(ObjectId()) # Act - result = await in_memory_repository.delete_user(nonexistent_id) + result = in_memory_repository.delete_user(nonexistent_id) # Assert assert result is False @@ -272,30 +253,27 @@ class TestUserRepositoryDeletion: class TestUserRepositoryUtilities: """Tests for utility methods.""" - @pytest.mark.asyncio - async def test_i_can_count_users(self, in_memory_repository, sample_user_create): + def test_i_can_count_users(self, in_memory_repository, sample_user_create): """Test counting users.""" # Arrange - initial_count = await in_memory_repository.count_users() - await in_memory_repository.create_user(sample_user_create) + initial_count = in_memory_repository.count_users() + in_memory_repository.create_user(sample_user_create) # Act - final_count = await in_memory_repository.count_users() + final_count = in_memory_repository.count_users() # Assert assert final_count == initial_count + 1 - @pytest.mark.asyncio - async def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create): + def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create): """Test checking if user exists.""" # Arrange - await in_memory_repository.create_user(sample_user_create) + in_memory_repository.create_user(sample_user_create) # Act - exists = await in_memory_repository.user_exists(sample_user_create.username) - not_exists = await in_memory_repository.user_exists("nonexistent") + exists = in_memory_repository.user_exists(sample_user_create.username) + not_exists = in_memory_repository.user_exists("nonexistent") # Assert assert exists is True assert not_exists is False - diff --git a/tests/services/__init__.py b/tests/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/services/test_document_service.py b/tests/services/test_document_service.py new file mode 100644 index 0000000..5ca9867 --- /dev/null +++ b/tests/services/test_document_service.py @@ -0,0 +1,570 @@ +""" +Unit tests for DocumentService using in-memory MongoDB. + +Tests the orchestration logic with real MongoDB operations +using mongomock for better integration testing. 
+""" +import os +from datetime import datetime +from unittest.mock import patch + +import pytest +import pytest_asyncio +from bson import ObjectId +from mongomock.mongo_client import MongoClient + +from app.models.document import FileType +from app.services.document_service import DocumentService + + +@pytest.fixture(autouse=True) +def cleanup_test_folder(): + """Clean up test folder.""" + import shutil + shutil.rmtree("test_folder", ignore_errors=True) + + +@pytest.fixture +def in_memory_database(): + """Create an in-memory database for testing.""" + client = MongoClient() + return client.test_database + + +@pytest_asyncio.fixture +def document_service(in_memory_database): + """Create DocumentService with in-memory repositories.""" + service = DocumentService(in_memory_database, objects_folder="test_folder") + return service + + +@pytest.fixture +def sample_file_bytes(): + """Sample file content as bytes.""" + return b"This is a test PDF content" + + +@pytest.fixture +def sample_text_bytes(): + """Sample text file content as bytes.""" + return b"This is a test text file content" + + +@pytest.fixture +def sample_file_hash(): + """Expected SHA256 hash for sample file bytes.""" + import hashlib + return hashlib.sha256(b"This is a test PDF content").hexdigest() + + +def validate_file_saved(document_service, file_hash, file_bytes): + # Verify file is saved to disk + target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) + assert os.path.exists(target_file_path) + + with open(target_file_path, "rb") as f: + content = f.read() + assert content == file_bytes + + +class TestCreateDocument: + """Tests for create_document method.""" + + @patch('app.services.document_service.magic.from_buffer') + @patch('app.services.document_service.datetime') + def test_i_can_create_document_with_new_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + """Test creating document when content doesn't exist yet.""" + # Setup mocks + fixed_time = datetime(2025, 1, 1, 10, 30, 0) + mock_datetime.now.return_value = fixed_time + mock_magic.return_value = "application/pdf" + + # Execute + result = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify document creation + assert result is not None + assert result.filename == "test.pdf" + assert result.filepath == "/test/test.pdf" + assert result.file_type == FileType.PDF + assert result.detected_at == fixed_time + assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) + + # Verify document created in database + doc_in_db = document_service.document_repository.find_document_by_id(result.id) + assert doc_in_db is not None + assert doc_in_db.id == result.id + assert doc_in_db.filename == result.filename + assert doc_in_db.filepath == result.filepath + assert doc_in_db.file_type == result.file_type + assert doc_in_db.detected_at == fixed_time + assert doc_in_db.file_hash == result.file_hash + + # Verify file is saved to disk + validate_file_saved(document_service, result.file_hash, sample_file_bytes) + + @patch('app.services.document_service.magic.from_buffer') + @patch('app.services.document_service.datetime') + def test_i_can_create_document_with_existing_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + """Test creating document when content already exists (deduplication).""" + # Setup mocks + fixed_time = datetime(2025, 1, 1, 10, 30, 0) + mock_datetime.now.return_value = 
fixed_time + mock_magic.return_value = "application/pdf" + + # Create first document + first_doc = document_service.create_document( + "/test/first.pdf", + sample_file_bytes, + "utf-8" + ) + + # Create second document with same content + second_doc = document_service.create_document( + "/test/second.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify both documents exist but share same hash + assert first_doc.file_hash == second_doc.file_hash + assert first_doc.filename != second_doc.filename + assert first_doc.filepath != second_doc.filepath + + def test_i_cannot_create_document_with_unsupported_file_type( + self, + document_service, + sample_file_bytes + ): + """Test that unsupported file types raise ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + document_service.create_document( + "/test/test.xyz", # Unsupported extension + sample_file_bytes, + "utf-8" + ) + + def test_i_cannot_create_document_with_empty_file_path( + self, + document_service, + sample_file_bytes + ): + """Test that empty file path raises ValueError.""" + with pytest.raises(ValueError): + document_service.create_document( + "", # Empty path + sample_file_bytes, + "utf-8" + ) + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_create_document_with_empty_bytes( + self, + mock_magic, + document_service + ): + """Test behavior with empty file bytes.""" + # Setup + mock_magic.return_value = "text/plain" + + # Execute with empty bytes + result = document_service.create_document( + "/test/empty.txt", + b"", # Empty bytes + "utf-8" + ) + + # Verify file is saved to disk + validate_file_saved(document_service, result.file_hash, b"") + + +class TestGetMethods: + """Tests for document retrieval methods.""" + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_get_document_by_id( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by ID.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = document_service.get_document_by_id(created_doc.id) + + # Verify + assert result is not None + assert result.id == created_doc.id + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_get_document_by_hash( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file hash.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = document_service.get_document_by_hash(created_doc.file_hash) + + # Verify + assert result is not None + assert result.file_hash == created_doc.file_hash + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_get_document_by_filepath( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file path.""" + # Setup + mock_magic.return_value = "application/pdf" + test_path = "/test/unique_test.pdf" + + # Create a document first + created_doc = document_service.create_document( + test_path, + sample_file_bytes, + "utf-8" + ) + + # Execute + result = document_service.get_document_by_filepath(test_path) + + # Verify + assert result is not None + 
assert result.filepath == test_path + assert result.id == created_doc.id + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_get_document_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document with associated content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = document_service.get_document_content_by_hash(created_doc.file_hash) + + # Verify + assert result == sample_file_bytes + + def test_i_cannot_get_nonexistent_document_by_id( + self, + document_service + ): + """Test that nonexistent document returns None.""" + # Execute with random ObjectId + result = document_service.get_document_by_id(ObjectId()) + + # Verify + assert result is None + + def test_i_cannot_get_nonexistent_document_by_hash( + self, + document_service + ): + """Test that nonexistent document hash returns None.""" + # Execute + result = document_service.get_document_by_hash("nonexistent_hash") + + # Verify + assert result is None + + +class TestPaginationAndCounting: + """Tests for document listing and counting.""" + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_list_documents_with_pagination( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document listing with pagination parameters.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create multiple documents + for i in range(5): + document_service.create_document( + f"/test/test{i}.pdf", + sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique + "utf-8" + ) + + # Execute with pagination + result = document_service.list_documents(skip=1, limit=2) + + # Verify + assert len(result) == 2 + + # Test counting + total_count = document_service.count_documents() + assert total_count == 5 + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_count_documents( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document counting.""" + # Setup + mock_magic.return_value = "text/plain" + + # Initially should be 0 + initial_count = document_service.count_documents() + assert initial_count == 0 + + # Create some documents + for i in range(3): + document_service.create_document( + f"/test/test{i}.txt", + sample_file_bytes + bytes(str(i), 'utf-8'), + "utf-8" + ) + + # Execute + final_count = document_service.count_documents() + + # Verify + assert final_count == 3 + + +class TestUpdateAndDelete: + """Tests for document update and deletion operations.""" + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_update_document_metadata( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test updating document metadata.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute update + update_data = {"metadata": {"page_count": 5}} + result = document_service.update_document(created_doc.id, update_data) + + # Verify + assert result is not None + assert result.metadata.get("page_count") == 5 + assert result.filename == created_doc.filename + assert result.filepath == created_doc.filepath + assert result.file_hash == created_doc.file_hash + assert result.file_type == created_doc.file_type + assert result.metadata == 
update_data['metadata'] + + def test_i_can_update_document_content( + self, + document_service, + sample_file_bytes + ): + """Test that updating the file bytes rewrites the stored content and hash.""" + # Create a document first + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute update + update_data = {"file_bytes": b"this is an updated file content"} + result = document_service.update_document(created_doc.id, update_data) + + assert result.filename == created_doc.filename + assert result.filepath == created_doc.filepath + assert result.file_hash != created_doc.file_hash + assert result.file_type == created_doc.file_type + assert result.metadata == created_doc.metadata + + # Verify file is saved to disk + validate_file_saved(document_service, result.file_hash, b"this is an updated file content") + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_delete_document_and_orphaned_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document with orphaned content cleanup.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document + created_doc = document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify content exists + validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes) + + # Execute deletion + result = document_service.delete_document(created_doc.id) + + # Verify document and content are deleted + assert result is True + + deleted_doc = document_service.get_document_by_id(created_doc.id) + assert deleted_doc is None + + # Validate content is deleted (the full hash is the filename, its prefix the directory) + file_hash = created_doc.file_hash + target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) + assert not os.path.exists(target_file_path) + + @patch('app.services.document_service.magic.from_buffer') + def test_i_can_delete_document_without_affecting_shared_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document without removing shared content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create two documents with same content + doc1 = document_service.create_document( + "/test/test1.pdf", + sample_file_bytes, + "utf-8" + ) + + doc2 = document_service.create_document( + "/test/test2.pdf", + sample_file_bytes, + "utf-8" + ) + + # They should share the same hash + assert doc1.file_hash == doc2.file_hash + + # Delete first document + result = document_service.delete_document(doc1.id) + assert result is True + + # Verify first document is deleted but content still exists + deleted_doc = document_service.get_document_by_id(doc1.id) + assert deleted_doc is None + + remaining_doc = document_service.get_document_by_id(doc2.id) + assert remaining_doc is not None + + validate_file_saved(document_service, doc2.file_hash, sample_file_bytes) + +
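The deduplication and deletion tests above all lean on the same content-addressable layout that `validate_file_saved` checks: each blob lives at `objects_folder/<hash[:24]>/<hash>`, keyed by its SHA-256 digest. A minimal sketch of that path derivation (the layout is taken from the tests; the helper name is illustrative, not part of `DocumentService`):

```python
import hashlib
import os


def object_path(objects_folder: str, file_bytes: bytes) -> str:
    """Illustrative helper: derive objects_folder/<hash[:24]>/<hash> for a blob."""
    file_hash = hashlib.sha256(file_bytes).hexdigest()
    return os.path.join(objects_folder, file_hash[:24], file_hash)


# Identical bytes map to the same path, which is what makes deduplication
# and the shared-content deletion test above work.
assert object_path("test_folder", b"abc") == object_path("test_folder", b"abc")
```
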
+ content1 = b"First content" + content2 = b"Second content" + + hash1 = document_service._calculate_file_hash(content1) + hash2 = document_service._calculate_file_hash(content2) + + assert hash1 != hash2 + + +class TestFileTypeDetection: + """Tests for file type detection.""" + + def test_i_can_detect_pdf_file_type(self, document_service): + """Test PDF file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.pdf") + assert file_type == FileType.PDF + + def test_i_can_detect_txt_file_type(self, document_service): + """Test text file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.txt") + assert file_type == FileType.TXT + + def test_i_can_detect_docx_file_type(self, document_service): + """Test DOCX file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.docx") + assert file_type == FileType.DOCX + + def test_i_cannot_detect_unsupported_file_type(self, document_service): + """Test unsupported file type raises ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + document_service._detect_file_type("/path/to/document.xyz") diff --git a/tests/services/test_job_service.py b/tests/services/test_job_service.py new file mode 100644 index 0000000..5307ab9 --- /dev/null +++ b/tests/services/test_job_service.py @@ -0,0 +1,518 @@ +""" +Unit tests for JobService using in-memory MongoDB. + +Tests the business logic operations with real MongoDB operations +using mongomock for better integration testing. +""" + +import pytest +from bson import ObjectId +from mongomock.mongo_client import MongoClient + +from app.exceptions.job_exceptions import InvalidStatusTransitionError +from app.models.job import ProcessingStatus +from app.models.types import PyObjectId +from app.services.job_service import JobService + + +@pytest.fixture +def in_memory_database(): + """Create an in-memory database for testing.""" + client = MongoClient() + return client.test_database + + +@pytest.fixture +def job_service(in_memory_database): + """Create JobService with in-memory repositories.""" + service = JobService(in_memory_database).initialize() + return service + + +@pytest.fixture +def sample_document_id(): + """Sample file ObjectId.""" + return PyObjectId() + + +@pytest.fixture +def sample_task_id(): + """Sample Celery task UUID.""" + return "550e8400-e29b-41d4-a716-446655440000" + + +class TestCreateJob: + """Tests for create_job method.""" + + def test_i_can_create_job_with_task_id( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test creating job with task ID.""" + # Execute + result = job_service.create_job(sample_document_id, sample_task_id) + + # Verify job creation + assert result is not None + assert result.document_id == sample_document_id + assert result.task_id == sample_task_id + assert result.status == ProcessingStatus.PENDING + assert result.created_at is not None + assert result.started_at is None + assert result.error_message is None + + # Verify job exists in database + job_in_db = job_service.get_job_by_id(result.id) + assert job_in_db is not None + assert job_in_db.id == result.id + assert job_in_db.document_id == sample_document_id + assert job_in_db.task_id == sample_task_id + assert job_in_db.status == ProcessingStatus.PENDING + + def test_i_can_create_job_without_task_id( + self, + job_service, + sample_document_id + ): + """Test creating job without task ID.""" + # Execute + result = job_service.create_job(sample_document_id) + + # Verify job 
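The detection tests above only pin down extension handling. A `_detect_file_type` consistent with them could be as small as the following sketch (the real implementation lives in `DocumentService` and may differ; the `FileType` values mirror those used in the tests):

```python
from enum import Enum
from pathlib import Path


class FileType(str, Enum):
    PDF = "pdf"
    TXT = "txt"
    DOCX = "docx"


def detect_file_type(filepath: str) -> FileType:
    """Sketch: map a file extension to a FileType, mirroring the tests above."""
    extension = Path(filepath).suffix.lstrip(".").lower()
    try:
        return FileType(extension)
    except ValueError:
        raise ValueError(f"Unsupported file type: {extension}") from None


assert detect_file_type("/path/to/document.pdf") is FileType.PDF
```
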
diff --git a/tests/services/test_job_service.py b/tests/services/test_job_service.py new file mode 100644 index 0000000..5307ab9 --- /dev/null +++ b/tests/services/test_job_service.py @@ -0,0 +1,518 @@ +""" +Unit tests for JobService using in-memory MongoDB. + +Tests the business logic operations with real MongoDB operations +using mongomock for better integration testing. +""" + +import pytest +from bson import ObjectId +from mongomock.mongo_client import MongoClient + +from app.exceptions.job_exceptions import InvalidStatusTransitionError +from app.models.job import ProcessingStatus +from app.models.types import PyObjectId +from app.services.job_service import JobService + + +@pytest.fixture +def in_memory_database(): + """Create an in-memory database for testing.""" + client = MongoClient() + return client.test_database + + +@pytest.fixture +def job_service(in_memory_database): + """Create JobService with in-memory repositories.""" + service = JobService(in_memory_database).initialize() + return service + + +@pytest.fixture +def sample_document_id(): + """Sample document ObjectId.""" + return PyObjectId() + + +@pytest.fixture +def sample_task_id(): + """Sample Celery task UUID.""" + return "550e8400-e29b-41d4-a716-446655440000" + + +class TestCreateJob: + """Tests for create_job method.""" + + def test_i_can_create_job_with_task_id( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test creating job with task ID.""" + # Execute + result = job_service.create_job(sample_document_id, sample_task_id) + + # Verify job creation + assert result is not None + assert result.document_id == sample_document_id + assert result.task_id == sample_task_id + assert result.status == ProcessingStatus.PENDING + assert result.created_at is not None + assert result.started_at is None + assert result.error_message is None + + # Verify job exists in database + job_in_db = job_service.get_job_by_id(result.id) + assert job_in_db is not None + assert job_in_db.id == result.id + assert job_in_db.document_id == sample_document_id + assert job_in_db.task_id == sample_task_id + assert job_in_db.status == ProcessingStatus.PENDING + + def test_i_can_create_job_without_task_id( + self, + job_service, + sample_document_id + ): + """Test creating job without task ID.""" + # Execute + result = job_service.create_job(sample_document_id) + + # Verify job creation + assert result is not None + assert result.document_id == sample_document_id + assert result.task_id is None + assert result.status == ProcessingStatus.PENDING + assert result.created_at is not None + assert result.started_at is None + assert result.error_message is None + + +class TestGetJobMethods: + """Tests for job retrieval methods.""" + + def test_i_can_get_job_by_id( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test retrieving job by ID.""" + # Create a job first + created_job = job_service.create_job(sample_document_id, sample_task_id) + + # Execute + result = job_service.get_job_by_id(created_job.id) + + # Verify + assert result is not None + assert result.id == created_job.id + assert result.document_id == created_job.document_id + assert result.task_id == created_job.task_id + assert result.status == created_job.status + + def test_i_can_get_jobs_by_status( + self, + job_service, + sample_document_id + ): + """Test retrieving jobs by status.""" + # Create jobs with different statuses + pending_job = job_service.create_job(sample_document_id, "pending-task") + + processing_job = job_service.create_job(ObjectId(), "processing-task") + job_service.mark_job_as_started(processing_job.id) + + completed_job = job_service.create_job(ObjectId(), "completed-task") + job_service.mark_job_as_started(completed_job.id) + job_service.mark_job_as_completed(completed_job.id) + + # Execute - get pending jobs + pending_results = job_service.get_jobs_by_status(ProcessingStatus.PENDING) + + # Verify + assert len(pending_results) == 1 + assert pending_results[0].id == pending_job.id + assert pending_results[0].status == ProcessingStatus.PENDING + + # Execute - get processing jobs + processing_results = job_service.get_jobs_by_status(ProcessingStatus.PROCESSING) + assert len(processing_results) == 1 + assert processing_results[0].status == ProcessingStatus.PROCESSING + + # Execute - get completed jobs + completed_results = job_service.get_jobs_by_status(ProcessingStatus.COMPLETED) + assert len(completed_results) == 1 + assert completed_results[0].status == ProcessingStatus.COMPLETED + +
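The transition tests below assert on the `current_status` and `target_status` attributes of `InvalidStatusTransitionError`. A minimal exception satisfying that contract might look like this (a sketch only; the actual class in `app/exceptions/job_exceptions.py` is not shown in this diff):

```python
class InvalidStatusTransitionError(Exception):
    """Raised when a job status change violates the allowed lifecycle."""

    def __init__(self, current_status, target_status):
        # Keep the attempted transition available to callers (and tests).
        self.current_status = current_status
        self.target_status = target_status
        super().__init__(
            f"Cannot transition job from {current_status} to {target_status}"
        )
```
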
+class TestUpdateStatus: + """Tests for the job status transition methods.""" + + def test_i_can_mark_pending_job_as_started( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test marking pending job as started (PENDING → PROCESSING).""" + # Create a pending job + created_job = job_service.create_job(sample_document_id, sample_task_id) + assert created_job.status == ProcessingStatus.PENDING + + # Execute + result = job_service.mark_job_as_started(created_job.id) + + # Verify status transition + assert result is not None + assert result.id == created_job.id + assert result.status == ProcessingStatus.PROCESSING + + # Verify in database + updated_job = job_service.get_job_by_id(created_job.id) + assert updated_job.status == ProcessingStatus.PROCESSING + + def test_i_cannot_mark_processing_job_as_started( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that processing job cannot be marked as started.""" + # Create and start a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + + # Try to start it again + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_started(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.PROCESSING + assert exc_info.value.target_status == ProcessingStatus.PROCESSING + + def test_i_cannot_mark_completed_job_as_started( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that completed job cannot be marked as started.""" + # Create, start, and complete a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_completed(created_job.id) + + # Try to start it again + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_started(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.COMPLETED + assert exc_info.value.target_status == ProcessingStatus.PROCESSING + + def test_i_cannot_mark_failed_job_as_started( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that failed job cannot be marked as started.""" + # Create, start, and fail a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_failed(created_job.id, "Test error") + + # Try to start it again + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_started(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.FAILED + assert exc_info.value.target_status == ProcessingStatus.PROCESSING + + def test_i_can_mark_processing_job_as_completed( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test marking processing job as completed (PROCESSING → COMPLETED).""" + # Create and start a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + started_job = job_service.mark_job_as_started(created_job.id) + + # Execute + result = job_service.mark_job_as_completed(created_job.id) + + # Verify status transition + assert result is not None + assert result.id == created_job.id + assert result.status == ProcessingStatus.COMPLETED + + # Verify in database + updated_job = job_service.get_job_by_id(created_job.id) + assert updated_job.status == ProcessingStatus.COMPLETED + + def test_i_cannot_mark_pending_job_as_completed( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that pending job cannot be marked as completed.""" + # Create a pending job + created_job = job_service.create_job(sample_document_id, sample_task_id) + + # Try to complete it directly + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_completed(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.PENDING + assert exc_info.value.target_status == ProcessingStatus.COMPLETED + + def test_i_cannot_mark_completed_job_as_completed( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that completed job cannot be marked as completed again.""" + # Create, start, and complete a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_completed(created_job.id) + + # Try to complete it again + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_completed(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.COMPLETED + assert exc_info.value.target_status == ProcessingStatus.COMPLETED + + def test_i_cannot_mark_failed_job_as_completed( + self, +
job_service, + sample_document_id, + sample_task_id + ): + """Test that failed job cannot be marked as completed.""" + # Create, start, and fail a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_failed(created_job.id, "Test error") + + # Try to complete it + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_completed(created_job.id) + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.FAILED + assert exc_info.value.target_status == ProcessingStatus.COMPLETED + + def test_i_can_mark_processing_job_as_failed_with_error_message( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test marking processing job as failed with error message.""" + # Create and start a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + started_job = job_service.mark_job_as_started(created_job.id) + + error_message = "Processing failed due to invalid file format" + + # Execute + result = job_service.mark_job_as_failed(created_job.id, error_message) + + # Verify status transition + assert result is not None + assert result.id == created_job.id + assert result.status == ProcessingStatus.FAILED + assert result.error_message == error_message + + # Verify in database + updated_job = job_service.get_job_by_id(created_job.id) + assert updated_job.status == ProcessingStatus.FAILED + assert updated_job.error_message == error_message + + def test_i_can_mark_processing_job_as_failed_without_error_message( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test marking processing job as failed without error message.""" + # Create and start a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + + # Execute without error message + result = job_service.mark_job_as_failed(created_job.id) + + # Verify status transition + assert result is not None + assert result.status == ProcessingStatus.FAILED + assert result.error_message is None + + def test_i_cannot_mark_pending_job_as_failed( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that pending job cannot be marked as failed.""" + # Create a pending job + created_job = job_service.create_job(sample_document_id, sample_task_id) + + # Try to fail it directly + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_failed(created_job.id, "Test error") + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.PENDING + assert exc_info.value.target_status == ProcessingStatus.FAILED + + def test_i_cannot_mark_completed_job_as_failed( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test that completed job cannot be marked as failed.""" + # Create, start, and complete a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_completed(created_job.id) + + # Try to fail it + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_failed(created_job.id, "Test error") + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.COMPLETED + assert exc_info.value.target_status == ProcessingStatus.FAILED + + def test_i_cannot_mark_failed_job_as_failed( + self, + job_service, + sample_document_id, + 
sample_task_id + ): + """Test that failed job cannot be marked as failed again.""" + # Create, start, and fail a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + job_service.mark_job_as_started(created_job.id) + job_service.mark_job_as_failed(created_job.id, "First error") + + # Try to fail it again + with pytest.raises(InvalidStatusTransitionError) as exc_info: + job_service.mark_job_as_failed(created_job.id, "Second error") + + # Verify exception details + assert exc_info.value.current_status == ProcessingStatus.FAILED + assert exc_info.value.target_status == ProcessingStatus.FAILED + + +class TestDeleteJob: + """Tests for delete_job method.""" + + def test_i_can_delete_existing_job( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test deleting an existing job.""" + # Create a job + created_job = job_service.create_job(sample_document_id, sample_task_id) + + # Verify job exists + job_before_delete = job_service.get_job_by_id(created_job.id) + assert job_before_delete is not None + + # Execute deletion + result = job_service.delete_job(created_job.id) + + # Verify deletion + assert result is True + + # Verify job no longer exists + deleted_job = job_service.get_job_by_id(created_job.id) + assert deleted_job is None + + def test_i_cannot_delete_nonexistent_job( + self, + job_service + ): + """Test deleting a nonexistent job returns False.""" + # Execute deletion with random ObjectId + result = job_service.delete_job(ObjectId()) + + # Verify + assert result is False + + +class TestStatusTransitionValidation: + """Tests for status transition validation across different scenarios.""" + + def test_valid_job_lifecycle_flow( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test complete valid job lifecycle: PENDING → PROCESSING → COMPLETED.""" + # Create job (PENDING) + job = job_service.create_job(sample_document_id, sample_task_id) + assert job.status == ProcessingStatus.PENDING + + # Start job (PENDING → PROCESSING) + started_job = job_service.mark_job_as_started(job.id) + assert started_job.status == ProcessingStatus.PROCESSING + + # Complete job (PROCESSING → COMPLETED) + completed_job = job_service.mark_job_as_completed(job.id) + assert completed_job.status == ProcessingStatus.COMPLETED + + def test_valid_job_failure_flow( + self, + job_service, + sample_document_id, + sample_task_id + ): + """Test valid job failure: PENDING → PROCESSING → FAILED.""" + # Create job (PENDING) + job = job_service.create_job(sample_document_id, sample_task_id) + assert job.status == ProcessingStatus.PENDING + + # Start job (PENDING → PROCESSING) + started_job = job_service.mark_job_as_started(job.id) + assert started_job.status == ProcessingStatus.PROCESSING + + # Fail job (PROCESSING → FAILED) + failed_job = job_service.mark_job_as_failed(job.id, "Test failure") + assert failed_job.status == ProcessingStatus.FAILED + assert failed_job.error_message == "Test failure" + + def test_job_operations_with_empty_database( + self, + job_service + ): + """Test job operations when database is empty.""" + # Try to get nonexistent job + result = job_service.get_job_by_id(ObjectId()) + assert result is None + + # Try to get jobs by status when none exist + pending_jobs = job_service.get_jobs_by_status(ProcessingStatus.PENDING) + assert pending_jobs == [] + + # Try to delete nonexistent job + delete_result = job_service.delete_job(ObjectId()) + assert delete_result is False
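Taken together, these tests pin down a small state machine: PENDING → PROCESSING, then PROCESSING → COMPLETED or FAILED, with every other move rejected. A table-driven guard consistent with that contract (a sketch under those assumptions, not necessarily how `JobService` implements it):

```python
from enum import Enum


class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


# Transitions permitted by the tests above; everything else must raise
# InvalidStatusTransitionError.
ALLOWED_TRANSITIONS = {
    ProcessingStatus.PENDING: {ProcessingStatus.PROCESSING},
    ProcessingStatus.PROCESSING: {ProcessingStatus.COMPLETED, ProcessingStatus.FAILED},
    ProcessingStatus.COMPLETED: set(),
    ProcessingStatus.FAILED: set(),
}


def can_transition(current: ProcessingStatus, target: ProcessingStatus) -> bool:
    """Return True when the job lifecycle allows moving from current to target."""
    return target in ALLOWED_TRANSITIONS[current]
```
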
diff --git a/tests/test_connection.py b/tests/test_connection.py deleted file mode 100644 index 1fb9968..0000000 --- a/tests/test_connection.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -Unit tests for MongoDB database connection module. - -Tests the database connection functionality with mocking -to avoid requiring actual MongoDB instance during tests. -""" - -import pytest -from unittest.mock import Mock, patch, MagicMock -from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError - -from app.database.connection import ( - create_mongodb_client, - get_database, - close_database_connection, - get_mongodb_client, - test_database_connection -) - - -def test_i_can_get_database_connection(): - """Test successful database connection creation.""" - mock_client = Mock() - mock_database = Mock() - - # Configure the mock to support dictionary-like access - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - result = get_database() - - assert result == mock_database - mock_client.admin.command.assert_called_with('ping') - # Verify that __getitem__ was called with the database name - mock_client.__getitem__.assert_called_with("testdb") - - -def test_i_cannot_connect_to_invalid_mongodb_url(): - """Test fail-fast behavior with invalid MongoDB URL.""" - mock_client = Mock() - mock_client.admin.command.side_effect = ConnectionFailure("Connection failed") - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_cannot_connect_with_server_selection_timeout(): - """Test fail-fast behavior with server selection timeout.""" - mock_client = Mock() - mock_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout") - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_cannot_connect_with_unexpected_error(): - """Test fail-fast behavior with unexpected connection error.""" - with patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error")): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017"): - with pytest.raises(SystemExit) as exc_info: - create_mongodb_client() - - assert exc_info.value.code == 1 - - -def test_i_can_get_database_singleton(): - """Test that get_database returns the same instance (singleton pattern).""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None -
app.database.connection._database = None - - # First call - db1 = get_database() - # Second call - db2 = get_database() - - assert db1 is db2 - # MongoClient should be called only once - assert mock_client.admin.command.call_count == 1 - - -def test_i_can_close_database_connection(): - """Test closing database connection.""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Create connection - get_database() - - # Close connection - close_database_connection() - - mock_client.close.assert_called_once() - assert app.database.connection._client is None - assert app.database.connection._database is None - - -def test_i_can_get_mongodb_client(): - """Test getting raw MongoDB client instance.""" - mock_client = Mock() - mock_database = Mock() - mock_client.__getitem__ = Mock(return_value=mock_database) - - with patch('app.database.connection.MongoClient', return_value=mock_client): - with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"): - with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"): - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Create connection first - get_database() - - # Get client - result = get_mongodb_client() - - assert result == mock_client - - -def test_i_can_get_none_mongodb_client_when_not_connected(): - """Test getting MongoDB client returns None when not connected.""" - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - result = get_mongodb_client() - assert result is None - - -def test_i_can_test_database_connection_success(): - """Test database connection health check - success case.""" - mock_database = Mock() - mock_database.command.return_value = True - - with patch('app.database.connection.get_database', return_value=mock_database): - result = test_database_connection() - - assert result is True - mock_database.command.assert_called_with('ping') - - -def test_i_can_close_connection_when_no_client(): - """Test closing connection when no client exists (should not raise error).""" - # Reset global variables - import app.database.connection - app.database.connection._client = None - app.database.connection._database = None - - # Should not raise any exception - close_database_connection() - - assert app.database.connection._client is None - assert app.database.connection._database is None \ No newline at end of file diff --git a/tests/test_document_content_repository.py b/tests/test_document_content_repository.py deleted file mode 100644 index 1033e59..0000000 --- a/tests/test_document_content_repository.py +++ /dev/null @@ -1,311 +0,0 @@ -""" -Test suite for DocumentContentRepository with async/await support. - -This module contains comprehensive tests for all DocumentContentRepository methods -using mongomock-motor for in-memory MongoDB testing. 
-""" - -import pytest -import hashlib -from datetime import datetime - -import pytest_asyncio -from bson import ObjectId -from pymongo.errors import DuplicateKeyError -from mongomock_motor import AsyncMongoMockClient - -from app.database.repositories.document_content_repository import DocumentContentRepository -from app.models.document import DocumentContent - - -@pytest_asyncio.fixture -async def in_memory_repository(): - """Create an in-memory DocumentContentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = DocumentContentRepository(db) - await repo.initialize() - return repo - - -@pytest.fixture -def sample_document_content(): - """Sample DocumentContent data for testing.""" - content = "This is sample document content for testing purposes." - file_hash = hashlib.sha256(content.encode()).hexdigest() - - return DocumentContent( - file_hash=file_hash, - content=content, - encoding="utf-8", - file_size=len(content.encode()), - mime_type="text/plain" - ) - - -@pytest.fixture -def another_document_content(): - """Another sample DocumentContent data for testing.""" - content = "This is another sample document with different content." - file_hash = hashlib.sha256(content.encode()).hexdigest() - - return DocumentContent( - file_hash=file_hash, - content=content, - encoding="utf-8", - file_size=len(content.encode()), - mime_type="text/plain" - ) - - -class TestDocumentContentRepositoryCreation: - """Tests for document content creation functionality.""" - - @pytest.mark.asyncio - async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content creation.""" - # Act - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Assert - assert created_content is not None - assert created_content.file_hash == sample_document_content.file_hash - assert created_content.content == sample_document_content.content - assert created_content.encoding == sample_document_content.encoding - assert created_content.file_size == sample_document_content.file_size - assert created_content.mime_type == sample_document_content.mime_type - assert created_content.id is not None - - @pytest.mark.asyncio - async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository, - sample_document_content): - """Test that creating document content with duplicate file_hash raises DuplicateKeyError.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - - # Act & Assert - with pytest.raises(DuplicateKeyError) as exc_info: - await in_memory_repository.create_document_content(sample_document_content) - - assert "already exists" in str(exc_info.value) - - -class TestDocumentContentRepositoryFinding: - """Tests for document content finding functionality.""" - - @pytest.mark.asyncio - async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content): - """Test finding document content by valid ID.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) - - # Assert - assert found_content is not None - assert found_content.id == created_content.id - assert found_content.file_hash == created_content.file_hash - assert found_content.content == created_content.content - - @pytest.mark.asyncio - async def 
test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository): - """Test that invalid ObjectId returns None.""" - # Act - found_content = await in_memory_repository.find_document_content_by_id("invalid_id") - - # Assert - assert found_content is None - - @pytest.mark.asyncio - async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository): - """Test that nonexistent but valid ObjectId returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - found_content = await in_memory_repository.find_document_content_by_id(nonexistent_id) - - # Assert - assert found_content is None - - @pytest.mark.asyncio - async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content): - """Test finding document content by file hash.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - found_content = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash) - - # Assert - assert found_content is not None - assert found_content.file_hash == created_content.file_hash - assert found_content.id == created_content.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository): - """Test that nonexistent file hash returns None.""" - # Act - found_content = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash") - - # Assert - assert found_content is None - - -class TestDocumentContentRepositoryUpdate: - """Tests for document content update functionality.""" - - @pytest.mark.asyncio - async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content update.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - update_data = { - "content": "Updated content for testing", - "encoding": "utf-16", - "mime_type": "text/html" - } - - # Act - updated_content = await in_memory_repository.update_document_content(str(created_content.id), update_data) - - # Assert - assert updated_content is not None - assert updated_content.content == update_data["content"] - assert updated_content.encoding == update_data["encoding"] - assert updated_content.mime_type == update_data["mime_type"] - assert updated_content.id == created_content.id - assert updated_content.file_hash == created_content.file_hash # Should remain unchanged - - @pytest.mark.asyncio - async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository): - """Test that updating with invalid ID returns None.""" - # Act - result = await in_memory_repository.update_document_content("invalid_id", {"content": "test"}) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content): - """Test updating document content with partial data.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - partial_update = {"encoding": "iso-8859-1"} - - # Act - updated_content = await in_memory_repository.update_document_content(str(created_content.id), partial_update) - - # Assert - assert updated_content is not None - assert updated_content.encoding == "iso-8859-1" - assert updated_content.content == created_content.content # Should remain unchanged - assert 
updated_content.mime_type == created_content.mime_type # Should remain unchanged - - @pytest.mark.asyncio - async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content): - """Test updating document content with empty data returns current content.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - empty_update = {} - - # Act - result = await in_memory_repository.update_document_content(str(created_content.id), empty_update) - - # Assert - assert result is not None - assert result.content == created_content.content - assert result.encoding == created_content.encoding - assert result.mime_type == created_content.mime_type - - -class TestDocumentContentRepositoryDeletion: - """Tests for document content deletion functionality.""" - - @pytest.mark.asyncio - async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content): - """Test successful document content deletion.""" - # Arrange - created_content = await in_memory_repository.create_document_content(sample_document_content) - - # Act - deletion_result = await in_memory_repository.delete_document_content(str(created_content.id)) - - # Assert - assert deletion_result is True - - # Verify content is actually deleted - found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) - assert found_content is None - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository): - """Test that deleting with invalid ID returns False.""" - # Act - result = await in_memory_repository.delete_document_content("invalid_id") - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository): - """Test that deleting nonexistent document content returns False.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - result = await in_memory_repository.delete_document_content(nonexistent_id) - - # Assert - assert result is False - - -class TestDocumentContentRepositoryUtilities: - """Tests for utility methods.""" - - @pytest.mark.asyncio - async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content): - """Test checking if document content exists by file hash.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - - # Act - exists = await in_memory_repository.content_exists(sample_document_content.file_hash) - not_exists = await in_memory_repository.content_exists("nonexistent_hash") - - # Assert - assert exists is True - assert not_exists is False - - @pytest.mark.asyncio - async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content, - another_document_content): - """Test listing document contents with pagination.""" - # Arrange - await in_memory_repository.create_document_content(sample_document_content) - await in_memory_repository.create_document_content(another_document_content) - - # Act - all_contents = await in_memory_repository.list_document_contents() - limited_contents = await in_memory_repository.list_document_contents(skip=0, limit=1) - - # Assert - assert len(all_contents) == 2 - assert len(limited_contents) == 1 - assert all(isinstance(content, DocumentContent) for content in all_contents) - - @pytest.mark.asyncio - async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content, - 
another_document_content): - """Test counting document contents.""" - # Arrange - initial_count = await in_memory_repository.count_document_contents() - await in_memory_repository.create_document_content(sample_document_content) - await in_memory_repository.create_document_content(another_document_content) - - # Act - final_count = await in_memory_repository.count_document_contents() - - # Assert - assert final_count == initial_count + 2 \ No newline at end of file diff --git a/tests/test_document_repository.py b/tests/test_document_repository.py deleted file mode 100644 index a5cc5c1..0000000 --- a/tests/test_document_repository.py +++ /dev/null @@ -1,566 +0,0 @@ -""" -Test suite for FileDocumentRepository with async/await support. - -This module contains comprehensive tests for all FileDocumentRepository methods -using mongomock-motor for in-memory MongoDB testing. -""" - -import pytest -from datetime import datetime -from typing import Dict, Any - -import pytest_asyncio -from bson import ObjectId -from pymongo.errors import DuplicateKeyError, PyMongoError -from mongomock_motor import AsyncMongoMockClient - -from app.database.repositories.document_repository import FileDocumentRepository -from app.models.document import FileDocument, FileType - - -@pytest_asyncio.fixture -async def in_memory_repository(): - """Create an in-memory FileDocumentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - # repo.db = db - # repo.collection = db.files - await repo.initialize() - return repo - - -@pytest.fixture -def sample_file_document(): - """Sample FileDocument data for testing.""" - return FileDocument( - filename="test_document.pdf", - filepath="/path/to/test_document.pdf", - file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", - file_type=FileType("pdf"), - detected_at=datetime.now(), - ) - - -@pytest.fixture -def sample_update_data(): - """Sample update data for testing.""" - return { - "metadata": {"tags": ["updated", "document"]}, - "file_type": FileType("txt"), - } - - -@pytest.fixture -def multiple_sample_documents(): - """Multiple FileDocument objects for list/search testing.""" - base_time = datetime.now() - return [ - FileDocument( - filename="document1.pdf", - filepath="/path/to/document1.pdf", - file_hash="hash1" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ), - FileDocument( - filename="similar_document.pdf", - filepath="/path/to/similar_document.pdf", - file_hash="hash2" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ), - FileDocument( - filename="completely_different.txt", - filepath="/path/to/completely_different.txt", - file_hash="hash3" + "0" * 58, - file_type=FileType("pdf"), - detected_at=base_time, - ) - ] - - -class TestFileDocumentRepositoryInitialization: - """Tests for repository initialization.""" - - @pytest.mark.asyncio - async def test_i_can_initialize_repository(self): - """Test repository initialization.""" - # Arrange - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - await repo.initialize() - - # Act & Assert (should not raise any exception) - assert repo.db is not None - assert repo.collection is not None - # TODO : check that the indexes are create - - -class TestFileDocumentRepositoryCreation: - """Tests for file document creation functionality.""" - - @pytest.mark.asyncio - async def test_i_can_create_document(self, in_memory_repository, sample_file_document): - """Test 
successful file document creation.""" - # Act - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Assert - assert created_doc is not None - assert created_doc.filename == sample_file_document.filename - assert created_doc.filepath == sample_file_document.filepath - assert created_doc.file_hash == sample_file_document.file_hash - assert created_doc.file_type == sample_file_document.file_type - assert created_doc.id is not None - assert isinstance(created_doc.id, ObjectId) - - @pytest.mark.asyncio - async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document): - """Test creating document with _id set to None (should be removed).""" - # Arrange - sample_file_document.id = None - - # Act - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Assert - assert created_doc is not None - assert created_doc.id is not None - assert isinstance(created_doc.id, ObjectId) - - @pytest.mark.asyncio - async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document): - """Test that creating document with duplicate hash raises DuplicateKeyError.""" - # Arrange - await in_memory_repository.create_document(sample_file_document) - duplicate_doc = FileDocument( - filename="different_name.pdf", - filepath=sample_file_document.filepath, - file_hash="different_hash" + "0" * 58, - file_type=FileType("pdf"), - detected_at=datetime.now() - ) - - # Act & Assert - with pytest.raises(DuplicateKeyError) as exc_info: - await in_memory_repository.create_document(duplicate_doc) - - assert "already exists" in str(exc_info.value) - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): - """Test handling of PyMongo errors during document creation.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error")) - - # Act & Assert - with pytest.raises(ValueError) as exc_info: - await in_memory_repository.create_document(sample_file_document) - - assert "Failed to create file document" in str(exc_info.value) - - -class TestFileDocumentRepositoryFinding: - """Tests for file document finding functionality.""" - - @pytest.mark.asyncio - async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document): - """Test finding document by valid ObjectId.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) - - # Assert - assert found_doc is not None - assert found_doc.id == created_doc.id - assert found_doc.filename == created_doc.filename - assert found_doc.file_hash == created_doc.file_hash - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository): - """Test that invalid ObjectId returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_id("invalid_id") - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository): - """Test that nonexistent but valid ObjectId returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - found_doc = await in_memory_repository.find_document_by_id(nonexistent_id) - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_can_find_document_by_hash(self, 
in_memory_repository, sample_file_document): - """Test finding document by file hash.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash) - - # Assert - assert found_doc is not None - assert found_doc.file_hash == created_doc.file_hash - assert found_doc.id == created_doc.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository): - """Test that nonexistent hash returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash") - - # Assert - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document): - """Test finding document by exact filepath.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath) - - # Assert - assert found_doc is not None - assert found_doc.filepath == created_doc.filepath - assert found_doc.id == created_doc.id - - @pytest.mark.asyncio - async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository): - """Test that nonexistent filepath returns None.""" - # Act - found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf") - - # Assert - assert found_doc is None - - -class TestFileDocumentRepositoryFuzzySearch: - """Tests for fuzzy search functionality by filename.""" - - @pytest.mark.asyncio - async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents): - """Test finding documents with exact filename match.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - found_docs = await in_memory_repository.find_document_by_name("document1.pdf") - - # Assert - assert len(found_docs) == 1 - assert found_docs[0].filename == "document1.pdf" - - @pytest.mark.asyncio - async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents): - """Test finding documents with fuzzy matching using default threshold.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - found_docs = await in_memory_repository.find_document_by_name("document") - - # Assert - assert len(found_docs) >= 2 # Should find document1.pdf and similar_document.pdf - filenames = [doc.filename for doc in found_docs] - assert "document1.pdf" in filenames - assert "similar_document.pdf" in filenames - - @pytest.mark.asyncio - async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker): - """Test handling of PyMongo errors during name search.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) - - # Act - found_docs = await in_memory_repository.find_document_by_name("test") - - # Assert - assert found_docs == [] - - -class TestFileDocumentRepositoryListing: - """Tests for document listing functionality.""" - - @pytest.mark.asyncio - async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_documents): - """Test listing documents with default pagination.""" - # Arrange - for doc in multiple_sample_documents: - await 
in_memory_repository.create_document(doc) - - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert len(docs) == len(multiple_sample_documents) - assert all(isinstance(doc, FileDocument) for doc in docs) - - @pytest.mark.asyncio - async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents): - """Test listing documents with custom pagination.""" - # Arrange - for doc in multiple_sample_documents: - await in_memory_repository.create_document(doc) - - # Act - docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2) - docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2) - - # Assert - assert len(docs_page1) == 2 - assert len(docs_page2) == 1 # Only 3 total documents - - # Ensure no overlap between pages - page1_ids = [doc.id for doc in docs_page1] - page2_ids = [doc.id for doc in docs_page2] - assert len(set(page1_ids).intersection(set(page2_ids))) == 0 - - @pytest.mark.asyncio - async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document): - """Test that documents are sorted by detected_at in descending order.""" - # Arrange - from datetime import timedelta - - # Create documents with different timestamps - doc1 = sample_file_document.model_copy() - doc1.filename = "oldest.pdf" - doc1.filepath = f"/path/to/{doc1.filename}" - doc1.file_hash = "hash1" + "0" * 58 - doc1.detected_at = datetime.now() - timedelta(hours=2) - - doc2 = sample_file_document.model_copy() - doc2.filename = "newest.pdf" - doc2.filepath = f"/path/to/{doc2.filename}" - doc2.file_hash = "hash2" + "0" * 58 - doc2.detected_at = datetime.now() - - await in_memory_repository.create_document(doc1) - await in_memory_repository.create_document(doc2) - - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert len(docs) == 2 - assert docs[0].filename == "newest.pdf" # Most recent first - assert docs[1].filename == "oldest.pdf" - - @pytest.mark.asyncio - async def test_i_can_list_empty_documents(self, in_memory_repository): - """Test listing documents from empty collection.""" - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert docs == [] - - @pytest.mark.asyncio - async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker): - """Test handling of PyMongo errors during document listing.""" - # Arrange - mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error")) - - # Act - docs = await in_memory_repository.list_documents() - - # Assert - assert docs == [] - - -class TestFileDocumentRepositoryUpdate: - """Tests for document update functionality.""" - - @pytest.mark.asyncio - async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document, - sample_update_data): - """Test successful document update.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) - - # Assert - assert updated_doc is not None - assert updated_doc.file_type == sample_update_data["file_type"] - assert updated_doc.id == created_doc.id - assert updated_doc.filename == created_doc.filename # Unchanged fields remain - - @pytest.mark.asyncio - async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document): - """Test updating document with partial data.""" - # 
Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - partial_update = {"file_type": FileType("txt")} - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update) - - # Assert - assert updated_doc is not None - assert updated_doc.file_type == FileType("txt") - assert updated_doc.filename == created_doc.filename # Should remain unchanged - assert updated_doc.filepath == created_doc.filepath # Should remain unchanged - - @pytest.mark.asyncio - async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document): - """Test that None values are filtered out from update data.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None} - - # Act - updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none) - - # Assert - assert updated_doc is not None - assert updated_doc.metadata == {"tags": ["updated", "document"]} - assert updated_doc.file_type == created_doc.file_type # Should remain unchanged (None filtered out) - - @pytest.mark.asyncio - async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document): - """Test updating document with empty data returns current document.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - empty_update = {} - - # Act - result = await in_memory_repository.update_document(str(created_doc.id), empty_update) - - # Assert - assert result is not None - assert result.filename == created_doc.filename - assert result.file_hash == created_doc.file_hash - assert result.metadata == created_doc.metadata - - @pytest.mark.asyncio - async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data): - """Test that updating with invalid ID returns None.""" - # Act - result = await in_memory_repository.update_document("invalid_id", sample_update_data) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data): - """Test that updating nonexistent document returns None.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - result = await in_memory_repository.update_document(nonexistent_id, sample_update_data) - - # Assert - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document, - sample_update_data, mocker): - """Test handling of PyMongo errors during document update.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - mocker.patch.object(in_memory_repository.collection, 'find_one_and_update', - side_effect=PyMongoError("Database error")) - - # Act - result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data) - - # Assert - assert result is None - - -class TestFileDocumentRepositoryDeletion: - """Tests for document deletion functionality.""" - - @pytest.mark.asyncio - async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document): - """Test successful document deletion.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - - # Act - deletion_result = await in_memory_repository.delete_document(str(created_doc.id)) - - 
# Assert - assert deletion_result is True - - # Verify document is actually deleted - found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id)) - assert found_doc is None - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository): - """Test that deleting with invalid ID returns False.""" - # Act - result = await in_memory_repository.delete_document("invalid_id") - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository): - """Test that deleting nonexistent document returns False.""" - # Arrange - nonexistent_id = str(ObjectId()) - - # Act - result = await in_memory_repository.delete_document(nonexistent_id) - - # Assert - assert result is False - - @pytest.mark.asyncio - async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker): - """Test handling of PyMongo errors during document deletion.""" - # Arrange - created_doc = await in_memory_repository.create_document(sample_file_document) - mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error")) - - # Act - result = await in_memory_repository.delete_document(str(created_doc.id)) - - # Assert - assert result is False - - -class TestFileDocumentRepositoryUtilities: - """Tests for utility methods.""" - - @pytest.mark.asyncio - async def test_i_can_count_documents(self, in_memory_repository, sample_file_document): - """Test counting documents.""" - # Arrange - initial_count = await in_memory_repository.count_documents() - await in_memory_repository.create_document(sample_file_document) - - # Act - final_count = await in_memory_repository.count_documents() - - # Assert - assert final_count == initial_count + 1 - - @pytest.mark.asyncio - async def test_i_can_count_zero_documents(self, in_memory_repository): - """Test counting documents in empty collection.""" - # Act - count = await in_memory_repository.count_documents() - - # Assert - assert count == 0 diff --git a/tests/test_document_service.py b/tests/test_document_service.py deleted file mode 100644 index 532c2c4..0000000 --- a/tests/test_document_service.py +++ /dev/null @@ -1,697 +0,0 @@ -""" -Unit tests for DocumentService using in-memory MongoDB. - -Tests the orchestration logic with real MongoDB operations -using mongomock for better integration testing. 
-""" - -import pytest -import pytest_asyncio -from unittest.mock import Mock, patch -from datetime import datetime -from bson import ObjectId -from pathlib import Path - -from mongomock_motor import AsyncMongoMockClient - -from app.services.document_service import DocumentService -from app.database.repositories.document_repository import FileDocumentRepository -from app.database.repositories.document_content_repository import DocumentContentRepository -from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod -from app.models.types import PyObjectId - - -@pytest_asyncio.fixture -async def in_memory_file_repository(): - """Create an in-memory FileDocumentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = FileDocumentRepository(db) - await repo.initialize() - return repo - - -@pytest_asyncio.fixture -async def in_memory_content_repository(): - """Create an in-memory DocumentContentRepository for testing.""" - client = AsyncMongoMockClient() - db = client.test_database - repo = DocumentContentRepository(db) - await repo.initialize() - return repo - - -@pytest_asyncio.fixture -async def in_memory_database(): - """Create an in-memory database for testing.""" - client = AsyncMongoMockClient() - return client.test_database - - -@pytest_asyncio.fixture -async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database): - """Create DocumentService with in-memory repositories.""" - with patch('app.services.document_service.get_database', return_value=in_memory_database): - service = DocumentService() - service.file_repository = in_memory_file_repository - service.content_repository = in_memory_content_repository - return service - - -@pytest.fixture -def sample_file_bytes(): - """Sample file content as bytes.""" - return b"This is a test PDF content" - - -@pytest.fixture -def sample_text_bytes(): - """Sample text file content as bytes.""" - return b"This is a test text file content" - - -@pytest.fixture -def sample_file_hash(): - """Expected SHA256 hash for sample file bytes.""" - import hashlib - return hashlib.sha256(b"This is a test PDF content").hexdigest() - - -@pytest.fixture -def sample_file_document(): - """Sample FileDocument for testing.""" - return FileDocument( - id=ObjectId(), - filename="test.pdf", - filepath="/test/test.pdf", - file_type=FileType.PDF, - extraction_method=None, - metadata={}, - detected_at=datetime(2024, 1, 15, 10, 30, 0), - file_hash="test_hash" - ) - - -class TestCreateDocument: - """Tests for create_document method.""" - - @patch('app.services.document_service.magic.from_buffer') - @patch('app.services.document_service.datetime') - @pytest.mark.asyncio - async def test_i_can_create_document_with_new_content( - self, - mock_datetime, - mock_magic, - document_service, - sample_file_bytes - ): - """Test creating document when content doesn't exist yet.""" - # Setup mocks - fixed_time = datetime(2024, 1, 15, 10, 30, 0) - mock_datetime.utcnow.return_value = fixed_time - mock_magic.return_value = "application/pdf" - - # Execute - result = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify document creation - assert result is not None - assert result.filename == "test.pdf" - assert result.filepath == "/test/test.pdf" - assert result.file_type == FileType.PDF - assert result.detected_at == fixed_time - assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) - - # Verify 
content was created - content = await document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content is not None - assert content.file_hash == result.file_hash - assert content.file_size == len(sample_file_bytes) - assert content.mime_type == "application/pdf" - assert content.encoding == "utf-8" - - @patch('app.services.document_service.magic.from_buffer') - @patch('app.services.document_service.datetime') - @pytest.mark.asyncio - async def test_i_can_create_document_with_existing_content( - self, - mock_datetime, - mock_magic, - document_service, - sample_file_bytes - ): - """Test creating document when content already exists (deduplication).""" - # Setup mocks - fixed_time = datetime(2024, 1, 15, 10, 30, 0) - mock_datetime.utcnow.return_value = fixed_time - mock_magic.return_value = "application/pdf" - - # Create first document - first_doc = await document_service.create_document( - "/test/first.pdf", - sample_file_bytes, - "utf-8" - ) - - # Create second document with same content - second_doc = await document_service.create_document( - "/test/second.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify both documents exist but share same hash - assert first_doc.file_hash == second_doc.file_hash - assert first_doc.filename != second_doc.filename - assert first_doc.filepath != second_doc.filepath - - # Verify only one content document exists - all_content = await document_service.content_repository.list_document_content() - content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash] - assert len(content_for_hash) == 1 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_create_document_with_different_encodings( - self, - mock_magic, - document_service, - sample_text_bytes - ): - """Test creating documents with different text encodings.""" - # Setup - mock_magic.return_value = "text/plain" - - # Test with different encodings - encodings = ["utf-8", "latin-1", "ascii"] - - for i, encoding in enumerate(encodings): - result = await document_service.create_document( - f"/test/test{i}.txt", - sample_text_bytes, - encoding - ) - - # Verify document was created - assert result is not None - assert result.file_type == FileType.TXT - - # Verify content has correct encoding - content = await document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content.encoding == encoding - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_unsupported_file_type( - self, - document_service, - sample_file_bytes - ): - """Test that unsupported file types raise ValueError.""" - with pytest.raises(ValueError, match="Unsupported file type"): - await document_service.create_document( - "/test/test.xyz", # Unsupported extension - sample_file_bytes, - "utf-8" - ) - - @pytest.mark.asyncio - async def test_i_cannot_create_document_with_empty_file_path( - self, - document_service, - sample_file_bytes - ): - """Test that empty file path raises ValueError.""" - with pytest.raises(ValueError): - await document_service.create_document( - "", # Empty path - sample_file_bytes, - "utf-8" - ) - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_create_document_with_empty_bytes( - self, - mock_magic, - document_service - ): - """Test behavior with empty file bytes.""" - # Setup - mock_magic.return_value = "text/plain" - - # Execute with empty bytes - result = await 
document_service.create_document( - "/test/empty.txt", - b"", # Empty bytes - "utf-8" - ) - - # Should still work but with zero file size - assert result is not None - content = await document_service.content_repository.find_document_content_by_file_hash( - result.file_hash - ) - assert content.file_size == 0 - - -class TestGetMethods: - """Tests for document retrieval methods.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_id( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by ID.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_by_id(created_doc.id) - - # Verify - assert result is not None - assert result.id == created_doc.id - assert result.filename == created_doc.filename - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_hash( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by file hash.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_by_hash(created_doc.file_hash) - - # Verify - assert result is not None - assert result.file_hash == created_doc.file_hash - assert result.filename == created_doc.filename - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_by_filepath( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document by file path.""" - # Setup - mock_magic.return_value = "application/pdf" - test_path = "/test/unique_test.pdf" - - # Create a document first - created_doc = await document_service.create_document( - test_path, - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_by_filepath(test_path) - - # Verify - assert result is not None - assert result.filepath == test_path - assert result.id == created_doc.id - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_get_document_with_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test retrieving document with associated content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute - result = await document_service.get_document_with_content(created_doc.id) - - # Verify - assert result is not None - document, content = result - assert document.id == created_doc.id - assert content is not None - assert content.file_hash == created_doc.file_hash - - @pytest.mark.asyncio - async def test_i_cannot_get_nonexistent_document_by_id( - self, - document_service - ): - """Test that nonexistent document returns None.""" - # Execute with random ObjectId - result = await document_service.get_document_by_id(ObjectId()) - - # Verify - assert result is None - - @pytest.mark.asyncio - async def test_i_cannot_get_nonexistent_document_by_hash( - self, - document_service - ): - """Test that 
nonexistent document hash returns None.""" - # Execute - result = await document_service.get_document_by_hash("nonexistent_hash") - - # Verify - assert result is None - - -class TestPaginationAndCounting: - """Tests for document listing and counting.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_list_documents_with_pagination( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test document listing with pagination parameters.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create multiple documents - for i in range(5): - await document_service.create_document( - f"/test/test{i}.pdf", - sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique - "utf-8" - ) - - # Execute with pagination - result = await document_service.list_documents(skip=1, limit=2) - - # Verify - assert len(result) == 2 - - # Test counting - total_count = await document_service.count_documents() - assert total_count == 5 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_count_documents( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test document counting.""" - # Setup - mock_magic.return_value = "text/plain" - - # Initially should be 0 - initial_count = await document_service.count_documents() - assert initial_count == 0 - - # Create some documents - for i in range(3): - await document_service.create_document( - f"/test/test{i}.txt", - sample_file_bytes + bytes(str(i), 'utf-8'), - "utf-8" - ) - - # Execute - final_count = await document_service.count_documents() - - # Verify - assert final_count == 3 - - -class TestUpdateAndDelete: - """Tests for document update and deletion operations.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_update_document_metadata( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test updating document metadata.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Execute update - update_data = {"metadata": {"page_count": 5}} - result = await document_service.update_document(created_doc.id, update_data) - - # Verify - assert result is not None - assert result.metadata.get("page_count") == 5 - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_delete_document_and_orphaned_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test deleting document with orphaned content cleanup.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Verify content exists - content_before = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert content_before is not None - - # Execute deletion - result = await document_service.delete_document(created_doc.id) - - # Verify document and content are deleted - assert result is True - - deleted_doc = await document_service.get_document_by_id(created_doc.id) - assert deleted_doc is None - - content_after = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert content_after is None - - 
@patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_delete_document_without_affecting_shared_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test deleting document without removing shared content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create two documents with same content - doc1 = await document_service.create_document( - "/test/test1.pdf", - sample_file_bytes, - "utf-8" - ) - - doc2 = await document_service.create_document( - "/test/test2.pdf", - sample_file_bytes, - "utf-8" - ) - - # They should share the same hash - assert doc1.file_hash == doc2.file_hash - - # Delete first document - result = await document_service.delete_document(doc1.id) - assert result is True - - # Verify first document is deleted but content still exists - deleted_doc = await document_service.get_document_by_id(doc1.id) - assert deleted_doc is None - - remaining_doc = await document_service.get_document_by_id(doc2.id) - assert remaining_doc is not None - - content = await document_service.content_repository.find_document_content_by_file_hash( - doc2.file_hash - ) - assert content is not None - - -class TestUtilityMethods: - """Tests for utility methods.""" - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_check_content_exists( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test checking if content exists by hash.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Initially content doesn't exist - test_hash = "nonexistent_hash" - exists_before = await document_service.content_exists(test_hash) - assert exists_before is False - - # Create a document - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Now content should exist - exists_after = await document_service.content_exists(created_doc.file_hash) - assert exists_after is True - - @patch('app.services.document_service.magic.from_buffer') - @pytest.mark.asyncio - async def test_i_can_update_document_content( - self, - mock_magic, - document_service, - sample_file_bytes - ): - """Test updating extracted document content.""" - # Setup - mock_magic.return_value = "application/pdf" - - # Create a document first - created_doc = await document_service.create_document( - "/test/test.pdf", - sample_file_bytes, - "utf-8" - ) - - # Update content - new_content = "Updated extracted content" - result = await document_service.update_document_content( - created_doc.file_hash, - new_content - ) - - # Verify update - assert result is not None - assert result.content == new_content - - # Verify persistence - updated_content = await document_service.content_repository.find_document_content_by_file_hash( - created_doc.file_hash - ) - assert updated_content.content == new_content - - -class TestHashCalculation: - """Tests for file hash calculation utility.""" - - def test_i_can_calculate_consistent_file_hash(self, document_service): - """Test that file hash calculation is consistent.""" - test_bytes = b"Test content for hashing" - - # Calculate hash multiple times - hash1 = document_service._calculate_file_hash(test_bytes) - hash2 = document_service._calculate_file_hash(test_bytes) - - # Should be identical - assert hash1 == hash2 - assert len(hash1) == 64 # SHA256 produces 64-character hex string - - def test_i_get_different_hashes_for_different_content(self, document_service): - """Test that different content 
produces different hashes.""" - content1 = b"First content" - content2 = b"Second content" - - hash1 = document_service._calculate_file_hash(content1) - hash2 = document_service._calculate_file_hash(content2) - - assert hash1 != hash2 - - -class TestFileTypeDetection: - """Tests for file type detection.""" - - def test_i_can_detect_pdf_file_type(self, document_service): - """Test PDF file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.pdf") - assert file_type == FileType.PDF - - def test_i_can_detect_txt_file_type(self, document_service): - """Test text file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.txt") - assert file_type == FileType.TXT - - def test_i_can_detect_docx_file_type(self, document_service): - """Test DOCX file type detection.""" - file_type = document_service._detect_file_type("/path/to/document.docx") - assert file_type == FileType.DOCX - - def test_i_cannot_detect_unsupported_file_type(self, document_service): - """Test unsupported file type raises ValueError.""" - with pytest.raises(ValueError, match="Unsupported file type"): - document_service._detect_file_type("/path/to/document.xyz") \ No newline at end of file diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_utils_document_matching.py b/tests/utils/test_document_matching.py similarity index 95% rename from tests/test_utils_document_matching.py rename to tests/utils/test_document_matching.py index ea83895..9025502 100644 --- a/tests/test_utils_document_matching.py +++ b/tests/utils/test_document_matching.py @@ -14,6 +14,8 @@ def get_doc(filename: str = None): file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456", file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"), detected_at=datetime.now(), + file_size=1024, + mime_type="application/pdf" ) diff --git a/tests/test_security.py b/tests/utils/test_security.py similarity index 100% rename from tests/test_security.py rename to tests/utils/test_security.py
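
The final hunk above moves tests/test_utils_document_matching.py to tests/utils/test_document_matching.py and adds file_size and mime_type to the get_doc() helper, presumably because FileDocument now requires both fields. Here is a minimal sketch of the helper after the change; everything outside the visible hunk context (the imports, the opening of the FileDocument(...) call, and the filename default) is an assumption, not code taken from the repository:

```python
import os
from datetime import datetime

from app.models.document import FileDocument, FileType


def get_doc(filename: str = None):
    """Build a minimal FileDocument for the matching tests (assumed docstring)."""
    filename = filename or "document.txt"  # assumed default; not visible in the hunk
    return FileDocument(
        filename=filename,
        filepath=f"/path/to/{filename}",  # assumed; not visible in the hunk
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"),
        detected_at=datetime.now(),
        file_size=1024,
        mime_type="application/pdf",
    )
```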
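The deleted repository tests also pin down which uniqueness constraints FileDocumentRepository.initialize() must create: inserting a second document with the same filepath (and a different hash) raises DuplicateKeyError, while the service tests let two documents share one file_hash, so only filepath can be unique on the files collection. A minimal sketch under those assumptions, using motor-style async index creation (the actual initialize() body is not shown in this diff, and initialize_file_indexes is a hypothetical name):

```python
from pymongo import ASCENDING


async def initialize_file_indexes(collection) -> None:
    # Unique filepath: re-inserting the same path raises DuplicateKeyError,
    # which the repository is expected to surface with an "already exists" message.
    await collection.create_index([("filepath", ASCENDING)], unique=True)
    # Non-unique hash index: fast lookups by hash while still allowing two
    # documents at different paths to share one content hash.
    await collection.create_index([("file_hash", ASCENDING)])
```

Indexes like these would also resolve the TODO left in the initialization test ("check that the indexes are created"): the test could assert on the keys returned by `await repo.collection.index_information()`.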
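The deleted TestHashCalculation tests (together with the sample_file_hash fixture, which computes hashlib.sha256(...).hexdigest() directly) fully determine the behavior of DocumentService._calculate_file_hash: deterministic, 64-character hex, content-sensitive. A sketch consistent with them:

```python
import hashlib


def calculate_file_hash(file_bytes: bytes) -> str:
    """Return the SHA-256 hex digest of the raw file bytes (64 hex characters)."""
    return hashlib.sha256(file_bytes).hexdigest()
```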
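Similarly, the deleted TestFileTypeDetection tests constrain DocumentService._detect_file_type to an extension-based mapping that raises ValueError("Unsupported file type ...") for anything outside the FileType enum. A sketch under that assumption (the real method may differ):

```python
import os

from app.models.document import FileType


def detect_file_type(filepath: str) -> FileType:
    """Map a file extension onto FileType, as the deleted tests expect."""
    extension = os.path.splitext(filepath)[1].lstrip(".").lower()
    try:
        return FileType(extension)
    except ValueError:
        # pytest.raises(..., match="Unsupported file type") keys off this text.
        raise ValueError(f"Unsupported file type: {extension}")
```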