Compare commits: master...Implementi (10 commits)

| SHA1 |
|---|
| cdbc93d4aa |
| b0289d865c |
| 42db3daa31 |
| 1f7ef200e7 |
| 48f5b009ae |
| e17c4c7e7b |
| 010ef56f63 |
| 34f7854b3c |
| 98c43feadf |
| 9564cfadd5 |
.gitignore (vendored, 2 changes)

@@ -1,3 +1,5 @@
+volumes
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
Readme.md (358 changes)

@@ -13,7 +13,7 @@ architecture with Redis for task queuing and MongoDB for data persistence.
 - **Backend API**: FastAPI (Python 3.12)
 - **Task Processing**: Celery with Redis broker
 - **Document Processing**: EasyOCR, PyMuPDF, python-docx, pdfplumber
-- **Database**: MongoDB
+- **Database**: MongoDB (pymongo)
 - **Frontend**: React
 - **Containerization**: Docker & Docker Compose
 - **File Monitoring**: Python watchdog library
@@ -95,25 +95,32 @@ MyDocManager/
 │   │   ├── requirements.txt
 │   │   ├── app/
 │   │   │   ├── main.py
-│   │   │   ├── file_watcher.py
-│   │   │   ├── celery_app.py
+│   │   │   ├── file_watcher.py           # FileWatcher class with observer thread
+│   │   │   ├── celery_app.py             # Celery Configuration
 │   │   │   ├── config/
 │   │   │   │   ├── __init__.py
 │   │   │   │   └── settings.py           # JWT, MongoDB config
 │   │   │   ├── models/
 │   │   │   │   ├── __init__.py
 │   │   │   │   ├── user.py               # User Pydantic models
-│   │   │   │   └── auth.py               # Auth Pydantic models
+│   │   │   │   ├── auth.py               # Auth Pydantic models
+│   │   │   │   ├── document.py           # Document Pydantic models
+│   │   │   │   ├── job.py                # Job Processing Pydantic models
+│   │   │   │   └── types.py              # PyObjectId and other useful types
 │   │   │   ├── database/
 │   │   │   │   ├── __init__.py
-│   │   │   │   ├── connection.py         # MongoDB connection
+│   │   │   │   ├── connection.py         # MongoDB connection (pymongo)
 │   │   │   │   └── repositories/
 │   │   │   │       ├── __init__.py
-│   │   │   │       └── user_repository.py      # User CRUD operations
+│   │   │   │       ├── user_repository.py      # User CRUD operations (synchronous)
+│   │   │   │       ├── document_repository.py  # Document CRUD operations (synchronous)
+│   │   │   │       └── job_repository.py       # Job CRUD operations (synchronous)
 │   │   │   ├── services/
 │   │   │   │   ├── __init__.py
-│   │   │   │   ├── auth_service.py       # JWT & password logic
-│   │   │   │   ├── user_service.py       # User business logic
+│   │   │   │   ├── auth_service.py       # JWT & password logic (synchronous)
+│   │   │   │   ├── user_service.py       # User business logic (synchronous)
+│   │   │   │   ├── document_service.py   # Document business logic (synchronous)
+│   │   │   │   ├── job_service.py        # Job processing logic (synchronous)
 │   │   │   │   └── init_service.py       # Admin creation at startup
 │   │   │   ├── api/
 │   │   │   │   ├── __init__.py
@@ -125,7 +132,7 @@ MyDocManager/
 │   │   │   └── utils/
 │   │   │       ├── __init__.py
 │   │   │       ├── security.py           # Password utilities
-│   │   │       └── exceptions.py         # Custom exceptions
+│   │   │       └── document_matching.py  # Fuzzy matching Algorithms
 │   ├── worker/
 │   │   ├── Dockerfile
 │   │   ├── requirements.txt
@@ -133,7 +140,13 @@ MyDocManager/
 │   └── frontend/
 │       ├── Dockerfile
 │       ├── package.json
+│       ├── index.html
 │       └── src/
+│           ├── assets/
+│           ├── App.css
+│           ├── App.jsx
+│           ├── main.css
+│           └── main.jsx
 ├── tests/
 │   ├── file-processor/
 │   │   ├── test_auth/
@@ -224,78 +237,76 @@ On first startup, the application automatically creates a default admin user:
 
 #### Files Collection
 
-Stores file metadata and extracted content:
+Stores file metadata and extracted content using Pydantic models:
 
-```json
-{
-  "_id": "ObjectId",
-  "filename": "document.pdf",
-  "filepath": "/watched_files/document.pdf",
-  "file_type": "pdf",
-  "extraction_method": "direct_text",  // direct_text, ocr, hybrid
-  "metadata": {
-    "page_count": 15,        // for PDFs
-    "word_count": 250,       // for text files
-    "image_dimensions": {    // for images
-      "width": 1920,
-      "height": 1080
-    }
-  },
-  "detected_at": "2024-01-15T10:29:00Z",
-  "file_hash": "sha256_hash_value"
-}
-```
-
-#### Document Contents Collection
-
-Stores actual file content and technical metadata:
-
-```json
-{
-  "_id": "ObjectId",
-  "file_hash": "sha256_hash_value",
-  "content": "extracted text content...",
-  "encoding": "utf-8",
-  "file_size": 2048576,
-  "mime_type": "application/pdf"
-}
+```python
+class FileDocument(BaseModel):
+    """
+    Model for file documents stored in the 'files' collection.
+
+    Represents a file detected in the watched directory with its
+    metadata and extracted content.
+    """
+
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    filename: str = Field(..., description="Original filename")
+    filepath: str = Field(..., description="Full path to the file")
+    file_type: FileType = Field(..., description="Type of the file")
+    extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
+    detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
+    file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
+    encoding: str = Field(default="utf-8", description="Character encoding for text files")
+    file_size: int = Field(..., ge=0, description="File size in bytes")
+    mime_type: str = Field(..., description="MIME type detected")
+
+    @field_validator('filepath')
+    @classmethod
+    def validate_filepath(cls, v: str) -> str:
+        """Validate filepath format."""
+        if not v.strip():
+            raise ValueError("Filepath cannot be empty")
+        return v.strip()
+
+    @field_validator('filename')
+    @classmethod
+    def validate_filename(cls, v: str) -> str:
+        """Validate filename format."""
+        if not v.strip():
+            raise ValueError("Filename cannot be empty")
+        return v.strip()
 ```
 
 #### Processing Jobs Collection
 
 Tracks processing status and lifecycle:
 
-```json
-{
-  "_id": "ObjectId",
-  "file_id": "reference_to_files_collection",
-  "status": "completed",
-  // pending, processing, completed, failed
-  "task_id": "celery_task_uuid",
-  "created_at": "2024-01-15T10:29:00Z",
-  "started_at": "2024-01-15T10:29:30Z",
-  "completed_at": "2024-01-15T10:30:00Z",
-  "error_message": null
-}
+```python
+class ProcessingJob(BaseModel):
+    """
+    Model for processing jobs stored in the 'processing_jobs' collection.
+
+    Tracks the lifecycle and status of document processing tasks.
+    """
+
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    file_id: PyObjectId = Field(..., description="Reference to file document")
+    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
+    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
+    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
+    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
+    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
+    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
+
+    @field_validator('error_message')
+    @classmethod
+    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
+        """Clean up error message."""
+        if v is not None:
+            return v.strip() if v.strip() else None
+        return v
 ```
 
-### Data Storage Strategy
-
-- **Choice**: Three separate collections for files, content, and processing status
-- **Rationale**: Normalization prevents content duplication when multiple files have identical content
-- **Benefits**:
-    - Content deduplication via SHA256 hash
-    - Better query performance for metadata vs content searches
-    - Clear separation of concerns between file metadata, content, and processing lifecycle
-    - Multiple files can reference the same content (e.g., identical copies in different locations)
-
-### Content Storage Location
-
-- **Choice**: Store extracted content in separate `document_contents` collection
-- **Rationale**: Content normalization and deduplication
-- **Benefits**:
-    - Single content storage per unique file hash
-    - Multiple file entries can reference same content
-    - Efficient storage for duplicate files
-
 ### Supported File Types (Initial Implementation)
 
 - **Text Files** (`.txt`): Direct content reading
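
> A quick illustration of how these models round-trip through MongoDB documents. This is a minimal sketch with stand-ins for the project's `FileType` enum and `PyObjectId` (not reproduced here); the point is that `by_alias=True` emits `_id`, which is the document shape the repositories below expect.

```python
from datetime import datetime
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field


class FileDocument(BaseModel):
    # Simplified stand-in: id is a plain str here instead of PyObjectId
    id: Optional[str] = Field(default=None, alias="_id")
    filename: str
    filepath: str
    file_size: int = Field(..., ge=0)
    mime_type: str
    metadata: Dict[str, Any] = Field(default_factory=dict)
    detected_at: Optional[datetime] = None


doc = FileDocument(filename="report.pdf", filepath="/watched_files/report.pdf",
                   file_size=2048, mime_type="application/pdf")
# by_alias=True maps the `id` field to "_id", matching the MongoDB document
print(doc.model_dump(by_alias=True, exclude_unset=True))
```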
@@ -306,7 +317,7 @@ Tracks processing status and lifecycle:
 
 #### Watchdog Implementation
 
-- **Choice**: Dedicated observer thread (Option A)
+- **Choice**: Dedicated observer thread
 - **Rationale**: Standard approach, clean separation of concerns
 - **Implementation**: Watchdog observer runs in separate thread from FastAPI
@@ -327,17 +338,94 @@ Tracks processing status and lifecycle:
 
 #### Content Storage Location
 
-- **Choice**: Store extracted content in `files` collection
-- **Rationale**: Content is intrinsic property of the file
-- **Benefits**: Single query to get file + content, simpler data model
+- **Choice**: Store files in the file system, using the SHA256 hash as filename
+- **Rationale**: MongoDB is not meant for large files, better performance. Files remain in the file system for easy
+  access.
 
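
> A minimal sketch of the hash-as-filename scheme this choice implies. The actual implementation is not part of this diff; the `/objects` path matches the `OBJECTS_FOLDER` default introduced in `settings.py` further below, and everything else is an assumption.

```python
import hashlib
from pathlib import Path

OBJECTS_FOLDER = Path("/objects")  # assumed; the app reads it via settings.get_objects_folder()


def store_file_bytes(file_bytes: bytes) -> str:
    """Write file content under its SHA256 hash; identical content is stored once."""
    file_hash = hashlib.sha256(file_bytes).hexdigest()
    target = OBJECTS_FOLDER / file_hash
    if not target.exists():  # deduplication: skip the write when the object already exists
        target.write_bytes(file_bytes)
    return file_hash
```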
-### Implementation Order
+#### Repository and Services Implementation
+
+- **Choice**: Synchronous implementation using pymongo
+- **Rationale**: Full compatibility with Celery workers and simplified workflow
+- **Implementation**: All repositories and services operate synchronously for seamless integration
+
+### Implementation Status
 
 1. ✅ Pydantic models for MongoDB collections
-2. ✅ Repository layer for data access (files + processing_jobs)
-3. ✅ Celery tasks for document processing
-4. ✅ Watchdog file monitoring implementation
-5. ✅ FastAPI integration and startup coordination
+2. ✅ Repository layer for data access (files + processing_jobs + users + documents) - synchronous
+3. ✅ Service layer for business logic (auth, user, document, job) - synchronous
+4. ✅ Celery tasks for document processing
+5. ✅ Watchdog file monitoring implementation
+6. ✅ FastAPI integration and startup coordination
+
+## Job Management Layer
+
+### Repository Pattern Implementation
+
+The job management system follows the repository pattern for clean separation between data access and business logic.
+
+#### JobRepository
+
+Handles direct MongoDB operations for processing jobs using synchronous pymongo:
+
+**CRUD Operations:**
+- `create_job()` - Create new processing job with automatic `created_at` timestamp
+- `get_job_by_id()` - Retrieve job by ObjectId
+- `update_job_status()` - Update job status with automatic timestamp management
+- `delete_job()` - Remove job from database
+- `get_jobs_by_file_id()` - Get all jobs for specific file
+- `get_jobs_by_status()` - Get jobs filtered by processing status
+
+**Automatic Timestamp Management:**
+- `created_at`: Set automatically during job creation
+- `started_at`: Set automatically when status changes to PROCESSING
+- `completed_at`: Set automatically when status changes to COMPLETED or FAILED
+
+#### JobService
+
+Provides synchronous business logic layer with strict status transition validation:
+
+**Status Transition Methods:**
+- `mark_job_as_started()` - PENDING → PROCESSING
+- `mark_job_as_completed()` - PROCESSING → COMPLETED
+- `mark_job_as_failed()` - PROCESSING → FAILED
+
+**Validation Rules:**
+- Strict status transitions (invalid transitions raise exceptions)
+- Job existence verification before any operation
+- Automatic timestamp management through repository layer
+
+#### Custom Exceptions
+
+**InvalidStatusTransitionError**: Raised for invalid status transitions
+**JobRepositoryError**: Raised for MongoDB operation failures
+
+#### Valid Status Transitions
+
+```
+PENDING    → PROCESSING  (via mark_job_as_started)
+PROCESSING → COMPLETED   (via mark_job_as_completed)
+PROCESSING → FAILED      (via mark_job_as_failed)
+```
+
+All other transitions are forbidden and will raise `InvalidStatusTransitionError`.
+
+### File Structure
+
+```
+src/file-processor/app/
+├── database/repositories/
+│   ├── job_repository.py        # JobRepository class (synchronous)
+│   ├── user_repository.py       # UserRepository class (synchronous)
+│   ├── document_repository.py   # DocumentRepository class (synchronous)
+│   └── file_repository.py       # FileRepository class (synchronous)
+├── services/
+│   ├── job_service.py           # JobService class (synchronous)
+│   ├── auth_service.py          # AuthService class (synchronous)
+│   ├── user_service.py          # UserService class (synchronous)
+│   └── document_service.py      # DocumentService class (synchronous)
+└── exceptions/
+    └── job_exceptions.py        # Custom exceptions
+```
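
> For illustration, a minimal sketch of how the strict transition table above could be enforced. The status names come from the README (the old JSON example listed pending, processing, completed, failed); the table-driven validation shape is an assumption, not the project's actual `JobService` code.

```python
from enum import Enum


class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class InvalidStatusTransitionError(Exception):
    """Raised when a job status change violates the transition table."""


# Each status maps to the set of statuses it may legally move to.
VALID_TRANSITIONS = {
    ProcessingStatus.PENDING: {ProcessingStatus.PROCESSING},
    ProcessingStatus.PROCESSING: {ProcessingStatus.COMPLETED, ProcessingStatus.FAILED},
}


def validate_transition(current: ProcessingStatus, target: ProcessingStatus) -> None:
    """Raise InvalidStatusTransitionError if current -> target is forbidden."""
    if target not in VALID_TRANSITIONS.get(current, set()):
        raise InvalidStatusTransitionError(
            f"{current.value} -> {target.value} is not allowed"
        )
```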
 ### Processing Pipeline Features
 
@@ -346,87 +434,7 @@ Tracks processing status and lifecycle:
 - **Status Tracking**: Real-time processing status via `processing_jobs` collection
 - **Extensible Metadata**: Flexible metadata storage per file type
 - **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches
+- **Synchronous Operations**: All database operations use pymongo for Celery compatibility
 
-## Document Service Architecture
-
-### Service Overview
-
-The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories.
-
-### Service Design
-
-- **Architecture Pattern**: Service orchestration with separate repositories
-- **Transaction Support**: MongoDB ACID transactions for data consistency
-- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash
-- **Error Handling**: MongoDB standard exceptions with transaction rollback
-
-### Document Service (`document_service.py`)
-
-Orchestrates operations between file and content repositories while maintaining data consistency.
-
-#### Core Functionality
-
-##### `create_document(file_path: str, file_bytes: bytes, encoding: str)`
-
-Creates a new document with automatic attribute calculation and content deduplication.
-
-**Automatic Calculations:**
-- `file_hash`: SHA256 hash of file bytes
-- `file_type`: Detection based on file extension
-- `mime_type`: Detection via `python-magic` library
-- `file_size`: Length of provided bytes
-- `detected_at`: Current timestamp
-- `metadata`: Empty dictionary (reserved for future extension)
-
-**Deduplication Logic:**
-1. Calculate SHA256 hash of file content
-2. Check if `DocumentContent` with this hash already exists
-3. If EXISTS: Create only `FileDocument` referencing existing content
-4. If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction
-
-**Transaction Flow:**
-```
-BEGIN TRANSACTION
-    IF content_exists(file_hash):
-        CREATE FileDocument with content reference
-    ELSE:
-        CREATE DocumentContent
-        CREATE FileDocument with content reference
-COMMIT TRANSACTION
-```
-
-#### Available Methods
-
-- `create_document(file_path, file_bytes, encoding)`: Create with deduplication
-- `get_document_by_id(document_id)`: Retrieve by document ID
-- `get_document_by_hash(file_hash)`: Retrieve by file hash
-- `get_document_by_filepath(filepath)`: Retrieve by file path
-- `list_documents(skip, limit)`: Paginated document listing
-- `count_documents()`: Total document count
-- `update_document(document_id, update_data)`: Update document metadata
-- `delete_document(document_id)`: Remove document and orphaned content
-
-### Repository Dependencies
-
-The document service coordinates two existing repositories:
-
-#### File Repository (`file_repository.py`)
-- `create_document()`, `find_document_by_id()`, `find_document_by_hash()`
-- `find_document_by_filepath()`, `find_document_by_name()`
-- `list_documents()`, `count_documents()`
-- `update_document()`, `delete_document()`
-
-#### Document Content Repository (`document_content_repository.py`)
-- `create_document_content()`, `find_document_content_by_id()`
-- `find_document_content_by_file_hash()`, `content_exists()`
-- `update_document_content()`, `delete_document_content()`
-- `list_document_contents()`, `count_document_contents()`
-
-### Dependencies
-
-- `python-magic`: MIME type detection
-- `hashlib`: SHA256 hashing (standard library)
-- `pymongo`: MongoDB transactions support
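
> Although the section above is removed, the transaction-based create flow it described survives in the synchronous code later in this diff (see `get_extra_args` in `connection.py` and the `session` parameter of `create_document`). A minimal pymongo sketch of that flow, with connection string and collection names assumed:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
db = client["mydocmanager"]


def create_with_dedup(file_doc: dict, content_doc: dict) -> None:
    """Insert file metadata, adding content only when its hash is unseen."""
    with client.start_session() as session:
        with session.start_transaction():  # requires replica-set MongoDB
            exists = db.document_contents.count_documents(
                {"file_hash": content_doc["file_hash"]}, session=session) > 0
            if not exists:
                db.document_contents.insert_one(content_doc, session=session)
            db.files.insert_one(file_doc, session=session)
```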
 ## Key Implementation Notes
 
@@ -449,6 +457,7 @@ The document service coordinates two existing repositories:
 - **Package Manager**: pip (standard)
 - **External Dependencies**: Listed in each service's requirements.txt
 - **Standard Library First**: Prefer standard library when possible
+- **Database Driver**: pymongo for synchronous MongoDB operations
 
 ### Testing Strategy
 
@@ -473,6 +482,7 @@ The document service coordinates two existing repositories:
 12. **Content in Files Collection**: Extracted content stored with file metadata
 13. **Direct Task Dispatch**: File watcher directly creates Celery tasks
 14. **SHA256 Duplicate Detection**: Prevents reprocessing identical files
+15. **Synchronous Implementation**: All repositories and services use pymongo for Celery compatibility
 
 ### Development Process Requirements
 
@@ -483,21 +493,15 @@ The document service coordinates two existing repositories:
 
 ### Next Implementation Steps
 
-1. ✅ Create docker-compose.yml with all services => Done
-2. ✅ Define user management and authentication architecture => Done
-3. ✅ Implement user models and authentication services =>
-    1. models/user.py => Done
-    2. models/auth.py => Done
-    3. database/repositories/user_repository.py => Done
-4. ✅ Add automatic admin user creation if it does not exists => Done
-5. **IN PROGRESS**: Implement file processing pipeline =>
-    1. Create Pydantic models for files and processing_jobs collections
-    2. Implement repository layer for file and processing job data access
-    3. Create Celery tasks for document processing (.txt, .pdf, .docx)
-    4. Implement Watchdog file monitoring with dedicated observer
-    5. Integrate file watcher with FastAPI startup
-    6. Create protected API routes for user management
-    7. Build React monitoring interface with authentication
+1. **TODO**: Complete file processing pipeline =>
+    1. ✅ Create Pydantic models for files and processing_jobs collections
+    2. ✅ Implement repository layer for file and processing job data access (synchronous)
+    3. ✅ Implement service layer for business logic (synchronous)
+    4. ✅ Create Celery tasks for document processing (.txt, .pdf, .docx)
+    5. ✅ Implement Watchdog file monitoring with dedicated observer
+    6. ✅ Integrate file watcher with FastAPI startup
+2. Create protected API routes for user management
+3. Build React monitoring interface with authentication
 
 ## Annexes
 
@@ -586,4 +590,4 @@ docker-compose up --scale worker=3
 - **file-processor**: Hot-reload enabled via `--reload` flag
     - Code changes in `src/file-processor/app/` automatically restart FastAPI
 - **worker**: No hot-reload (manual restart required for stability)
     - Code changes in `src/worker/tasks/` require: `docker-compose restart worker`
docker-compose.yml

@@ -19,7 +19,7 @@ services:
       MONGO_INITDB_ROOT_PASSWORD: password123
       MONGO_INITDB_DATABASE: mydocmanager
     volumes:
-      - mongodb-data:/data/db
+      - ./volumes/db:/data/db
     networks:
       - mydocmanager-network
 
@@ -34,10 +34,12 @@ services:
     environment:
       - REDIS_URL=redis://redis:6379/0
      - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
-      - PYTHONPATH=/app
+      - PYTHONPATH=/app:/tasks  # Added /tasks to Python path
    volumes:
       - ./src/file-processor:/app
+      - ./src/worker/tasks:/app/tasks  # <- Added: shared access to worker tasks
       - ./volumes/watched_files:/watched_files
+      - ./volumes/objects:/objects
     depends_on:
       - redis
       - mongodb
 
@@ -56,14 +58,29 @@ services:
       - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
       - PYTHONPATH=/app
     volumes:
-      - ./src/worker/tasks:/app
+      - ./src/worker:/app
+      - ./src/file-processor/app:/app/app  # <- Added: shared access file-processor app
       - ./volumes/watched_files:/watched_files
     depends_on:
       - redis
       - mongodb
     networks:
       - mydocmanager-network
-    command: celery -A main worker --loglevel=info
+    command: celery -A tasks.main worker --loglevel=info
+
+  # Frontend - React application with Vite
+  frontend:
+    build:
+      context: ./src/frontend
+      dockerfile: Dockerfile
+    container_name: mydocmanager-frontend
+    ports:
+      - "5173:5173"
+    volumes:
+      - ./src/frontend:/app
+      - /app/node_modules  # Anonymous volume to prevent node_modules override
+    networks:
+      - mydocmanager-network
 
 volumes:
   mongodb-data:
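
> The `-A main` to `-A tasks.main` change follows from mounting `./src/worker` (rather than `./src/worker/tasks`) at `/app`: the Celery app now lives inside the `tasks` package. A plausible minimal `tasks/main.py` consistent with that command; the app name and options are assumptions, only the module path is implied by the diff.

```python
import os

from celery import Celery

# Discovered by `celery -A tasks.main worker --loglevel=info`
app = Celery(
    "mydocmanager",  # assumed app name
    broker=os.getenv("REDIS_URL", "redis://localhost:6379/0"),
)
```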
requirements.txt

@@ -1,20 +1,30 @@
 amqp==5.3.1
 annotated-types==0.7.0
 anyio==4.10.0
+asgiref==3.9.1
 bcrypt==4.3.0
 billiard==4.2.1
 celery==5.5.3
+certifi==2025.8.3
+cffi==2.0.0
 click==8.2.1
 click-didyoumean==0.3.1
 click-plugins==1.1.1.2
 click-repl==0.3.0
+cryptography==46.0.1
 dnspython==2.8.0
+ecdsa==0.19.1
 email-validator==2.3.0
 fastapi==0.116.1
 h11==0.16.0
+hiredis==3.2.1
+httpcore==1.0.9
 httptools==0.6.4
+httpx==0.28.1
 idna==3.10
+importlib_metadata==8.7.0
 iniconfig==2.1.0
+izulu==0.50.0
 kombu==5.5.4
 mongomock==4.3.0
 mongomock-motor==0.0.36
@@ -23,9 +33,13 @@ packaging==25.0
 pipdeptree==2.28.0
 pluggy==1.6.0
 prompt_toolkit==3.0.52
+pyasn1==0.6.1
+pycparser==2.23
+pycron==3.2.0
 pydantic==2.11.9
 pydantic_core==2.33.2
 Pygments==2.19.2
+PyJWT==2.10.1
 pymongo==4.15.1
 pytest==8.4.2
 pytest-asyncio==1.2.0
@@ -35,6 +49,8 @@ python-dotenv==1.1.1
 python-magic==0.4.27
 pytz==2025.2
 PyYAML==6.0.2
+redis==6.4.0
+rsa==4.9.1
 sentinels==1.1.1
 six==1.17.0
 sniffio==1.3.1
@@ -45,6 +61,8 @@ tzdata==2025.2
 uvicorn==0.35.0
 uvloop==0.21.0
 vine==5.1.0
+watchdog==6.0.0
 watchfiles==1.1.0
 wcwidth==0.2.13
 websockets==15.0.1
+zipp==3.23.0
Dockerfile

@@ -3,6 +3,12 @@ FROM python:3.12-slim
 # Set working directory
 WORKDIR /app
 
+# Install libmagic
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    file \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
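
> The added `libmagic1` package is the native library behind `python-magic` (pinned in requirements.txt), which the README lists for MIME type detection. A quick sanity check that the binding works inside the image; the sample bytes are assumed:

```python
import magic

# python-magic delegates to libmagic; without libmagic1 this call fails at runtime
mime_type = magic.from_buffer(b"%PDF-1.7 sample", mime=True)
print(mime_type)  # expected: application/pdf
```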
src/file-processor/app/api/__init__.py (new file, +0)

src/file-processor/app/api/dependencies.py (new file, +100)
@@ -0,0 +1,100 @@
+import jwt
+from fastapi import Depends, HTTPException
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from jwt import InvalidTokenError
+from starlette import status
+
+from app.config import settings
+from app.database.connection import get_database
+from app.models.auth import UserRole
+from app.models.user import UserInDB
+from app.services.auth_service import AuthService
+from app.services.user_service import UserService
+
+security = HTTPBearer()
+
+
+def get_auth_service() -> AuthService:
+    """Dependency to get AuthService instance."""
+    return AuthService()
+
+
+def get_user_service() -> UserService:
+    """Dependency to get UserService instance."""
+    database = get_database()
+    return UserService(database)
+
+
+def get_current_user(
+    credentials: HTTPAuthorizationCredentials = Depends(security),
+    user_service: UserService = Depends(get_user_service)
+) -> UserInDB:
+    """
+    Dependency to get current authenticated user from JWT token.
+
+    Args:
+        credentials: HTTP Bearer credentials
+        user_service: Auth service instance
+
+    Returns:
+        User: Current authenticated user
+
+    Raises:
+        HTTPException: If token is invalid or user not found
+    """
+    try:
+        payload = jwt.decode(
+            credentials.credentials,
+            settings.get_jwt_secret_key(),
+            algorithms=[settings.get_jwt_algorithm()]
+        )
+        username: str = payload.get("sub")
+        if username is None:
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                detail="Could not validate credentials",
+                headers={"WWW-Authenticate": "Bearer"},
+            )
+    except InvalidTokenError:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Could not validate credentials",
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+
+    user = user_service.get_user_by_username(username)
+    if user is None:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Could not validate credentials",
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+
+    if not user.is_active:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Inactive user"
+        )
+
+    return user
+
+
+def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
+    """
+    Dependency to ensure current user has admin role.
+
+    Args:
+        current_user: Current authenticated user
+
+    Returns:
+        User: Current user if admin
+
+    Raises:
+        HTTPException: If user is not admin
+    """
+    if current_user.role != UserRole.ADMIN:
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Not enough permissions"
+        )
+    return current_user
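
> `get_current_user` only decodes tokens; the encoding side lives in `AuthService.create_access_token`, which is not shown in this diff. A minimal sketch of what it plausibly does with PyJWT, all key/algorithm/expiry values assumed (the real app reads them via `settings.get_jwt_secret_key()` and `settings.get_jwt_algorithm()`):

```python
from datetime import datetime, timedelta, timezone

import jwt

SECRET_KEY = "change-me"  # assumed placeholder
ALGORITHM = "HS256"       # assumed


def create_access_token(data: dict, expires_minutes: int = 30) -> str:
    """Return a signed JWT whose 'sub' claim get_current_user() reads back."""
    payload = data.copy()
    payload["exp"] = datetime.now(timezone.utc) + timedelta(minutes=expires_minutes)
    return jwt.encode(payload, SECRET_KEY, algorithm=ALGORITHM)
```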
src/file-processor/app/api/routes/__init__.py (new file, +0)

src/file-processor/app/api/routes/auth.py (new file, +80)
@@ -0,0 +1,80 @@
+from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi.security import OAuth2PasswordRequestForm
+
+from app.api.dependencies import get_auth_service, get_current_user, get_user_service
+from app.models.auth import LoginResponse, UserResponse
+from app.models.user import UserInDB
+from app.services.auth_service import AuthService
+from app.services.user_service import UserService
+
+router = APIRouter(tags=["authentication"])
+
+
+@router.post("/login", response_model=LoginResponse)
+def login(
+    form_data: OAuth2PasswordRequestForm = Depends(),
+    auth_service: AuthService = Depends(get_auth_service),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Authenticate user and return JWT token.
+
+    Args:
+        form_data: OAuth2 password form data
+        auth_service: Auth service instance
+        user_service: User service instance
+
+    Returns:
+        LoginResponse: JWT token and user info
+
+    Raises:
+        HTTPException: If authentication fails
+    """
+    incorrect_username_or_pwd = HTTPException(
+        status_code=status.HTTP_401_UNAUTHORIZED,
+        detail="Incorrect username or password",
+        headers={"WWW-Authenticate": "Bearer"},
+    )
+
+    user = user_service.get_user_by_username(form_data.username)
+    if (not user or
+            not user.is_active or
+            not auth_service.verify_user_password(form_data.password, user.hashed_password)):
+        raise incorrect_username_or_pwd
+
+    access_token = auth_service.create_access_token(data={"sub": user.username})
+
+    return LoginResponse(
+        access_token=access_token,
+        user=UserResponse(
+            _id=user.id,
+            username=user.username,
+            email=user.email,
+            role=user.role,
+            is_active=user.is_active,
+            created_at=user.created_at,
+            updated_at=user.updated_at
+        )
+    )
+
+
+@router.get("/me", response_model=UserResponse)
+def get_current_user_profile(current_user: UserInDB = Depends(get_current_user)):
+    """
+    Get current user profile.
+
+    Args:
+        current_user: Current authenticated user
+
+    Returns:
+        UserResponse: Current user profile without sensitive data
+    """
+    return UserResponse(
+        _id=current_user.id,
+        username=current_user.username,
+        email=current_user.email,
+        role=current_user.role,
+        is_active=current_user.is_active,
+        created_at=current_user.created_at,
+        updated_at=current_user.updated_at
+    )
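
> A usage sketch for these routes with `httpx` (added to requirements.txt in this diff). The base URL, route prefix, and credentials are assumptions; note that `OAuth2PasswordRequestForm` reads form-encoded fields, so the login request uses `data=`, not `json=`:

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumed; actual host/port depend on docker-compose

resp = httpx.post(f"{BASE_URL}/login",
                  data={"username": "admin", "password": "admin"})
resp.raise_for_status()
token = resp.json()["access_token"]

# Authenticated call through the HTTPBearer dependency
me = httpx.get(f"{BASE_URL}/me", headers={"Authorization": f"Bearer {token}"})
print(me.json())
```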
src/file-processor/app/api/routes/users.py (new file, +172)
@@ -0,0 +1,172 @@
+from fastapi import APIRouter, Depends, HTTPException
+from starlette import status
+
+from app.api.dependencies import get_admin_user, get_user_service
+from app.models.auth import UserResponse, MessageResponse
+from app.models.types import PyObjectId
+from app.models.user import UserInDB, UserCreate, UserUpdate
+from app.services.user_service import UserService
+
+router = APIRouter(tags=["users"])
+
+
+@router.get("", response_model=list[UserInDB])
+def list_users(
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    List all users (admin only).
+
+    Args:
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        List[UserResponse]: List of all users without sensitive data
+    """
+    return user_service.list_users()
+
+
+@router.get("/{user_id}", response_model=UserResponse)
+def get_user_by_id(
+    user_id: PyObjectId,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Get specific user by ID (admin only).
+
+    Args:
+        user_id: User ID to retrieve
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        UserResponse: User information without sensitive data
+
+    Raises:
+        HTTPException: If user not found
+    """
+    user = user_service.get_user_by_id(str(user_id))
+    if not user:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="User not found"
+        )
+
+    return user
+
+
+@router.post("", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
+def create_user(
+    user_data: UserCreate,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Create new user (admin only).
+
+    Args:
+        user_data: User creation data
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        UserResponse: Created user information without sensitive data
+
+    Raises:
+        HTTPException: If user creation fails
+    """
+    try:
+        user = user_service.create_user(user_data)
+        return UserResponse(
+            _id=user.id,
+            username=user.username,
+            email=user.email,
+            role=user.role,
+            is_active=user.is_active,
+            created_at=user.created_at,
+            updated_at=user.updated_at
+        )
+    except ValueError as e:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=str(e)
+        )
+
+
+@router.put("/{user_id}", response_model=UserResponse)
+def update_user(
+    user_id: PyObjectId,
+    user_data: UserUpdate,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Update existing user (admin only).
+
+    Args:
+        user_id: User ID to update
+        user_data: User update data
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        UserResponse: Updated user information without sensitive data
+
+    Raises:
+        HTTPException: If user not found or update fails
+    """
+    try:
+        user = user_service.update_user(str(user_id), user_data)
+        if not user:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="User not found"
+            )
+
+        return UserResponse(
+            _id=user.id,
+            username=user.username,
+            email=user.email,
+            role=user.role,
+            is_active=user.is_active,
+            created_at=user.created_at,
+            updated_at=user.updated_at
+        )
+    except ValueError as e:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=str(e)
+        )
+
+
+@router.delete("/{user_id}", response_model=MessageResponse)
+def delete_user(
+    user_id: PyObjectId,
+    admin_user: UserInDB = Depends(get_admin_user),
+    user_service: UserService = Depends(get_user_service)
+):
+    """
+    Delete user by ID (admin only).
+
+    Args:
+        user_id: User ID to delete
+        admin_user: Current admin user
+        user_service: User service instance
+
+    Returns:
+        MessageResponse: Success message
+
+    Raises:
+        HTTPException: If user not found or deletion fails
+    """
+    success = user_service.delete_user(str(user_id))
+    if not success:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="User not found"
+        )
+
+    return MessageResponse(message="User successfully deleted")
src/file-processor/app/config/settings.py

@@ -6,7 +6,6 @@ using simple os.getenv() approach without external validation libraries.
 """
 
 import os
-from typing import Optional
 
 
 def get_mongodb_url() -> str:
@@ -31,6 +30,26 @@ def get_mongodb_database_name() -> str:
     return os.getenv("MONGODB_DATABASE", "mydocmanager")
 
 
+def get_redis_url() -> str:
+    return os.getenv("REDIS_URL", "redis://localhost:6379/0")
+
+
+# def get_redis_host() -> str:
+#     redis_url = get_redis_url()
+#     if redis_url.startswith("redis://"):
+#         return redis_url.split("redis://")[1].split("/")[0]
+#     else:
+#         return redis_url
+#
+#
+# def get_redis_port() -> int:
+#     redis_url = get_redis_url()
+#     if redis_url.startswith("redis://"):
+#         return int(redis_url.split("redis://")[1].split("/")[0].split(":")[1])
+#     else:
+#         return int(redis_url.split(":")[1])
+
+
 def get_jwt_secret_key() -> str:
     """
     Get JWT secret key from environment variables.
@@ -82,4 +101,19 @@ def is_development_environment() -> bool:
     Returns:
         bool: True if development environment
     """
     return os.getenv("ENVIRONMENT", "development").lower() == "development"
+
+
+def get_objects_folder() -> str:
+    """
+    Get Vault path from environment variables.
+
+    Returns:
+        str: Vault path
+    """
+    return os.getenv("OBJECTS_FOLDER", "/objects")
+
+
+def watch_directory() -> str:
+    """Directory to monitor for new files"""
+    return os.getenv("WATCH_DIRECTORY", "/watched_files")
src/file-processor/app/database/connection.py

@@ -7,6 +7,7 @@ The application will terminate if MongoDB is not accessible at startup.
 
 import sys
 from typing import Optional
+
 from pymongo import MongoClient
 from pymongo.database import Database
 from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
@@ -107,6 +108,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
     return _client
 
 
+def get_extra_args(session):
+    # Build kwargs only if session is provided
+    kwargs = {}
+    if session is not None:
+        kwargs["session"] = session
+
+    return kwargs
+
+
 def test_database_connection() -> bool:
     """
     Test if database connection is working.
@@ -122,4 +132,4 @@ def test_database_connection() -> bool:
         db.command('ping')
         return True
     except Exception:
         return False
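
> `get_extra_args` keeps repository methods session-agnostic: with `session=None` it yields empty kwargs, so the same call works inside and outside a transaction (the repository diff below uses exactly this pattern in `insert_one`). A self-contained usage sketch, connection details assumed:

```python
from pymongo import MongoClient

from app.database.connection import get_extra_args  # as added in this diff

client = MongoClient("mongodb://localhost:27017")  # assumed URL
collection = client["mydocmanager"]["documents"]
doc = {"filename": "example.txt"}

# Outside a transaction: get_extra_args(None) == {}, a plain insert_one
collection.insert_one(dict(doc), **get_extra_args(None))

# Inside a transaction: the session kwarg is forwarded to pymongo
with client.start_session() as session:
    with session.start_transaction():  # requires replica-set MongoDB
        collection.insert_one(dict(doc), **get_extra_args(session))
```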
src/file-processor/app/database/repositories/document_content_repository.py (deleted, -214)

@@ -1,214 +0,0 @@
-from typing import List, Optional
-from datetime import datetime
-from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
-from pymongo.errors import DuplicateKeyError, PyMongoError
-from bson import ObjectId
-
-from app.models.document import DocumentContent
-
-
-class DocumentContentRepository:
-    """
-    Repository class for document content CRUD operations in MongoDB.
-
-    This class handles all database operations related to document content,
-    following the repository pattern with dependency injection and async/await.
-    """
-
-    def __init__(self, database: AsyncIOMotorDatabase):
-        """
-        Initialize repository with database dependency.
-
-        Args:
-            database (AsyncIOMotorDatabase): MongoDB database instance
-        """
-        self.db = database
-        self.collection: AsyncIOMotorCollection = database.document_contents
-        self._ensure_indexes()
-
-    async def initialize(self):
-        """
-        Initialize repository by ensuring required indexes exist.
-
-        Should be called after repository instantiation to setup database indexes.
-        """
-        await self._ensure_indexes()
-
-    async def _ensure_indexes(self):
-        """
-        Ensure required database indexes exist.
-
-        Creates unique index on file_hash field to prevent duplicates.
-        """
-        try:
-            await self.collection.create_index("file_hash", unique=True)
-        except PyMongoError:
-            # Index might already exist, ignore error
-            pass
-
-    async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
-        """
-        Create a new document content in the database.
-
-        Args:
-            document_content (DocumentContent): Document content data
-
-        Returns:
-            DocumentContent: Created document content with database ID
-
-        Raises:
-            DuplicateKeyError: If file_hash already exists
-            ValueError: If document content creation fails due to validation
-        """
-        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)
-
-        # Remove _id if it's None to let MongoDB generate it
-        if document_dict.get("_id") is None:
-            document_dict.pop("_id", None)
-
-        try:
-            result = await self.collection.insert_one(document_dict)
-            document_dict["_id"] = result.inserted_id
-            return DocumentContent(**document_dict)
-        except DuplicateKeyError as e:
-            raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
-        except PyMongoError as e:
-            raise ValueError(f"Failed to create document content: {e}")
-
-    async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
-        """
-        Find document content by ID.
-
-        Args:
-            document_id (str): Document content ID to search for
-
-        Returns:
-            DocumentContent or None: Document content if found, None otherwise
-        """
-        try:
-            if not ObjectId.is_valid(document_id):
-                return None
-
-            document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
-            if document_doc:
-                return DocumentContent(**document_doc)
-            return None
-        except PyMongoError:
-            return None
-
-    async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
-        """
-        Find document content by file hash.
-
-        Args:
-            file_hash (str): File hash to search for
-
-        Returns:
-            DocumentContent or None: Document content if found, None otherwise
-        """
-        try:
-            document_doc = await self.collection.find_one({"file_hash": file_hash})
-            if document_doc:
-                return DocumentContent(**document_doc)
-            return None
-        except PyMongoError:
-            return None
-
-    async def content_exists(self, file_hash: str) -> bool:
-        """
-        Check if document content exists by file hash.
-
-        Args:
-            file_hash (str): File hash to check
-
-        Returns:
-            bool: True if document content exists, False otherwise
-        """
-        try:
-            count = await self.collection.count_documents({"file_hash": file_hash})
-            return count > 0
-        except PyMongoError:
-            return False
-
-    async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
-        """
-        Update document content information.
-
-        Args:
-            document_id (str): Document content ID to update
-            update_data (dict): Updated document content data
-
-        Returns:
-            DocumentContent or None: Updated document content if found, None otherwise
-        """
-        try:
-            if not ObjectId.is_valid(document_id):
-                return None
-
-            # Remove None values and _id from update data
-            clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}
-
-            if not clean_update_data:
-                return await self.find_document_content_by_id(document_id)
-
-            result = await self.collection.find_one_and_update(
-                {"_id": ObjectId(document_id)},
-                {"$set": clean_update_data},
-                return_document=True
-            )
-
-            if result:
-                return DocumentContent(**result)
-            return None
-
-        except PyMongoError:
-            return None
-
-    async def delete_document_content(self, document_id: str) -> bool:
-        """
-        Delete document content from database.
-
-        Args:
-            document_id (str): Document content ID to delete
-
-        Returns:
-            bool: True if document content was deleted, False otherwise
-        """
-        try:
-            if not ObjectId.is_valid(document_id):
-                return False
-
-            result = await self.collection.delete_one({"_id": ObjectId(document_id)})
-            return result.deleted_count > 0
-        except PyMongoError:
-            return False
-
-    async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
-        """
-        List document contents with pagination.
-
-        Args:
-            skip (int): Number of document contents to skip (default: 0)
-            limit (int): Maximum number of document contents to return (default: 100)
-
-        Returns:
-            List[DocumentContent]: List of document contents
-        """
-        try:
-            cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
-            document_docs = await cursor.to_list(length=limit)
-            return [DocumentContent(**document_doc) for document_doc in document_docs]
-        except PyMongoError:
-            return []
-
-    async def count_document_contents(self) -> int:
-        """
-        Count total number of document contents.
-
-        Returns:
-            int: Total number of document contents in database
-        """
-        try:
-            return await self.collection.count_documents({})
-        except PyMongoError:
-            return 0
@@ -6,9 +6,13 @@ in MongoDB with proper error handling and type safety.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
|
|
||||||
from bson import ObjectId
|
from bson import ObjectId
|
||||||
|
from pymongo.collection import Collection
|
||||||
|
from pymongo.database import Database
|
||||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||||
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
|
|
||||||
|
from app.database.connection import get_extra_args
|
||||||
from app.models.document import FileDocument
|
from app.models.document import FileDocument
|
||||||
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
||||||
|
|
||||||
@@ -34,52 +38,49 @@ class FileDocumentRepository:
     with proper error handling and data validation.
     """

-    def __init__(self, database: AsyncIOMotorDatabase):
+    def __init__(self, database: Database):
         """Initialize file repository with database connection."""
         self.db = database
-        self.collection: AsyncIOMotorCollection = self.db.files
-        self._ensure_indexes()
+        self.collection: Collection = self.db.documents

-    async def initialize(self):
+    def initialize(self):
         """
         Initialize repository by ensuring required indexes exist.

         Should be called after repository instantiation to setup database indexes.
         """
-        await self._ensure_indexes()
+        self._ensure_indexes()
+        return self

-    async def _ensure_indexes(self):
+    def _ensure_indexes(self):
         """
         Ensure required database indexes exist.

         Creates unique index on username field to prevent duplicates.
         """
-        try:
-            await self.collection.create_index("filepath", unique=True)
-        except PyMongoError:
-            # Index might already exist, ignore error
-            pass
+        pass

-    async def create_document(self, file_data: FileDocument) -> FileDocument:
+    def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
         """
         Create a new file document in database.

         Args:
             file_data (FileDocument): File document data to create
+            session (AsyncIOMotorClientSession, optional): MongoDB session

         Returns:
-            FileDocument: Created file document with database ID
+            FileDocument: Created document with database ID

         Raises:
             ValueError: If file creation fails due to validation
-            DuplicateKeyError: If file with same hash already exists
+            DuplicateKeyError: If a document with same hash already exists
         """
         try:
             file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
             if "_id" in file_dict and file_dict["_id"] is None:
                 del file_dict["_id"]

-            result = await self.collection.insert_one(file_dict)
+            result = self.collection.insert_one(file_dict, **get_extra_args(session))
             file_data.id = result.inserted_id
             return file_data

@@ -88,7 +89,7 @@ class FileDocumentRepository:
         except PyMongoError as e:
             raise ValueError(f"Failed to create file document: {e}")

-    async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
+    def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
         """
         Find file document by ID.

@@ -102,7 +103,7 @@ class FileDocumentRepository:
             if not ObjectId.is_valid(file_id):
                 return None

-            file_doc = await self.collection.find_one({"_id": ObjectId(file_id)})
+            file_doc = self.collection.find_one({"_id": ObjectId(file_id)})
             if file_doc:
                 return FileDocument(**file_doc)
             return None
@@ -110,7 +111,7 @@ class FileDocumentRepository:
         except PyMongoError:
             return None

-    async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
+    def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
         """
         Find file document by file hash to detect duplicates.

@@ -121,7 +122,7 @@ class FileDocumentRepository:
             FileDocument or None: File document if found, None otherwise
         """
         try:
-            file_doc = await self.collection.find_one({"file_hash": file_hash})
+            file_doc = self.collection.find_one({"file_hash": file_hash})
             if file_doc:
                 return FileDocument(**file_doc)
             return None
@@ -129,7 +130,7 @@ class FileDocumentRepository:
         except PyMongoError:
             return None

-    async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
+    def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
         """
         Find file document by exact filepath.

@@ -140,7 +141,7 @@ class FileDocumentRepository:
             FileDocument or None: File document if found, None otherwise
         """
         try:
-            file_doc = await self.collection.find_one({"filepath": filepath})
+            file_doc = self.collection.find_one({"filepath": filepath})
             if file_doc:
                 return FileDocument(**file_doc)
             return None
@@ -148,7 +149,7 @@ class FileDocumentRepository:
         except PyMongoError:
             return None

-    async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
+    def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
         """
         Find file documents by filename using fuzzy matching.

@@ -162,8 +163,7 @@ class FileDocumentRepository:
         try:
             # Get all files from database
             cursor = self.collection.find({})
-            all_files = await cursor.to_list(length=None)
-            all_documents = [FileDocument(**file_doc) for file_doc in all_files]
+            all_documents = [FileDocument(**file_doc) for file_doc in cursor]

             if isinstance(matching_method, FuzzyMatching):
                 return fuzzy_matching(filename, all_documents, matching_method.threshold)
@@ -173,7 +173,7 @@ class FileDocumentRepository:
         except PyMongoError:
             return []

-    async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
+    def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
         """
         List file documents with pagination.

@@ -186,13 +186,12 @@ class FileDocumentRepository:
         """
         try:
             cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1)
-            file_docs = await cursor.to_list(length=limit)
-            return [FileDocument(**doc) for doc in file_docs]
+            return [FileDocument(**doc) for doc in cursor]

         except PyMongoError:
             return []

-    async def count_documents(self) -> int:
+    def count_documents(self) -> int:
         """
         Count total number of file documents.

@@ -200,17 +199,18 @@ class FileDocumentRepository:
             int: Total number of file documents in collection
         """
         try:
-            return await self.collection.count_documents({})
+            return self.collection.count_documents({})
         except PyMongoError:
             return 0

-    async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
+    def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
         """
         Update file document with new data.

         Args:
             file_id (str): File document ID to update
             update_data (dict): Fields to update
+            session (AsyncIOMotorClientSession, optional): MongoDB session

         Returns:
             FileDocument or None: Updated file document if successful, None otherwise
@@ -223,12 +223,13 @@ class FileDocumentRepository:
             clean_update_data = {k: v for k, v in update_data.items() if v is not None}

             if not clean_update_data:
-                return await self.find_document_by_id(file_id)
+                return self.find_document_by_id(file_id)

-            result = await self.collection.find_one_and_update(
+            result = self.collection.find_one_and_update(
                 {"_id": ObjectId(file_id)},
                 {"$set": clean_update_data},
-                return_document=True
+                return_document=True,
+                **get_extra_args(session)
             )

             if result:
@@ -238,12 +239,13 @@ class FileDocumentRepository:
         except PyMongoError:
             return None

-    async def delete_document(self, file_id: str) -> bool:
+    def delete_document(self, file_id: str, session=None) -> bool:
         """
         Delete file document from database.

         Args:
             file_id (str): File document ID to delete
+            session (AsyncIOMotorClientSession, optional): MongoDB session

         Returns:
             bool: True if file was deleted, False otherwise
@@ -252,7 +254,7 @@ class FileDocumentRepository:
             if not ObjectId.is_valid(file_id):
                 return False

-            result = await self.collection.delete_one({"_id": ObjectId(file_id)})
+            result = self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
             return result.deleted_count > 0

         except PyMongoError:
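
Note: the diff imports get_extra_args from app.database.connection but never shows its body. A minimal sketch of what the helper presumably does — forward an optional pymongo session as a kwargs dict so repository methods stay session-agnostic — assuming exactly that contract:

# Hypothetical sketch of get_extra_args; the real implementation lives in
# app.database.connection and is not part of this diff.
from typing import Any, Dict, Optional

from pymongo.client_session import ClientSession


def get_extra_args(session: Optional[ClientSession] = None) -> Dict[str, Any]:
    """Return extra kwargs for pymongo calls, adding the session only when one is given."""
    return {"session": session} if session is not None else {}

With this shape, collection.insert_one(doc, **get_extra_args(None)) degrades to a plain call, while passing a session threads it through for transactions. Note also that the new docstrings still mention AsyncIOMotorClientSession; after this migration the session would be a pymongo ClientSession.
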
src/file-processor/app/database/repositories/job_repository.py (new file, 230 lines)
@@ -0,0 +1,230 @@
+"""
+Repository for managing processing jobs in MongoDB.
+
+This module provides data access layer for ProcessingJob operations
+with automatic timestamp management and error handling.
+"""
+
+from datetime import datetime
+from typing import List, Optional
+
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.errors import PyMongoError
+
+from app.exceptions.job_exceptions import JobRepositoryError
+from app.models.job import ProcessingJob, ProcessingStatus
+from app.models.types import PyObjectId
+
+
+class JobRepository:
+    """
+    Repository for processing job data access operations.
+
+    Provides CRUD operations for ProcessingJob documents with automatic
+    timestamp management and proper error handling.
+    """
+
+    def __init__(self, database: Database):
+        """Initialize repository with MongoDB collection reference."""
+        self.db = database
+        self.collection: Collection = self.db.processing_jobs
+
+    def _ensure_indexes(self):
+        """
+        Ensure required database indexes exist.
+
+        Creates unique index on document_id field to prevent duplicate jobs.
+        """
+        try:
+            self.collection.create_index("document_id", unique=True)
+        except PyMongoError:
+            # Index might already exist, ignore error
+            pass
+
+    def initialize(self):
+        """
+        Initialize repository by ensuring required indexes exist.
+
+        Should be called after repository instantiation to setup database indexes.
+        """
+        self._ensure_indexes()
+        return self
+
+    def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
+        """
+        Create a new processing job.
+
+        Args:
+            document_id: Reference to the file document
+            task_id: Optional Celery task UUID
+
+        Returns:
+            The created ProcessingJob
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            job_data = {
+                "document_id": document_id,
+                "status": ProcessingStatus.PENDING,
+                "task_id": task_id,
+                "created_at": datetime.now(),
+                "started_at": None,
+                "completed_at": None,
+                "error_message": None
+            }
+
+            result = self.collection.insert_one(job_data)
+            job_data["_id"] = result.inserted_id
+
+            return ProcessingJob(**job_data)
+
+        except PyMongoError as e:
+            raise JobRepositoryError("create_job", e)
+
+    def find_job_by_id(self, job_id: PyObjectId) -> Optional[ProcessingJob]:
+        """
+        Retrieve a job by its ID.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The ProcessingJob document, or None if it doesn't exist
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            job_data = self.collection.find_one({"_id": job_id})
+            if job_data:
+                return ProcessingJob(**job_data)
+
+            return None
+
+        except PyMongoError as e:
+            raise JobRepositoryError("get_job_by_id", e)
+
+    def update_job_status(
+        self,
+        job_id: PyObjectId,
+        status: ProcessingStatus,
+        error_message: Optional[str] = None
+    ) -> Optional[ProcessingJob]:
+        """
+        Update job status with automatic timestamp management.
+
+        Args:
+            job_id: The job ObjectId
+            status: New processing status
+            error_message: Optional error message for failed jobs
+
+        Returns:
+            The updated ProcessingJob, or None if the job doesn't exist
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            # Prepare update data
+            update_data = {"status": status}
+
+            # Set appropriate timestamp based on status
+            current_time = datetime.now()
+            if status == ProcessingStatus.PROCESSING:
+                update_data["started_at"] = current_time
+            elif status in (ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
+                update_data["completed_at"] = current_time
+
+            # Add error message if provided
+            if error_message is not None:
+                update_data["error_message"] = error_message
+
+            result = self.collection.find_one_and_update(
+                {"_id": job_id},
+                {"$set": update_data},
+                return_document=True
+            )
+
+            if result:
+                return ProcessingJob(**result)
+
+            return None
+
+        except PyMongoError as e:
+            raise JobRepositoryError("update_job_status", e)
+
+    def delete_job(self, job_id: PyObjectId) -> bool:
+        """
+        Delete a job from the database.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            True if job was deleted, False if not found
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            result = self.collection.delete_one({"_id": job_id})
+
+            return result.deleted_count > 0
+
+        except PyMongoError as e:
+            raise JobRepositoryError("delete_job", e)
+
+    def find_jobs_by_document_id(self, document_id: PyObjectId) -> List[ProcessingJob]:
+        """
+        Retrieve all jobs for a specific file.
+
+        Args:
+            document_id: The file ObjectId
+
+        Returns:
+            List of ProcessingJob documents
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            cursor = self.collection.find({"document_id": document_id})
+
+            jobs = []
+            for job_data in cursor:
+                jobs.append(ProcessingJob(**job_data))
+
+            return jobs
+
+        except PyMongoError as e:
+            raise JobRepositoryError("get_jobs_by_file_id", e)
+
+    def get_jobs_by_status(self, status: ProcessingStatus) -> List[ProcessingJob]:
+        """
+        Retrieve all jobs with a specific status.
+
+        Args:
+            status: The processing status to filter by
+
+        Returns:
+            List of ProcessingJob documents
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        try:
+            cursor = self.collection.find({"status": status})
+
+            jobs = []
+            for job_data in cursor:
+                jobs.append(ProcessingJob(**job_data))
+
+            return jobs
+
+        except PyMongoError as e:
+            raise JobRepositoryError("get_jobs_by_status", e)
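
A short usage sketch for the new repository (connection URL and database name are illustrative, not from the diff):

# Walk a job through its lifecycle; update_job_status stamps the timestamps.
from bson import ObjectId
from pymongo import MongoClient

from app.database.repositories.job_repository import JobRepository
from app.models.job import ProcessingStatus

db = MongoClient("mongodb://localhost:27017").mydocmanager  # assumed db name
repo = JobRepository(db).initialize()

job = repo.create_job(document_id=ObjectId(), task_id="celery-task-uuid")
repo.update_job_status(job.id, ProcessingStatus.PROCESSING)  # sets started_at
repo.update_job_status(job.id, ProcessingStatus.COMPLETED)   # sets completed_at

One design consequence worth noting: the unique index on document_id allows at most one job per document, so re-dispatching the same document raises DuplicateKeyError (surfacing as JobRepositoryError) unless the earlier job is deleted first.
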
@@ -5,10 +5,12 @@ This module implements the repository pattern for user CRUD operations
 with dependency injection of the database connection using async/await.
 """

-from typing import Optional, List
 from datetime import datetime
+from typing import Optional, List

 from bson import ObjectId
-from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
+from pymongo.collection import Collection
+from pymongo.database import Database
 from pymongo.errors import DuplicateKeyError, PyMongoError

 from app.models.user import UserCreate, UserInDB, UserUpdate
@@ -23,7 +25,7 @@ class UserRepository:
     following the repository pattern with dependency injection and async/await.
     """

-    def __init__(self, database: AsyncIOMotorDatabase):
+    def __init__(self, database: Database):
         """
         Initialize repository with database dependency.

@@ -31,30 +33,30 @@ class UserRepository:
             database (AsyncIOMotorDatabase): MongoDB database instance
         """
         self.db = database
-        self.collection: AsyncIOMotorCollection = database.users
-        self._ensure_indexes()
+        self.collection: Collection = database.users

-    async def initialize(self):
+    def initialize(self):
         """
         Initialize repository by ensuring required indexes exist.

         Should be called after repository instantiation to setup database indexes.
         """
-        await self._ensure_indexes()
+        self._ensure_indexes()
+        return self

-    async def _ensure_indexes(self):
+    def _ensure_indexes(self):
         """
         Ensure required database indexes exist.

         Creates unique index on username field to prevent duplicates.
         """
         try:
-            await self.collection.create_index("username", unique=True)
+            self.collection.create_index("username", unique=True)
         except PyMongoError:
             # Index might already exist, ignore error
             pass

-    async def create_user(self, user_data: UserCreate) -> UserInDB:
+    def create_user(self, user_data: UserCreate) -> UserInDB:
         """
         Create a new user in the database.

@@ -79,7 +81,7 @@ class UserRepository:
         }

         try:
-            result = await self.collection.insert_one(user_dict)
+            result = self.collection.insert_one(user_dict)
             user_dict["_id"] = result.inserted_id
             return UserInDB(**user_dict)
         except DuplicateKeyError as e:
@@ -87,7 +89,7 @@ class UserRepository:
         except PyMongoError as e:
             raise ValueError(f"Failed to create user: {e}")

-    async def find_user_by_username(self, username: str) -> Optional[UserInDB]:
+    def find_user_by_username(self, username: str) -> Optional[UserInDB]:
         """
         Find user by username.

@@ -98,14 +100,14 @@ class UserRepository:
             UserInDB or None: User if found, None otherwise
         """
         try:
-            user_doc = await self.collection.find_one({"username": username})
+            user_doc = self.collection.find_one({"username": username})
             if user_doc:
                 return UserInDB(**user_doc)
             return None
         except PyMongoError:
             return None

-    async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
+    def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
         """
         Find user by ID.

@@ -119,14 +121,14 @@ class UserRepository:
             if not ObjectId.is_valid(user_id):
                 return None

-            user_doc = await self.collection.find_one({"_id": ObjectId(user_id)})
+            user_doc = self.collection.find_one({"_id": ObjectId(user_id)})
             if user_doc:
                 return UserInDB(**user_doc)
             return None
         except PyMongoError:
             return None

-    async def find_user_by_email(self, email: str) -> Optional[UserInDB]:
+    def find_user_by_email(self, email: str) -> Optional[UserInDB]:
         """
         Find user by email address.

@@ -137,14 +139,14 @@ class UserRepository:
             UserInDB or None: User if found, None otherwise
         """
         try:
-            user_doc = await self.collection.find_one({"email": email})
+            user_doc = self.collection.find_one({"email": email})
             if user_doc:
                 return UserInDB(**user_doc)
             return None
         except PyMongoError:
             return None

-    async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
+    def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
         """
         Update user information.

@@ -177,9 +179,9 @@ class UserRepository:
             clean_update_data = {k: v for k, v in update_data.items() if v is not None}

             if not clean_update_data:
-                return await self.find_user_by_id(user_id)
+                return self.find_user_by_id(user_id)

-            result = await self.collection.find_one_and_update(
+            result = self.collection.find_one_and_update(
                 {"_id": ObjectId(user_id)},
                 {"$set": clean_update_data},
                 return_document=True
@@ -192,7 +194,7 @@ class UserRepository:
         except PyMongoError:
             return None

-    async def delete_user(self, user_id: str) -> bool:
+    def delete_user(self, user_id: str) -> bool:
         """
         Delete user from database.

@@ -206,12 +208,12 @@ class UserRepository:
             if not ObjectId.is_valid(user_id):
                 return False

-            result = await self.collection.delete_one({"_id": ObjectId(user_id)})
+            result = self.collection.delete_one({"_id": ObjectId(user_id)})
             return result.deleted_count > 0
         except PyMongoError:
             return False

-    async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
+    def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
         """
         List users with pagination.

@@ -224,12 +226,12 @@ class UserRepository:
         """
         try:
             cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
-            user_docs = await cursor.to_list(length=limit)
+            user_docs = cursor.to_list(length=limit)
             return [UserInDB(**user_doc) for user_doc in user_docs]
         except PyMongoError:
             return []

-    async def count_users(self) -> int:
+    def count_users(self) -> int:
         """
         Count total number of users.

@@ -237,11 +239,11 @@ class UserRepository:
             int: Total number of users in database
         """
         try:
-            return await self.collection.count_documents({})
+            return self.collection.count_documents({})
         except PyMongoError:
             return 0

-    async def user_exists(self, username: str) -> bool:
+    def user_exists(self, username: str) -> bool:
         """
         Check if user exists by username.

@@ -252,7 +254,7 @@ class UserRepository:
             bool: True if user exists, False otherwise
         """
         try:
-            count = await self.collection.count_documents({"username": username})
+            count = self.collection.count_documents({"username": username})
             return count > 0
         except PyMongoError:
             return False
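
One caveat in list_users: cursor.to_list(length=limit) keeps the Motor call shape, but to_list only exists on synchronous pymongo cursors in recent PyMongo releases (4.9+, as far as I can tell). Iterating the cursor is the portable spelling, matching what list_documents does above; a sketch (function name is illustrative):

# Portable pagination that works on any pymongo version (no to_list needed).
from typing import List

from pymongo.collection import Collection


def list_user_docs(collection: Collection, skip: int = 0, limit: int = 100) -> List[dict]:
    cursor = collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
    return list(cursor)  # drain the cursor by iteration

The module docstring's context lines still advertise "async/await"; that wording is now stale after this conversion.
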
src/file-processor/app/exceptions/__init__.py (new file, 0 lines)

src/file-processor/app/exceptions/job_exceptions.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+"""
+Custom exceptions for job management operations.
+
+This module defines specific exceptions for job processing lifecycle
+and repository operations to provide clear error handling.
+"""
+
+from app.models.job import ProcessingStatus
+
+
+class InvalidStatusTransitionError(Exception):
+    """
+    Raised when an invalid status transition is attempted.
+
+    This exception indicates that an attempt was made to change a job's
+    status to an invalid target status given the current status.
+    """
+
+    def __init__(self, current_status: ProcessingStatus, target_status: ProcessingStatus):
+        self.current_status = current_status
+        self.target_status = target_status
+        super().__init__(
+            f"Invalid status transition from '{current_status}' to '{target_status}'"
+        )
+
+
+class JobRepositoryError(Exception):
+    """
+    Raised when a MongoDB operation fails in the job repository.
+
+    This exception wraps database-related errors that occur during
+    job repository operations.
+    """
+
+    def __init__(self, operation: str, original_error: Exception):
+        self.operation = operation
+        self.original_error = original_error
+        super().__init__(f"Repository operation '{operation}' failed: {str(original_error)}")
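
A brief sketch of how callers are presumably meant to consume these exceptions (the wrapper function is illustrative, not part of the diff):

# Illustrative caller: unwrap JobRepositoryError into a log-and-recover path.
import logging

from app.database.repositories.job_repository import JobRepository
from app.exceptions.job_exceptions import JobRepositoryError

logger = logging.getLogger(__name__)


def safe_delete(repo: JobRepository, job_id) -> bool:
    try:
        return repo.delete_job(job_id)
    except JobRepositoryError as exc:
        # exc.operation and exc.original_error identify what failed and why
        logger.error(f"{exc.operation} failed: {exc.original_error}")
        return False

InvalidStatusTransitionError is defined here but not yet raised anywhere in this diff; presumably the service layer will use it to guard the pending → processing → completed/failed lifecycle.
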
src/file-processor/app/file_watcher.py (new file, 243 lines)
@@ -0,0 +1,243 @@
+"""
+File watcher implementation with Watchdog observer and ProcessingJob management.
+
+This module provides real-time file monitoring for document processing.
+When a file is created in the watched directory, it:
+1. Creates a document record via DocumentService
+2. Dispatches a Celery task for processing
+3. Creates a ProcessingJob to track the task lifecycle
+"""
+
+import logging
+import threading
+from pathlib import Path
+from typing import Optional
+
+from watchdog.events import FileSystemEventHandler, FileCreatedEvent
+from watchdog.observers import Observer
+
+from app.services.document_service import DocumentService
+from app.services.job_service import JobService
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentFileEventHandler(FileSystemEventHandler):
+    """
+    Event handler for document file creation events.
+
+    Processes newly created files by creating document records,
+    dispatching Celery tasks, and managing processing jobs.
+    """
+
+    SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
+
+    def __init__(self, document_service: DocumentService, job_service: JobService):
+        """
+        Initialize the event handler.
+
+        Args:
+            document_service: Service for document management
+            job_service: Service for processing job management
+        """
+        super().__init__()
+        self.document_service = document_service
+        self.job_service = job_service
+
+    def on_created(self, event: FileCreatedEvent) -> None:
+        """
+        Handle file creation events.
+
+        Args:
+            event: File system event containing file path information
+        """
+        if event.is_directory:
+            return
+
+        filepath = event.src_path
+        file_extension = Path(filepath).suffix.lower()
+
+        if file_extension not in self.SUPPORTED_EXTENSIONS:
+            logger.info(f"Ignoring unsupported file type: {filepath}")
+            return
+
+        logger.info(f"Processing new file: {filepath}")
+
+        # try:
+        from tasks.document_processing import process_document
+        task_result = process_document.delay(filepath)
+        print(task_result)
+        print("hello world")
+        # task_id = task_result.task_id
+        # logger.info(f"Dispatched Celery task with ID: {task_id}")
+
+        # except Exception as e:
+        #     logger.error(f"Failed to process file {filepath}: {str(e)}")
+        #     # Note: We don't re-raise the exception to keep the watcher running
+
+
+class FileWatcher:
+    """
+    File system watcher for automatic document processing.
+
+    Monitors a directory for new files and triggers processing pipeline
+    using a dedicated observer thread.
+    """
+
+    def __init__(
+        self,
+        watch_directory: str,
+        document_service: DocumentService,
+        job_service: JobService,
+        recursive: bool = True
+    ):
+        """
+        Initialize the file watcher.
+
+        Args:
+            watch_directory: Directory path to monitor
+            document_service: Service for document management
+            job_service: Service for processing job management
+            recursive: Whether to watch subdirectories recursively
+        """
+        self.watch_directory = Path(watch_directory)
+        self.recursive = recursive
+        self.observer: Optional[Observer] = None
+        self._observer_thread: Optional[threading.Thread] = None
+        self._stop_event = threading.Event()
+
+        # Validate watch directory
+        if not self.watch_directory.exists():
+            raise ValueError(f"Watch directory does not exist: {watch_directory}")
+
+        if not self.watch_directory.is_dir():
+            raise ValueError(f"Watch path is not a directory: {watch_directory}")
+
+        # Create event handler
+        self.event_handler = DocumentFileEventHandler(
+            document_service=document_service,
+            job_service=job_service
+        )
+
+        logger.info(f"FileWatcher initialized for directory: {self.watch_directory}")
+
+    def start(self) -> None:
+        """
+        Start the file watcher in a separate thread.
+
+        Raises:
+            RuntimeError: If the watcher is already running
+        """
+        if self.is_running():
+            raise RuntimeError("FileWatcher is already running")
+
+        self.observer = Observer()
+        self.observer.schedule(
+            self.event_handler,
+            str(self.watch_directory),
+            recursive=self.recursive
+        )
+
+        # Start observer in separate thread
+        self._observer_thread = threading.Thread(
+            target=self._run_observer,
+            name="FileWatcher-Observer"
+        )
+        self._stop_event.clear()
+        self._observer_thread.start()
+
+        logger.info("FileWatcher started successfully")
+
+    def stop(self, timeout: float = 5.0) -> None:
+        """
+        Stop the file watcher gracefully.
+
+        Args:
+            timeout: Maximum time to wait for graceful shutdown
+        """
+        if not self.is_running():
+            logger.warning("FileWatcher is not running")
+            return
+
+        logger.info("Stopping FileWatcher...")
+
+        # Signal stop and wait for observer thread
+        self._stop_event.set()
+
+        if self.observer:
+            self.observer.stop()
+
+        if self._observer_thread and self._observer_thread.is_alive():
+            self._observer_thread.join(timeout=timeout)
+
+            if self._observer_thread.is_alive():
+                logger.warning("FileWatcher thread did not stop gracefully within timeout")
+            else:
+                logger.info("FileWatcher stopped gracefully")
+
+        # Clean up
+        self.observer = None
+        self._observer_thread = None
+
+    def is_running(self) -> bool:
+        """
+        Check if the file watcher is currently running.
+
+        Returns:
+            True if the watcher is running, False otherwise
+        """
+        return (
+            self.observer is not None
+            and self._observer_thread is not None
+            and self._observer_thread.is_alive()
+        )
+
+    def _run_observer(self) -> None:
+        """
+        Internal method to run the observer in a separate thread.
+
+        This method should not be called directly.
+        """
+        if not self.observer:
+            logger.error("Observer not initialized")
+            return
+
+        try:
+            self.observer.start()
+            logger.info("Observer thread started")
+
+            # Keep the observer running until stop is requested
+            while not self._stop_event.is_set():
+                self._stop_event.wait(timeout=1.0)
+
+            logger.info("Observer thread stopping...")
+
+        except Exception as e:
+            logger.error(f"Observer thread error: {str(e)}")
+        finally:
+            if self.observer:
+                self.observer.join()
+            logger.info("Observer thread stopped")
+
+
+def create_file_watcher(
+    watch_directory: str,
+    document_service: DocumentService,
+    job_service: JobService
+) -> FileWatcher:
+    """
+    Factory function to create a FileWatcher instance.
+
+    Args:
+        watch_directory: Directory path to monitor
+        document_service: Service for document management
+        job_service: Service for processing job management
+
+    Returns:
+        Configured FileWatcher instance
+    """
+    return FileWatcher(
+        watch_directory=watch_directory,
+        document_service=document_service,
+        job_service=job_service
+    )
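
A minimal standalone sketch of driving the watcher outside FastAPI (paths, URL, and database name are illustrative; the service constructor shapes are taken from the lifespan in main.py below):

# Run the watcher from a plain script; Ctrl-C stops it gracefully.
import time

from pymongo import MongoClient

from app.file_watcher import create_file_watcher
from app.services.document_service import DocumentService
from app.services.job_service import JobService

db = MongoClient("mongodb://localhost:27017").mydocmanager  # assumed db name
document_service = DocumentService(database=db, objects_folder="/data/objects")  # illustrative path
job_service = JobService(database=db)

watcher = create_file_watcher(
    watch_directory="/data/watched",  # must exist, or FileWatcher raises ValueError
    document_service=document_service,
    job_service=job_service,
)
watcher.start()
try:
    while True:
        time.sleep(1)  # observer runs in its own thread; keep the main thread alive
except KeyboardInterrupt:
    watcher.stop(timeout=5.0)

Note that on_created above still contains scaffolding (print(task_result), print("hello world"), and a commented-out try/except); as written, an import or dispatch failure in tasks.document_processing will propagate out of the handler instead of being logged.
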
@@ -1,203 +1,169 @@
 """
-FastAPI application for MyDocManager file processor service.
+FastAPI application with integrated FileWatcher for document processing.

-This service provides API endpoints for health checks and task dispatching.
+This module provides the main FastAPI application with:
+- JWT authentication
+- User management APIs
+- Real-time file monitoring via FileWatcher
+- Document processing via Celery tasks
 """

 import logging
-import os
 from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException, Depends
-from pydantic import BaseModel
-import redis
-from celery import Celery
+from typing import AsyncGenerator

-from app.database.connection import test_database_connection, get_database
-from app.database.repositories.user_repository import UserRepository
-from app.models.user import UserCreate
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from app.api.routes.auth import router as auth_router
+from app.api.routes.users import router as users_router
+from app.config import settings
+from app.database.connection import get_database
+from app.file_watcher import create_file_watcher, FileWatcher
+from app.services.document_service import DocumentService
 from app.services.init_service import InitializationService
+from app.services.job_service import JobService
 from app.services.user_service import UserService

 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+# Global file watcher instance
+file_watcher: FileWatcher = None
+

 @asynccontextmanager
-async def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     """
-    Application lifespan manager for startup and shutdown tasks.
+    FastAPI lifespan context manager.

-    Handles initialization tasks that need to run when the application starts,
-    including admin user creation and other setup procedures.
+    Handles application startup and shutdown events including:
+    - Database connection
+    - Default admin user creation
+    - FileWatcher startup/shutdown
     """
-    # Startup tasks
+    global file_watcher
+
+    # Startup
     logger.info("Starting MyDocManager application...")

     try:
         # Initialize database connection
         database = get_database()
+        logger.info("Database connection established")

-        # Initialize repositories and services
-        user_repository = UserRepository(database)
-        user_service = UserService(user_repository)
+        document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
+        job_service = JobService(database=database)
+        user_service = UserService(database=database)
+        logger.info("Service created")

+        # Create default admin user
         init_service = InitializationService(user_service)
+        init_service.initialize_application()
+        logger.info("Default admin user initialization completed")

-        # Run initialization tasks
-        initialization_result = init_service.initialize_application()
+        # Create and start file watcher
+        file_watcher = create_file_watcher(
+            watch_directory=settings.watch_directory(),
+            document_service=document_service,
+            job_service=job_service
+        )
+        file_watcher.start()
+        logger.info(f"FileWatcher started for directory: {settings.watch_directory()}")

-        if initialization_result["initialization_success"]:
-            logger.info("Application startup completed successfully")
-            if initialization_result["admin_user_created"]:
-                logger.info("Default admin user was created during startup")
-        else:
-            logger.error("Application startup completed with errors:")
-            for error in initialization_result["errors"]:
-                logger.error(f"  - {error}")
+        logger.info("Application startup completed successfully")
+
+        yield

     except Exception as e:
-        logger.error(f"Critical error during application startup: {str(e)}")
-        # You might want to decide if the app should continue or exit here
-        # For now, we log the error but continue
+        logger.error(f"Application startup failed: {str(e)}")
+        raise

-    yield  # Application is running
-
-    # Shutdown tasks (if needed)
-    logger.info("Shutting down MyDocManager application...")
+    finally:
+        # Shutdown
+        logger.info("Shutting down MyDocManager application...")
+        if file_watcher and file_watcher.is_running():
+            file_watcher.stop()
+            logger.info("FileWatcher stopped")
+
+        logger.info("Application shutdown completed")


-# Initialize FastAPI app
+# Create FastAPI application
 app = FastAPI(
-    title="MyDocManager File Processor",
-    description="File processing and task dispatch service",
-    version="1.0.0",
+    title="MyDocManager",
+    description="Real-time document processing application with authentication",
+    version="0.1.0",
     lifespan=lifespan
 )

-# Environment variables
-REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
-MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
-
-# Initialize Redis client
-try:
-    redis_client = redis.from_url(REDIS_URL)
-except Exception as e:
-    redis_client = None
-    print(f"Warning: Could not connect to Redis: {e}")
-
-# Initialize Celery
-celery_app = Celery(
-    "file_processor",
-    broker=REDIS_URL,
-    backend=REDIS_URL
-)
+# Configure CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:5173"],  # React frontend
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

+# Include routers
+app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
+app.include_router(users_router, prefix="/users", tags=["User Management"])
+# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
+# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])

-# Pydantic models
-class TestTaskRequest(BaseModel):
-    """Request model for test task."""
-    message: str
-
-
-def get_user_service() -> UserService:
-    """
-    Dependency to get user service instance.
-
-    This should be properly implemented with database connection management
-    in your actual application.
-    """
-    database = get_database()
-    user_repository = UserRepository(database)
-    return UserService(user_repository)
-
-
-# Your API routes would use the service like this:
-@app.post("/api/users")
-async def create_user(
-    user_data: UserCreate,
-    user_service: UserService = Depends(get_user_service)
-):
-    return user_service.create_user(user_data)
-

 @app.get("/health")
 async def health_check():
     """
     Health check endpoint.

     Returns:
-        dict: Service health status with dependencies
+        Dictionary containing application health status
     """
-    health_status = {
+    return {
         "status": "healthy",
-        "service": "file-processor",
-        "dependencies": {
-            "redis": "unknown",
-            "mongodb": "unknown"
-        },
+        "service": "MyDocManager",
+        "version": "1.0.0",
+        "file_watcher_running": file_watcher.is_running() if file_watcher else False
     }

-    # Check Redis connection
-    if redis_client:
-        try:
-            redis_client.ping()
-            health_status["dependencies"]["redis"] = "connected"
-        except Exception:
-            health_status["dependencies"]["redis"] = "disconnected"
-            health_status["status"] = "degraded"
-
-    # check MongoDB connection
-    if test_database_connection():
-        health_status["dependencies"]["mongodb"] = "connected"
-    else:
-        health_status["dependencies"]["mongodb"] = "disconnected"
-
-    return health_status
-
-
-@app.post("/test-task")
-async def dispatch_test_task(request: TestTaskRequest):
-    """
-    Dispatch a test task to Celery worker.
-
-    Args:
-        request: Test task request containing message
-
-    Returns:
-        dict: Task dispatch information
-
-    Raises:
-        HTTPException: If task dispatch fails
-    """
-    try:
-        # Send task to worker
-        task = celery_app.send_task(
-            "main.test_task",
-            args=[request.message]
-        )
-
-        return {
-            "status": "dispatched",
-            "task_id": task.id,
-            "message": f"Test task dispatched with message: {request.message}"
-        }
-
-    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=f"Failed to dispatch task: {str(e)}"
-        )
-

 @app.get("/")
 async def root():
     """
-    Root endpoint.
+    Root endpoint with basic application information.

     Returns:
-        dict: Basic service information
+        Dictionary containing welcome message and available endpoints
     """
     return {
-        "service": "MyDocManager File Processor",
-        "version": "1.0.0",
-        "status": "running"
+        "message": "Welcome to MyDocManager",
+        "description": "Real-time document processing application",
+        "docs": "/docs",
+        "health": "/health"
     }
+
+
+@app.get("/watcher/status")
+async def watcher_status():
+    """
+    Get file watcher status.
+
+    Returns:
+        Dictionary containing file watcher status information
+    """
+    if not file_watcher:
+        return {
+            "status": "not_initialized",
+            "running": False
+        }
+
+    return {
+        "status": "initialized",
+        "running": file_watcher.is_running(),
+        "watch_directory": str(file_watcher.watch_directory),
+        "recursive": file_watcher.recursive
+    }
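
Once the app is running, the reworked endpoints can be smoke-tested with the standard library (host and port assume the usual uvicorn defaults):

# Quick smoke test for /, /health and /watcher/status; assumes localhost:8000.
import json
import urllib.request

for path in ("/", "/health", "/watcher/status"):
    with urllib.request.urlopen(f"http://localhost:8000{path}") as resp:
        print(path, json.loads(resp.read()))
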
@@ -3,12 +3,45 @@ Authentication models and enums for user management.

 Contains user roles enumeration and authentication-related Pydantic models.
 """
+from datetime import datetime
 from enum import Enum

+from pydantic import BaseModel, Field
+
+from app.models.types import PyObjectId
+
+
 class UserRole(str, Enum):
     """User roles enumeration with string values."""

     USER = "user"
     ADMIN = "admin"
+
+
+class UserResponse(BaseModel):
+    """Model for user data in API responses (excludes password_hash)."""
+
+    id: PyObjectId = Field(alias="_id")
+    username: str
+    email: str
+    role: UserRole
+    is_active: bool
+    created_at: datetime
+    updated_at: datetime
+
+    model_config = {
+        "populate_by_name": True,
+        "arbitrary_types_allowed": True,
+    }
+
+
+class LoginResponse(BaseModel):
+    """Response model for successful login."""
+    access_token: str
+    token_type: str = "bearer"
+    user: UserResponse
+
+
+class MessageResponse(BaseModel):
+    """Generic message response."""
+    message: str
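
A sketch of how the new response models compose at a login endpoint (field values are placeholders; PyObjectId is assumed to validate bson ObjectId values, as its alias="_id" usage suggests):

# Build a LoginResponse as an /auth/login handler presumably would.
from datetime import datetime

from bson import ObjectId

from app.models.auth import LoginResponse, UserResponse, UserRole

user = UserResponse(
    _id=ObjectId(),  # populated from the Mongo document via the "_id" alias
    username="admin",
    email="admin@example.com",
    role=UserRole.ADMIN,
    is_active=True,
    created_at=datetime.now(),
    updated_at=datetime.now(),
)
resp = LoginResponse(access_token="jwt-placeholder", user=user)  # token_type defaults to "bearer"
print(resp.model_dump(by_alias=True))
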
@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
     HYBRID = "hybrid"


-class ProcessingStatus(str, Enum):
-    """Status values for processing jobs."""
-
-    PENDING = "pending"
-    PROCESSING = "processing"
-    COMPLETED = "completed"
-    FAILED = "failed"
-
-
 class FileDocument(BaseModel):
     """
     Model for file documents stored in the 'files' collection.
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
     metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
     detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
     file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
+    encoding: str = Field(default="utf-8", description="Character encoding for text files")
+    file_size: int = Field(..., ge=0, description="File size in bytes")
+    mime_type: str = Field(..., description="MIME type detected")

     @field_validator('filepath')
     @classmethod
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
         if not v.strip():
             raise ValueError("Filename cannot be empty")
         return v.strip()
-
-    class Config:
-        """Pydantic configuration."""
-        populate_by_name = True
-        arbitrary_types_allowed = True
-        json_encoders = {ObjectId: str}
-
-
-class DocumentContent(BaseModel):
-    """Model for document content."""
-
-    id: Optional[PyObjectId] = Field(default=None, alias="_id")
-    file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
-    content: str = Field(..., description="File content")
-    encoding: str = Field(default="utf-8", description="Character encoding for text files")
-    file_size: int = Field(..., ge=0, description="File size in bytes")
-    mime_type: str = Field(..., description="MIME type detected")
-
-
-class ProcessingJob(BaseModel):
-    """
-    Model for processing jobs stored in the 'processing_jobs' collection.
-
-    Tracks the lifecycle and status of document processing tasks.
-    """
-
-    id: Optional[PyObjectId] = Field(default=None, alias="_id")
-    file_id: PyObjectId = Field(..., description="Reference to file document")
-    status: ProcessingStatus = Field(
-        default=ProcessingStatus.PENDING,
-        description="Current processing status"
-    )
-    task_id: Optional[str] = Field(
-        default=None,
-        description="Celery task UUID"
-    )
-    created_at: Optional[datetime] = Field(
-        default=None,
-        description="Timestamp when job was created"
-    )
-    started_at: Optional[datetime] = Field(
-        default=None,
-        description="Timestamp when processing started"
-    )
-    completed_at: Optional[datetime] = Field(
-        default=None,
-        description="Timestamp when processing completed"
-    )
-    error_message: Optional[str] = Field(
-        default=None,
-        description="Error message if processing failed"
-    )
-
-    @field_validator('error_message')
-    @classmethod
-    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
-        """Clean up error message."""
-        if v is not None:
-            return v.strip() if v.strip() else None
-        return v
-
-    class Config:
-        """Pydantic configuration."""
-        populate_by_name = True
-        arbitrary_types_allowed = True
-        json_encoders = {ObjectId: str}
|
|||||||
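The new required fields change how a FileDocument is built. A minimal construction sketch, with two assumptions flagged inline: FileType.TXT as a member name (the enum's members are not shown in this diff) and an illustrative SHA-256 string for the hash.

from datetime import datetime

from app.models.document import FileDocument, FileType

doc = FileDocument(
    filename="report.txt",
    filepath="/watch/report.txt",
    file_type=FileType.TXT,   # assumed member name; not shown in this diff
    extraction_method=None,   # set later by processing workers
    metadata={},
    detected_at=datetime.now(),
    file_hash="e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",  # illustrative only
    encoding="utf-8",         # new field, has a default
    file_size=0,              # new required field
    mime_type="text/plain",   # new required field
)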
src/file-processor/app/models/job.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+from datetime import datetime
+from enum import Enum
+from typing import Optional
+
+from bson import ObjectId
+from pydantic import BaseModel, Field, field_validator
+
+from app.models.types import PyObjectId
+
+
+class ProcessingStatus(str, Enum):
+    """Status values for processing jobs."""
+
+    PENDING = "pending"
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
+class ProcessingJob(BaseModel):
+    """
+    Model for processing jobs stored in the 'processing_jobs' collection.
+
+    Tracks the lifecycle and status of document processing tasks.
+    """
+
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    document_id: PyObjectId = Field(..., description="Reference to file document")
+    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
+    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
+    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
+    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
+    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
+    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
+
+    @field_validator('error_message')
+    @classmethod
+    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
+        """Clean up error message."""
+        if v is not None:
+            return v.strip() if v.strip() else None
+        return v
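A short sketch of the model in use, assuming PyObjectId accepts a raw bson ObjectId (the usual pattern for this kind of wrapper type): the default status is PENDING, and the validator collapses whitespace-only error messages to None.

from bson import ObjectId

from app.models.job import ProcessingJob, ProcessingStatus

job = ProcessingJob(document_id=ObjectId(), error_message="   ")
assert job.status is ProcessingStatus.PENDING  # default status
assert job.error_message is None               # "   " stripped to None by the validator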
@@ -7,10 +7,10 @@ and API responses with proper validation and type safety.
 
 import re
 from datetime import datetime
-from typing import Optional, Any
+from typing import Optional
 
 from bson import ObjectId
 from pydantic import BaseModel, Field, field_validator, EmailStr
-from pydantic_core import core_schema
 
 from app.models.auth import UserRole
 from app.models.types import PyObjectId
@@ -138,21 +138,3 @@ class UserInDB(BaseModel):
         "arbitrary_types_allowed": True,
         "json_encoders": {ObjectId: str}
     }
-
-
-class UserResponse(BaseModel):
-    """Model for user data in API responses (excludes password_hash)."""
-
-    id: PyObjectId = Field(alias="_id")
-    username: str
-    email: str
-    role: UserRole
-    is_active: bool
-    created_at: datetime
-    updated_at: datetime
-
-    model_config = {
-        "populate_by_name": True,
-        "arbitrary_types_allowed": True,
-        "json_encoders": {ObjectId: str}
-    }
@@ -4,7 +4,11 @@ Authentication service for password hashing and verification.
 This module provides authentication-related functionality including
 password hashing, verification, and JWT token management.
 """
+from datetime import datetime, timedelta
 
+import jwt
 
+from app.config import settings
 from app.utils.security import hash_password, verify_password
@@ -55,4 +59,26 @@ class AuthService:
         >>> auth.verify_user_password("wrongpassword", hashed)
         False
         """
         return verify_password(password, hashed_password)
+
+    @staticmethod
+    def create_access_token(data: dict) -> str:
+        """
+        Create a JWT access token.
+
+        Args:
+            data (dict): Payload data to include in the token.
+
+        Returns:
+            str: Encoded JWT token.
+        """
+        # Copy data to avoid modifying the original dict
+        to_encode = data.copy()
+
+        # Add expiration time
+        expire = datetime.now() + timedelta(hours=settings.get_jwt_expire_hours())
+        to_encode.update({"exp": expire})
+
+        # Encode JWT
+        encoded_jwt = jwt.encode(to_encode, settings.get_jwt_secret_key(), algorithm=settings.get_jwt_algorithm())
+        return encoded_jwt
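For the reverse operation, a hedged sketch of verifying a token produced by create_access_token, using the same settings getters the service relies on; jwt.decode() is the standard PyJWT 2.x call and rejects expired tokens based on the "exp" claim.

import jwt

from app.config import settings
from app.services.auth_service import AuthService

token = AuthService.create_access_token({"sub": "alice"})
payload = jwt.decode(
    token,
    settings.get_jwt_secret_key(),
    algorithms=[settings.get_jwt_algorithm()],
)
assert payload["sub"] == "alice"  # jwt.ExpiredSignatureError is raised once "exp" passes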
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
 """
 
 import hashlib
-import magic
+import os
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List, Optional, Dict, Any
 
-from motor.motor_asyncio import AsyncIOMotorClientSession
+import magic
 from pymongo.errors import PyMongoError
 
-from app.database.connection import get_database
+from app.config.settings import get_objects_folder
 from app.database.repositories.document_repository import FileDocumentRepository
-from app.database.repositories.document_content_repository import DocumentContentRepository
 from app.models.document import (
     FileDocument,
-    DocumentContent,
     FileType,
-    ProcessingStatus
 )
 from app.models.types import PyObjectId
@@ -34,13 +31,25 @@ class DocumentService:
     and their content while ensuring data consistency through transactions.
     """
 
-    def __init__(self):
-        """Initialize the document service with repository dependencies."""
-        self.db = get_database()
-        self.file_repository = FileDocumentRepository(self.db)
-        self.content_repository = DocumentContentRepository(self.db)
+    def __init__(self, database, objects_folder: str = None):
+        """
+        Initialize the document service with repository dependencies.
+
+        Args:
+            database: Database instance
+            objects_folder: Folder where file contents are stored, keyed by hash
+        """
+        self.db = database
+        self.document_repository = FileDocumentRepository(self.db)
+        self.objects_folder = objects_folder or get_objects_folder()
 
-    def _calculate_file_hash(self, file_bytes: bytes) -> str:
+    def initialize(self):
+        self.document_repository.initialize()
+        return self
+
+    @staticmethod
+    def _calculate_file_hash(file_bytes: bytes) -> str:
         """
         Calculate SHA256 hash of file content.
 
@@ -52,7 +61,8 @@ class DocumentService:
         """
         return hashlib.sha256(file_bytes).hexdigest()
 
-    def _detect_file_type(self, file_path: str) -> FileType:
+    @staticmethod
+    def _detect_file_type(file_path: str) -> FileType:
         """
         Detect file type from file extension.
 
@@ -72,7 +82,8 @@ class DocumentService:
         except ValueError:
             raise ValueError(f"Unsupported file type: {extension}")
 
-    def _detect_mime_type(self, file_bytes: bytes) -> str:
+    @staticmethod
+    def _detect_mime_type(file_bytes: bytes) -> str:
         """
         Detect MIME type from file content.
 
@@ -84,10 +95,51 @@ class DocumentService:
         """
         return magic.from_buffer(file_bytes, mime=True)
 
-    async def create_document(
+    @staticmethod
+    def _read_file_bytes(file_path: str | Path) -> bytes:
+        """
+        Read file content as bytes.
+
+        Args:
+            file_path (str | Path): Path of the file to read
+
+        Returns:
+            bytes: Content of the file
+
+        Raises:
+            FileNotFoundError: If the file does not exist
+            OSError: If any I/O error occurs
+        """
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        return path.read_bytes()
+
+    def _get_document_path(self, file_hash: str) -> str:
+        """
+        Build the on-disk path for a file's content, keyed by its hash.
+
+        Args:
+            file_hash: SHA256 hash of the file content
+
+        Returns:
+            Path under objects_folder, sharded by the first 24 hash characters
+        """
+        return os.path.join(self.objects_folder, file_hash[:24], file_hash)
+
+    def save_content_if_needed(self, file_hash: str, content: bytes) -> None:
+        """Write the raw bytes to the object store unless they are already there."""
+        target_path = self._get_document_path(file_hash)
+        if os.path.exists(target_path):
+            return
+
+        if not os.path.exists(os.path.dirname(target_path)):
+            os.makedirs(os.path.dirname(target_path))
+
+        with open(target_path, "wb") as f:
+            f.write(content)
+
+    def create_document(
         self,
         file_path: str,
-        file_bytes: bytes,
+        file_bytes: bytes | None = None,
         encoding: str = "utf-8"
     ) -> FileDocument:
         """
@@ -110,57 +162,40 @@ class DocumentService:
             PyMongoError: If database operation fails
         """
         # Calculate automatic attributes
+        file_bytes = file_bytes if file_bytes is not None else self._read_file_bytes(file_path)
         file_hash = self._calculate_file_hash(file_bytes)
         file_type = self._detect_file_type(file_path)
         mime_type = self._detect_mime_type(file_bytes)
         file_size = len(file_bytes)
         filename = Path(file_path).name
-        detected_at = datetime.utcnow()
+        detected_at = datetime.now()
 
-        # Start MongoDB transaction
-        async with await self.db.client.start_session() as session:
-            async with session.start_transaction():
-                try:
-                    # Check if content already exists
-                    existing_content = await self.content_repository.find_document_content_by_file_hash(
-                        file_hash, session=session
-                    )
-
-                    # Create DocumentContent if it doesn't exist
-                    if not existing_content:
-                        content_data = DocumentContent(
-                            file_hash=file_hash,
-                            content="",  # Will be populated by processing workers
-                            encoding=encoding,
-                            file_size=file_size,
-                            mime_type=mime_type
-                        )
-                        await self.content_repository.create_document_content(
-                            content_data, session=session
-                        )
-
-                    # Create FileDocument
-                    file_data = FileDocument(
-                        filename=filename,
-                        filepath=file_path,
-                        file_type=file_type,
-                        extraction_method=None,  # Will be set by processing workers
-                        metadata={},  # Empty for now
-                        detected_at=detected_at,
-                        file_hash=file_hash
-                    )
-
-                    created_file = await self.file_repository.create_document(
-                        file_data, session=session
-                    )
-
-                    return created_file
-
-                except Exception as e:
-                    # Transaction will automatically rollback
-                    raise PyMongoError(f"Failed to create document: {str(e)}")
+        try:
+            self.save_content_if_needed(file_hash, file_bytes)
+
+            # Create FileDocument
+            file_data = FileDocument(
+                filename=filename,
+                filepath=file_path,
+                file_type=file_type,
+                extraction_method=None,  # Will be set by processing workers
+                metadata={},  # Empty for now
+                detected_at=detected_at,
+                file_hash=file_hash,
+                encoding=encoding,
+                file_size=file_size,
+                mime_type=mime_type
+            )
+
+            created_file = self.document_repository.create_document(file_data)
+
+            return created_file
+
+        except Exception as e:
+            # Transaction will automatically rollback if supported
+            raise PyMongoError(f"Failed to create document: {str(e)}")
 
-    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
+    def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
         """
         Retrieve a document by its ID.
 
@@ -170,9 +205,9 @@ class DocumentService:
         Returns:
             FileDocument if found, None otherwise
         """
-        return await self.file_repository.find_document_by_id(document_id)
+        return self.document_repository.find_document_by_id(str(document_id))
 
-    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
+    def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
         """
         Retrieve a document by its file hash.
 
@@ -182,9 +217,9 @@ class DocumentService:
         Returns:
             FileDocument if found, None otherwise
         """
-        return await self.file_repository.find_document_by_hash(file_hash)
+        return self.document_repository.find_document_by_hash(file_hash)
 
-    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
+    def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
         """
         Retrieve a document by its file path.
 
@@ -194,34 +229,17 @@ class DocumentService:
         Returns:
             FileDocument if found, None otherwise
         """
-        return await self.file_repository.find_document_by_filepath(filepath)
+        return self.document_repository.find_document_by_filepath(filepath)
 
-    async def get_document_with_content(
-        self,
-        document_id: PyObjectId
-    ) -> Optional[Tuple[FileDocument, DocumentContent]]:
-        """
-        Retrieve a document with its associated content.
-
-        Args:
-            document_id: Document ObjectId
-
-        Returns:
-            Tuple of (FileDocument, DocumentContent) if found, None otherwise
-        """
-        document = await self.get_document_by_id(document_id)
-        if not document:
+    def get_document_content_by_hash(self, file_hash: str) -> bytes | None:
+        """Return the stored raw bytes for a hash, or None if not on disk."""
+        target_path = self._get_document_path(file_hash)
+        if not os.path.exists(target_path):
             return None
 
-        content = await self.content_repository.find_document_content_by_file_hash(
-            document.file_hash
-        )
-        if not content:
-            return None
-
-        return (document, content)
-
-    async def list_documents(
+        with open(target_path, "rb") as f:
+            return f.read()
+
+    def list_documents(
         self,
         skip: int = 0,
         limit: int = 100
@@ -236,18 +254,18 @@ class DocumentService:
         Returns:
             List of FileDocument instances
         """
-        return await self.file_repository.list_documents(skip=skip, limit=limit)
+        return self.document_repository.list_documents(skip=skip, limit=limit)
 
-    async def count_documents(self) -> int:
+    def count_documents(self) -> int:
         """
         Get total number of documents.
 
         Returns:
             Total document count
         """
-        return await self.file_repository.count_documents()
+        return self.document_repository.count_documents()
 
-    async def update_document(
+    def update_document(
         self,
         document_id: PyObjectId,
         update_data: Dict[str, Any]
@@ -262,9 +280,14 @@ class DocumentService:
         Returns:
             Updated FileDocument if found, None otherwise
         """
-        return await self.file_repository.update_document(document_id, update_data)
+        if "file_bytes" in update_data:
+            file_hash = self._calculate_file_hash(update_data["file_bytes"])
+            update_data["file_hash"] = file_hash
+            self.save_content_if_needed(file_hash, update_data["file_bytes"])
+
+        return self.document_repository.update_document(document_id, update_data)
 
-    async def delete_document(self, document_id: PyObjectId) -> bool:
+    def delete_document(self, document_id: PyObjectId) -> bool:
         """
         Delete a document and its orphaned content.
 
@@ -281,100 +304,31 @@ class DocumentService:
         Raises:
             PyMongoError: If database operation fails
         """
-        # Start MongoDB transaction
-        async with await self.db.client.start_session() as session:
-            async with session.start_transaction():
-                try:
-                    # Get document to find its hash
-                    document = await self.file_repository.find_document_by_id(
-                        document_id, session=session
-                    )
-                    if not document:
-                        return False
-
-                    # Delete the document
-                    deleted = await self.file_repository.delete_document(
-                        document_id, session=session
-                    )
-                    if not deleted:
-                        return False
-
-                    # Check if content is orphaned
-                    remaining_files = await self.file_repository.find_document_by_hash(
-                        document.file_hash, session=session
-                    )
-
-                    # If no other files reference this content, delete it
-                    if not remaining_files:
-                        content = await self.content_repository.find_document_content_by_file_hash(
-                            document.file_hash, session=session
-                        )
-                        if content:
-                            await self.content_repository.delete_document_content(
-                                content.id, session=session
-                            )
-
-                    return True
-
-                except Exception as e:
-                    # Transaction will automatically rollback
-                    raise PyMongoError(f"Failed to delete document: {str(e)}")
-
-    async def content_exists(self, file_hash: str) -> bool:
-        """
-        Check if content with given hash exists.
-
-        Args:
-            file_hash: SHA256 hash of file content
-
-        Returns:
-            True if content exists, False otherwise
-        """
-        return await self.content_repository.content_exists(file_hash)
-
-    async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
-        """
-        Retrieve content by file hash.
-
-        Args:
-            file_hash: SHA256 hash of file content
-
-        Returns:
-            DocumentContent if found, None otherwise
-        """
-        return await self.content_repository.find_document_content_by_file_hash(file_hash)
-
-    async def update_document_content(
-        self,
-        file_hash: str,
-        content: str,
-        encoding: str = "utf-8"
-    ) -> Optional[DocumentContent]:
-        """
-        Update the extracted content for a document.
-
-        This method is typically called by processing workers to store
-        the extracted text content.
-
-        Args:
-            file_hash: SHA256 hash of file content
-            content: Extracted text content
-            encoding: Character encoding
-
-        Returns:
-            Updated DocumentContent if found, None otherwise
-        """
-        existing_content = await self.content_repository.find_document_content_by_file_hash(
-            file_hash
-        )
-        if not existing_content:
-            return None
-
-        update_data = {
-            "content": content,
-            "encoding": encoding
-        }
-
-        return await self.content_repository.update_document_content(
-            existing_content.id, update_data
-        )
+        # Start transaction
+        try:
+            # Get document to find its hash
+            document = self.document_repository.find_document_by_id(document_id)
+            if not document:
+                return False
+
+            # Delete the document
+            deleted = self.document_repository.delete_document(document_id)
+            if not deleted:
+                return False
+
+            # Check if content is orphaned
+            remaining_files = self.document_repository.find_document_by_hash(document.file_hash)
+
+            # If no other files reference this content, delete it
+            if not remaining_files:
+                try:
+                    os.remove(self._get_document_path(document.file_hash))
+                except Exception:
+                    pass
+
+            return True
+
+        except Exception as e:
+            # Transaction will automatically rollback if supported
+            raise PyMongoError(f"Failed to delete document: {str(e)}")
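Taken together, the service now persists raw bytes on disk under objects_folder/<hash[:24]>/<hash> and keeps only metadata in MongoDB. A minimal end-to-end sketch, assuming get_database() returns a pymongo Database, /tmp/objects is writable, and a file exists at /watch/report.txt:

from app.database.connection import get_database
from app.services.document_service import DocumentService

service = DocumentService(database=get_database(), objects_folder="/tmp/objects").initialize()

created = service.create_document("/watch/report.txt")         # reads the bytes itself now
raw = service.get_document_content_by_hash(created.file_hash)  # round-trip from the object store
print(created.file_hash[:12], len(raw))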
@@ -8,8 +8,8 @@ creating default admin user if none exists.
 import logging
 from typing import Optional
 
-from app.models.user import UserCreate, UserInDB, UserCreateNoValidation
 from app.models.auth import UserRole
+from app.models.user import UserInDB, UserCreateNoValidation
 from app.services.user_service import UserService
 
 logger = logging.getLogger(__name__)
@@ -31,7 +31,6 @@ class InitializationService:
             user_service (UserService): Service for user operations
         """
         self.user_service = user_service
 
-
     def ensure_admin_user_exists(self) -> Optional[UserInDB]:
         """
@@ -131,4 +130,23 @@ class InitializationService:
                 logger.error(error_msg)
                 initialization_summary["errors"].append(error_msg)
 
+        self.log_initialization_result(initialization_summary)
+
         return initialization_summary
+
+    @staticmethod
+    def log_initialization_result(summary: dict) -> None:
+        """
+        Log the result of the initialization process.
+
+        Args:
+            summary (dict): Summary of initialization tasks performed
+        """
+        if summary["initialization_success"]:
+            logger.info("Application startup completed successfully")
+            if summary["admin_user_created"]:
+                logger.info("Default admin user was created during startup")
+        else:
+            logger.error("Application startup completed with errors:")
+            for error in summary["errors"]:
+                logger.error(f"  - {error}")
src/file-processor/app/services/job_service.py (new file, 182 lines)
@@ -0,0 +1,182 @@
+"""
+Service layer for job processing business logic.
+
+This module provides high-level operations for managing processing jobs
+with strict status transition validation and business rules enforcement.
+"""
+
+from typing import Optional
+
+from app.database.repositories.job_repository import JobRepository
+from app.exceptions.job_exceptions import InvalidStatusTransitionError
+from app.models.job import ProcessingJob, ProcessingStatus
+from app.models.types import PyObjectId
+
+
+class JobService:
+    """
+    Service for processing job business logic operations.
+
+    Provides high-level job management with strict status transition
+    validation and business rule enforcement.
+    """
+
+    def __init__(self, database):
+        """
+        Initialize service with job repository.
+
+        Args:
+            database: Database instance used to build the JobRepository
+        """
+        self.db = database
+        self.repository = JobRepository(database)
+
+    def initialize(self):
+        self.repository.initialize()
+        return self
+
+    def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
+        """
+        Create a new processing job.
+
+        Args:
+            document_id: Reference to the file document
+            task_id: Optional Celery task UUID
+
+        Returns:
+            The created ProcessingJob
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.create_job(document_id, task_id)
+
+    def get_job_by_id(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Retrieve a job by its ID.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The ProcessingJob document
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.find_job_by_id(job_id)
+
+    def mark_job_as_started(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Mark a job as started (PENDING → PROCESSING).
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PENDING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PENDING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.PROCESSING)
+
+        # Update status
+        return self.repository.update_job_status(job_id, ProcessingStatus.PROCESSING)
+
+    def mark_job_as_completed(self, job_id: PyObjectId) -> ProcessingJob:
+        """
+        Mark a job as completed (PROCESSING → COMPLETED).
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PROCESSING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PROCESSING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
+
+        # Update status
+        return self.repository.update_job_status(job_id, ProcessingStatus.COMPLETED)
+
+    def mark_job_as_failed(
+        self,
+        job_id: PyObjectId,
+        error_message: Optional[str] = None
+    ) -> ProcessingJob:
+        """
+        Mark a job as failed (PROCESSING → FAILED).
+
+        Args:
+            job_id: The job ObjectId
+            error_message: Optional error description
+
+        Returns:
+            The updated ProcessingJob
+
+        Raises:
+            JobNotFoundError: If job doesn't exist
+            InvalidStatusTransitionError: If job is not in PROCESSING status
+            JobRepositoryError: If database operation fails
+        """
+        # Get current job to validate transition
+        current_job = self.repository.find_job_by_id(job_id)
+
+        # Validate status transition
+        if current_job.status != ProcessingStatus.PROCESSING:
+            raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
+
+        # Update status with error message
+        return self.repository.update_job_status(
+            job_id,
+            ProcessingStatus.FAILED,
+            error_message
+        )
+
+    def delete_job(self, job_id: PyObjectId) -> bool:
+        """
+        Delete a job from the database.
+
+        Args:
+            job_id: The job ObjectId
+
+        Returns:
+            True if job was deleted, False if not found
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.delete_job(job_id)
+
+    def get_jobs_by_status(self, status: ProcessingStatus) -> list[ProcessingJob]:
+        """
+        Retrieve all jobs with a specific status.
+
+        Args:
+            status: The processing status to filter by
+
+        Returns:
+            List of ProcessingJob documents
+
+        Raises:
+            JobRepositoryError: If database operation fails
+        """
+        return self.repository.get_jobs_by_status(status)
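The transition rules are strict: only PENDING → PROCESSING and PROCESSING → COMPLETED/FAILED are allowed. A sketch of the enforced lifecycle, assuming get_database() returns a pymongo Database as elsewhere in this changeset:

from bson import ObjectId

from app.database.connection import get_database
from app.exceptions.job_exceptions import InvalidStatusTransitionError
from app.services.job_service import JobService

service = JobService(database=get_database()).initialize()
job = service.create_job(document_id=ObjectId())  # created as PENDING

service.mark_job_as_started(job.id)               # PENDING -> PROCESSING
try:
    service.mark_job_as_started(job.id)           # repeating the transition is rejected
except InvalidStatusTransitionError:
    pass

service.mark_job_as_completed(job.id)             # PROCESSING -> COMPLETED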
@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
 """
 
 from typing import Optional, List
 
 from pymongo.errors import DuplicateKeyError
 
-from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
-from app.models.auth import UserRole
 from app.database.repositories.user_repository import UserRepository
+from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
 from app.services.auth_service import AuthService
@@ -22,16 +22,21 @@ class UserService:
     authentication, and data management with proper validation.
     """
 
-    def __init__(self, user_repository: UserRepository):
+    def __init__(self, database):
         """
         Initialize user service with repository dependency.
 
         Args:
-            user_repository (UserRepository): Repository for user data operations
+            database: Database instance used to build the UserRepository
         """
-        self.user_repository = user_repository
+        self.db = database
+        self.user_repository = UserRepository(self.db)
         self.auth_service = AuthService()
 
+    def initialize(self):
+        self.user_repository.initialize()
+        return self
+
     def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
         """
         Create a new user with business logic validation.
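The same construct-then-initialize pattern now applies across the services, so wiring stays a one-liner. A small sketch (get_database() assumed to return a pymongo Database; get_user_by_username is the method the auth tests below mock):

from app.database.connection import get_database
from app.services.user_service import UserService

user_service = UserService(get_database()).initialize()
admin = user_service.get_user_by_username("admin")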
@@ -1,11 +1,14 @@
+asgiref==3.9.1
 bcrypt==4.3.0
 celery==5.5.3
 email-validator==2.3.0
 fastapi==0.116.1
 httptools==0.6.4
 motor==3.7.1
-pymongo==4.15.0
 pydantic==2.11.9
+PyJWT==2.10.1
+pymongo==4.15.0
 redis==6.4.0
 uvicorn==0.35.0
 python-magic==0.4.27
+watchdog==6.0.0
src/frontend/.dockerignore (new file, 41 lines)
@@ -0,0 +1,41 @@
+# Dependencies
+node_modules
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# Build outputs
+dist
+build
+
+# Environment files
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+# IDE files
+.vscode
+.idea
+*.swp
+*.swo
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Git
+.git
+.gitignore
+
+# Docker
+Dockerfile
+.dockerignore
+
+# Logs
+*.log
src/frontend/Dockerfile (new file, 20 lines)
@@ -0,0 +1,20 @@
+# Use Node.js 20 Alpine for lightweight container
+FROM node:20-alpine
+
+# Set working directory
+WORKDIR /app
+
+# Copy package.json and package-lock.json (if available)
+COPY package*.json ./
+
+# Install dependencies
+RUN npm install
+
+# Copy source code
+COPY . .
+
+# Expose Vite default port
+EXPOSE 5173
+
+# Start development server with host 0.0.0.0 to accept external connections
+CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"]
@@ -3,12 +3,18 @@ FROM python:3.12-slim
 # Set working directory
 WORKDIR /app
 
+# Install libmagic
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    file \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
-COPY tasks/ .
+COPY . .
 
 # Command will be overridden by docker-compose
 CMD ["celery", "-A", "main", "worker", "--loglevel=info"]
@@ -1,4 +1,13 @@
+asgiref==3.9.1
+bcrypt==4.3.0
 celery==5.5.3
+email-validator==2.3.0
+fastapi==0.116.1
+httptools==0.6.4
+motor==3.7.1
+pymongo==4.15.0
+pydantic==2.11.9
 redis==6.4.0
-pymongo==4.15.0
+uvicorn==0.35.0
+python-magic==0.4.27
+watchdog==6.0.0
src/worker/tasks/document_processing.py (new file, 85 lines)
@@ -0,0 +1,85 @@
+"""
+Celery tasks for document processing with ProcessingJob status management.
+
+This module contains Celery tasks that handle document content extraction
+and update processing job statuses throughout the task lifecycle.
+"""
+
+import logging
+from typing import Any, Dict
+
+from app.config import settings
+from app.database.connection import get_database
+from app.services.document_service import DocumentService
+from tasks.main import celery_app
+
+logger = logging.getLogger(__name__)
+
+
+@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
+def process_document(self, filepath: str) -> Dict[str, Any]:
+    """
+    Process a document file and extract its content.
+
+    This task:
+    1. Updates the processing job status to PROCESSING
+    2. Performs document content extraction
+    3. Updates job status to COMPLETED or FAILED based on result
+
+    Args:
+        self: Celery task instance
+        filepath: Full path to the document file to process
+
+    Returns:
+        Dictionary containing processing results
+
+    Raises:
+        Exception: Any processing error (will trigger retry)
+    """
+    task_id = self.request.id
+    logger.info(f"Starting document processing task {task_id} for file: {filepath}")
+
+    database = get_database()
+    document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
+    # Imported locally, presumably to avoid a circular import at module load
+    from app.services.job_service import JobService
+    job_service = JobService(database=database)
+
+    job = None
+    try:
+        # Step 1: Insert the document in DB
+        document = document_service.create_document(filepath)
+        logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
+
+        # Step 2: Create a new job record for the document
+        job = job_service.create_job(task_id=task_id, document_id=document.id)
+
+        # Step 3: Mark job as started
+        job_service.mark_job_as_started(job_id=job.id)
+        logger.info(f"Job {task_id} marked as PROCESSING")
+
+        # Step 4: Mark job as completed
+        job_service.mark_job_as_completed(job_id=job.id)
+        logger.info(f"Job {task_id} marked as COMPLETED")
+
+        return {
+            "task_id": task_id,
+            "filepath": filepath,
+            "status": "completed",
+        }
+
+    except Exception as e:
+        error_message = f"Document processing failed: {str(e)}"
+        logger.error(f"Task {task_id} failed: {error_message}")
+
+        try:
+            # Mark job as failed
+            if job is not None:
+                job_service.mark_job_as_failed(job_id=job.id, error_message=error_message)
+                logger.info(f"Job {task_id} marked as FAILED")
+            else:
+                logger.error(f"Failed to process {filepath}. error = {str(e)}")
+        except Exception as job_error:
+            logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
+
+        # Re-raise the exception to trigger Celery retry mechanism
+        raise
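A hedged sketch of how the producer side might hand files to this task; the callback name is hypothetical, but .delay() is the standard Celery enqueue call:

from tasks.document_processing import process_document

def on_file_detected(filepath: str) -> None:
    """Hypothetical watchdog callback: push the detected file onto the worker queue."""
    result = process_document.delay(filepath)  # returns an AsyncResult immediately
    print(f"queued task {result.id} for {filepath}")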
@@ -3,9 +3,8 @@ Celery worker for MyDocManager document processing tasks.
 
 This module contains all Celery tasks for processing documents.
 """
 
 import os
-import time
 from celery import Celery
 
 # Environment variables
@@ -13,101 +12,25 @@ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
 MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
 
 # Initialize Celery app
-app = Celery(
+celery_app = Celery(
     "mydocmanager_worker",
     broker=REDIS_URL,
-    backend=REDIS_URL
+    backend=REDIS_URL,
 )
 
+celery_app.autodiscover_tasks(["tasks.document_processing"])
+
 # Celery configuration
-app.conf.update(
+celery_app.conf.update(
     task_serializer="json",
     accept_content=["json"],
     result_serializer="json",
     timezone="UTC",
     enable_utc=True,
     task_track_started=True,
     task_time_limit=300,  # 5 minutes
    task_soft_time_limit=240,  # 4 minutes
 )
 
-
-@app.task(bind=True)
-def test_task(self, message: str):
-    """
-    Test task for validating worker functionality.
-
-    Args:
-        message: Test message to process
-
-    Returns:
-        dict: Task result with processing information
-    """
-    try:
-        print(f"[WORKER] Starting test task with message: {message}")
-
-        # Simulate some work
-        for i in range(5):
-            print(f"[WORKER] Processing step {i + 1}/5...")
-            time.sleep(1)
-
-            # Update task progress
-            self.update_state(
-                state="PROGRESS",
-                meta={
-                    "current": i + 1,
-                    "total": 5,
-                    "message": f"Processing step {i + 1}"
-                }
-            )
-
-        result = {
-            "status": "completed",
-            "message": f"Successfully processed: {message}",
-            "processed_at": time.time(),
-            "worker_id": self.request.id
-        }
-
-        print(f"[WORKER] Test task completed successfully: {result}")
-        return result
-
-    except Exception as exc:
-        print(f"[WORKER] Test task failed: {str(exc)}")
-        raise self.retry(exc=exc, countdown=60, max_retries=3)
-
-
-@app.task(bind=True)
-def process_document_task(self, file_path: str):
-    """
-    Placeholder task for document processing.
-
-    Args:
-        file_path: Path to the document to process
-
-    Returns:
-        dict: Processing result
-    """
-    try:
-        print(f"[WORKER] Starting document processing for: {file_path}")
-
-        # Placeholder for document processing logic
-        time.sleep(2)  # Simulate processing time
-
-        result = {
-            "status": "completed",
-            "file_path": file_path,
-            "processed_at": time.time(),
-            "content": f"Placeholder content for {file_path}",
-            "worker_id": self.request.id
-        }
-
-        print(f"[WORKER] Document processing completed: {file_path}")
-        return result
-
-    except Exception as exc:
-        print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}")
-        raise self.retry(exc=exc, countdown=60, max_retries=3)
-
-
 if __name__ == "__main__":
-    app.start()
+    celery_app.start()
tests/api/__init__.py (new file, empty)
tests/api/test_auth_routes.py (new file, 149 lines)
@@ -0,0 +1,149 @@
+from datetime import datetime
+from unittest.mock import MagicMock
+
+import pytest
+from fastapi import status, HTTPException
+from fastapi.testclient import TestClient
+from mongomock.mongo_client import MongoClient
+
+from app.api.dependencies import get_auth_service, get_user_service, get_current_user
+from app.main import app  # Assuming you have FastAPI app defined in app/main.py
+from app.models.auth import UserRole
+from app.models.types import PyObjectId
+from app.models.user import UserInDB
+from app.services.auth_service import AuthService
+from app.services.user_service import UserService
+
+
+@pytest.fixture
+def client():
+    return TestClient(app)
+
+
+@pytest.fixture
+def fake_user():
+    return UserInDB(
+        _id=PyObjectId(),
+        username="testuser",
+        email="test@example.com",
+        role=UserRole.USER,
+        is_active=True,
+        hashed_password="hashed-secret",
+        created_at=datetime(2025, 1, 1),
+        updated_at=datetime(2025, 1, 2),
+    )
+
+
+def override_auth_service():
+    mock = MagicMock(spec=AuthService)
+    mock.verify_user_password.return_value = True
+    mock.create_access_token.return_value = "fake-jwt-token"
+    return mock
+
+
+def override_user_service(fake_user):
+    mock = MagicMock(spec=UserService)
+    mock.get_user_by_username.return_value = fake_user
+    return mock
+
+
+def override_get_current_user(fake_user):
+    def _override():
+        return fake_user
+
+    return _override
+
+
+def override_get_database():
+    def _override():
+        client = MongoClient()
+        db = client.test_database
+        return db
+
+    return _override
+
+
+# ---------------------- TESTS FOR /auth/login ----------------------
+class TestLogin:
+    def test_i_can_login_with_valid_credentials(self, client, fake_user):
+        auth_service = override_auth_service()
+        user_service = override_user_service(fake_user)
+
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert "access_token" in data
+        assert data["user"]["username"] == "testuser"
+
+    def test_i_cannot_login_with_invalid_username(self, client):
+        auth_service = override_auth_service()
+        user_service = MagicMock(spec=UserService)
+        user_service.get_user_by_username.return_value = None
+
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "unknown", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+    def test_i_cannot_login_with_inactive_user(self, client, fake_user):
+        fake_user.is_active = False
+        auth_service = override_auth_service()
+        user_service = override_user_service(fake_user)
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "secret"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+    def test_i_cannot_login_with_wrong_password(self, client, fake_user):
+        auth_service = override_auth_service()
+        auth_service.verify_user_password.return_value = False
+        user_service = override_user_service(fake_user)
+        client.app.dependency_overrides[get_auth_service] = lambda: auth_service
+        client.app.dependency_overrides[get_user_service] = lambda: user_service
+
+        response = client.post(
+            "/auth/login",
+            data={"username": "testuser", "password": "wrong"},
+        )
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
+
+
+# ---------------------- TESTS FOR /auth/me ----------------------
+class TestMe:
+    def test_i_can_get_current_user_profile(self, client, fake_user):
+        client.app.dependency_overrides[get_current_user] = override_get_current_user(fake_user)
+
+        response = client.get("/auth/me")
+
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user.username
+        assert data["email"] == fake_user.email
+
+    def test_i_cannot_get_profile_without_authentication(self, client, monkeypatch):
+        def raise_http_exception():
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)
+
+        client.app.dependency_overrides[get_current_user] = raise_http_exception
+
+        response = client.get("/auth/me")
+
+        assert response.status_code == status.HTTP_401_UNAUTHORIZED
tests/api/test_users.py (new file, 167 lines)
@@ -0,0 +1,167 @@
+# File: tests/api/test_users.py
+from datetime import datetime
+from unittest.mock import MagicMock
+
+import pytest
+from fastapi import status
+from fastapi.testclient import TestClient
+
+from app.api.dependencies import get_admin_user, get_user_service
+from app.main import app
+from app.models.auth import UserRole
+from app.models.types import PyObjectId
+from app.models.user import UserInDB, UserCreate
+from app.services.user_service import UserService
+
+
+# -----------------------
+# Fixtures
+# -----------------------
+
+@pytest.fixture
+def fake_user_admin():
+    return UserInDB(
+        _id=PyObjectId(),
+        username="admin",
+        email="admin@example.com",
+        role=UserRole.ADMIN,
+        is_active=True,
+        hashed_password="hashed-secret",
+        created_at=datetime(2025, 1, 1),
+        updated_at=datetime(2025, 1, 2),
+    )
+
+
+@pytest.fixture
+def fake_user_response():
+    return UserInDB(
+        _id=PyObjectId(),
+        username="other",
+        email="other@example.com",
+        role=UserRole.USER,
+        is_active=True,
+        hashed_password="hashed-secret-2",
+        created_at=datetime(2025, 1, 1),
+        updated_at=datetime(2025, 1, 2),
+    )
+
+
+@pytest.fixture
+def client(fake_user_admin):
+    # Fake admin dependency
+    def get_admin_user_override():
+        return fake_user_admin
+
+    # Fake user service
+    user_service_mock = MagicMock(spec=UserService)
+
+    def get_user_service_override():
+        return user_service_mock
+
+    client = TestClient(app)
+    client.app.dependency_overrides = {
+        get_admin_user: get_admin_user_override,
+        get_user_service: get_user_service_override
+    }
+
+    client.user_service_mock = user_service_mock
+    return client
+
+
+# -----------------------
+# Tests
+# -----------------------
+
+class TestListUsers:
+
+    def test_i_can_list_users(self, client, fake_user_admin, fake_user_response):
+        client.user_service_mock.list_users.return_value = [fake_user_admin, fake_user_response]
+        response = client.get("/users")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert len(data) == 2
+        assert data[0]["username"] == "admin"
+
+    def test_i_can_list_users_when_empty(self, client):
+        client.user_service_mock.list_users.return_value = []
+        response = client.get("/users")
+        assert response.status_code == status.HTTP_200_OK
+        assert response.json() == []
+
+
+class TestGetUserById:
+
+    def test_i_can_get_user_by_id(self, client, fake_user_response):
+        client.user_service_mock.get_user_by_id.return_value = fake_user_response
+        response = client.get(f"/users/{fake_user_response.id}")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_get_user_by_id_not_found(self, client):
+        client.user_service_mock.get_user_by_id.return_value = None
+        response = client.get("/users/64f0c9f4b0d1c8b7b8e1f0a2")
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
+
+
+class TestCreateUser:
+
+    def test_i_can_create_user(self, client, fake_user_response):
+        user_data = UserCreate(username="newuser",
+                               email="new@example.com",
+                               password="#Passw0rd!",
+                               role=UserRole.USER)
+
+        client.user_service_mock.create_user.return_value = fake_user_response
+        response = client.post("/users", json=user_data.model_dump(mode="json"))
+        assert response.status_code == status.HTTP_201_CREATED
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_create_user_when_service_raises_value_error(self, client):
+        user_data = {"username": "baduser", "email": "bad@example.com", "role": "user", "password": "password"}
+        client.user_service_mock.create_user.side_effect = ValueError("Invalid data")
+        response = client.post("/users", json=user_data)
+        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
+
+
+class TestUpdateUser:
+
+    def test_i_can_update_user(self, client, fake_user_response):
+        user_data = {"username": "updateduser", "email": "updated@example.com"}
+        client.user_service_mock.update_user.return_value = fake_user_response
+        response = client.put(f"/users/{fake_user_response.id}", json=user_data)
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["username"] == fake_user_response.username
+
+    def test_i_cannot_update_user_not_found(self, client):
+        client.user_service_mock.update_user.return_value = None
+        user_data = {"username": "updateduser"}
+        response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
+
+    def test_i_cannot_update_user_when_service_raises_value_error(self, client):
+        client.user_service_mock.update_user.side_effect = ValueError("Invalid update")
+        user_data = {"username": "badupdate"}
+        response = client.put("/users/64f0c9f4b0d1c8b7b8e1f0a2", json=user_data)
+        assert response.status_code == status.HTTP_400_BAD_REQUEST
+        assert response.json()["detail"] == "Invalid update"
+
+
+class TestDeleteUser:
+
+    def test_i_can_delete_user(self, client):
+        client.user_service_mock.delete_user.return_value = True
+        response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a1")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()
+        assert data["message"] == "User successfully deleted"
+
+    def test_i_cannot_delete_user_not_found(self, client):
+        client.user_service_mock.delete_user.return_value = False
+        response = client.delete("/users/64f0c9f4b0d1c8b7b8e1f0a2")
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["detail"] == "User not found"
0 tests/database/__init__.py Normal file
0 tests/models/__init__.py Normal file
@@ -10,8 +10,8 @@ from pydantic import ValidationError
 from datetime import datetime
 from bson import ObjectId

-from app.models.user import UserCreate, UserUpdate, UserInDB, UserResponse
-from app.models.auth import UserRole
+from app.models.user import UserCreate, UserUpdate, UserInDB
+from app.models.auth import UserRole, UserResponse


 class TestUserCreateModel:
@@ -349,7 +349,7 @@ class TestUserResponseModel:

         # Convert to response model (excluding password_hash)
         user_response = UserResponse(
-            id=user_in_db.id,
+            _id=user_in_db.id,
             username=user_in_db.username,
             email=user_in_db.email,
             role=user_in_db.role,
0 tests/repositories/__init__.py Normal file
611 tests/repositories/test_document_repository.py Normal file
@@ -0,0 +1,611 @@
"""
Test suite for the synchronous FileDocumentRepository.

This module contains comprehensive tests for all FileDocumentRepository methods
using mongomock for in-memory MongoDB testing.
"""
from datetime import datetime

import pytest
from bson import ObjectId
from mongomock.mongo_client import MongoClient
from pymongo.errors import PyMongoError

from app.database.repositories.document_repository import (
    FileDocumentRepository,
    MatchMethodBase,
    SubsequenceMatching,
    FuzzyMatching
)
from app.models.document import FileDocument, FileType, ExtractionMethod


@pytest.fixture
def in_memory_repository():
    """Create an in-memory FileDocumentRepository for testing."""
    client = MongoClient()
    db = client.test_database
    repo = FileDocumentRepository(db)
    repo.initialize()
    return repo


@pytest.fixture
def sample_file_document():
    """Sample FileDocument data for testing."""
    return FileDocument(
        filename="sample_document.pdf",
        filepath="/home/user/documents/sample_document.pdf",
        file_type=FileType.PDF,
        extraction_method=ExtractionMethod.OCR,
        metadata={"pages": 5, "language": "en", "author": "John Doe"},
        detected_at=datetime.now(),
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        encoding="utf-8",
        file_size=1024000,
        mime_type="application/pdf"
    )


@pytest.fixture
def sample_update_data():
    """Sample update data for testing."""
    return {
        "extraction_method": ExtractionMethod.HYBRID,
        "metadata": {"pages": 10, "language": "fr", "updated": True},
        "file_size": 2048000
    }


@pytest.fixture
def multiple_sample_files():
    """Multiple FileDocument objects for list/search testing."""
    base_time = datetime.now()
    return [
        FileDocument(
            filename="first_doc.txt",
            filepath="/docs/first_doc.txt",
            file_type=FileType.TXT,
            extraction_method=ExtractionMethod.DIRECT_TEXT,
            metadata={"words": 500},
            detected_at=base_time,
            file_hash="hash1" + "0" * 58,
            encoding="utf-8",
            file_size=5000,
            mime_type="text/plain"
        ),
        FileDocument(
            filename="second_document.pdf",
            filepath="/docs/second_document.pdf",
            file_type=FileType.PDF,
            extraction_method=ExtractionMethod.OCR,
            metadata={"pages": 8},
            detected_at=base_time,
            file_hash="hash2" + "0" * 58,
            encoding="utf-8",
            file_size=10000,
            mime_type="application/pdf"
        ),
        FileDocument(
            filename="third_file.docx",
            filepath="/docs/third_file.docx",
            file_type=FileType.DOCX,
            extraction_method=ExtractionMethod.HYBRID,
            metadata={"paragraphs": 15},
            detected_at=base_time,
            file_hash="hash3" + "0" * 58,
            encoding="utf-8",
            file_size=15000,
            mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )
    ]


class TestFileDocumentRepositoryInitialization:
    """Tests for repository initialization."""

    def test_i_can_initialize_repository(self):
        """Test repository initialization."""
        # Arrange
        client = MongoClient()
        db = client.test_database
        repo = FileDocumentRepository(db)
        repo.initialize()

        # Act & Assert (should not raise any exception)
        assert repo.db is not None
        assert repo.collection is not None
        # TODO : check that the indexes are created


class TestFileDocumentRepositoryCreation:
    """Tests for file document creation functionality."""

    def test_i_can_create_file_document(self, in_memory_repository, sample_file_document):
        """Test successful file document creation."""
        # Act
        created_file = in_memory_repository.create_document(sample_file_document)

        # Assert
        assert created_file is not None
        assert created_file.filename == sample_file_document.filename
        assert created_file.filepath == sample_file_document.filepath
        assert created_file.file_type == sample_file_document.file_type
        assert created_file.extraction_method == sample_file_document.extraction_method
        assert created_file.metadata == sample_file_document.metadata
        assert created_file.file_hash == sample_file_document.file_hash
        assert created_file.file_size == sample_file_document.file_size
        assert created_file.mime_type == sample_file_document.mime_type
        assert created_file.id is not None
        assert isinstance(created_file.id, ObjectId)

    def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document):
        """Test creating file document with _id set to None (should be removed)."""
        # Arrange
        sample_file_document.id = None

        # Act
        created_file = in_memory_repository.create_document(sample_file_document)

        # Assert
        assert created_file is not None
        assert created_file.id is not None
        assert isinstance(created_file.id, ObjectId)

    def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository,
                                                              sample_file_document, mocker):
        """Test handling of PyMongo errors during file document creation."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(ValueError) as exc_info:
            in_memory_repository.create_document(sample_file_document)

        assert "Failed to create file document" in str(exc_info.value)


class TestFileDocumentRepositoryFinding:
    """Tests for file document finding functionality."""

    def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
        """Test finding file document by valid ObjectId."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)

        # Act
        found_file = in_memory_repository.find_document_by_id(str(created_file.id))

        # Assert
        assert found_file is not None
        assert found_file.id == created_file.id
        assert found_file.filename == created_file.filename
        assert found_file.filepath == created_file.filepath

    def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
        """Test that invalid ObjectId returns None."""
        # Act
        found_file = in_memory_repository.find_document_by_id("invalid_id")

        # Assert
        assert found_file is None

    def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        found_file = in_memory_repository.find_document_by_id(nonexistent_id)

        # Assert
        assert found_file is None

    def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document):
        """Test finding file document by file hash."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)

        # Act
        found_file = in_memory_repository.find_document_by_hash(sample_file_document.file_hash)

        # Assert
        assert found_file is not None
        assert found_file.file_hash == created_file.file_hash
        assert found_file.id == created_file.id

    def test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository):
        """Test that nonexistent file hash returns None."""
        # Act
        found_file = in_memory_repository.find_document_by_hash("nonexistent_hash")

        # Assert
        assert found_file is None

    def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
        """Test finding file document by filepath."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)

        # Act
        found_file = in_memory_repository.find_document_by_filepath(sample_file_document.filepath)

        # Assert
        assert found_file is not None
        assert found_file.filepath == created_file.filepath
        assert found_file.id == created_file.id

    def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
        """Test that nonexistent filepath returns None."""
        # Act
        found_file = in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf")

        # Assert
        assert found_file is None

    def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document finding."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))

        # Act
        found_file = in_memory_repository.find_document_by_hash("test_hash")

        # Assert
        assert found_file is None


class TestFileDocumentRepositoryNameMatching:
    """Tests for file document name matching functionality."""

    def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files):
        """Test finding file documents by filename using fuzzy matching."""
        # Arrange
        for file_doc in multiple_sample_files:
            in_memory_repository.create_document(file_doc)

        # Act
        fuzzy_method = FuzzyMatching(threshold=0.5)
        found_files = in_memory_repository.find_document_by_name("document", fuzzy_method)

        # Assert
        assert len(found_files) >= 1
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
        # Should find files with "document" in the name
        found_filenames = [f.filename for f in found_files]
        assert any("document" in fname.lower() for fname in found_filenames)

    def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository,
                                                                    multiple_sample_files):
        """Test finding file documents by filename using subsequence matching."""
        # Arrange
        for file_doc in multiple_sample_files:
            in_memory_repository.create_document(file_doc)

        # Act
        subsequence_method = SubsequenceMatching()
        found_files = in_memory_repository.find_document_by_name("doc", subsequence_method)

        # Assert
        assert len(found_files) >= 1
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)

    def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files):
        """Test finding file documents by filename with default matching method."""
        # Arrange
        for file_doc in multiple_sample_files:
            in_memory_repository.create_document(file_doc)

        # Act
        found_files = in_memory_repository.find_document_by_name("first")

        # Assert
        assert len(found_files) >= 0
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)

    def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during document name matching."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        found_files = in_memory_repository.find_document_by_name("test")

        # Assert
        assert found_files == []


class TestFileDocumentRepositoryListing:
    """Tests for file document listing functionality."""

    def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files):
        """Test listing file documents with default pagination."""
        # Arrange
        for file_doc in multiple_sample_files:
            in_memory_repository.create_document(file_doc)

        # Act
        files = in_memory_repository.list_documents()

        # Assert
        assert len(files) == len(multiple_sample_files)
        assert all(isinstance(file_doc, FileDocument) for file_doc in files)

    def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files):
        """Test listing file documents with custom pagination."""
        # Arrange
        for file_doc in multiple_sample_files:
            in_memory_repository.create_document(file_doc)

        # Act
        files_page1 = in_memory_repository.list_documents(skip=0, limit=2)
        files_page2 = in_memory_repository.list_documents(skip=2, limit=2)

        # Assert
        assert len(files_page1) == 2
        assert len(files_page2) == 1  # Only 3 total files

        # Ensure no overlap between pages
        page1_ids = [file_doc.id for file_doc in files_page1]
        page2_ids = [file_doc.id for file_doc in files_page2]
        assert len(set(page1_ids).intersection(set(page2_ids))) == 0

    def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document):
        """Test that file documents are sorted by detected_at in descending order."""
        # Arrange
        file1 = sample_file_document.model_copy()
        file1.filepath = "/docs/file1.pdf"
        file1.filename = "file1.pdf"
        file1.file_hash = "hash1" + "0" * 58
        file1.detected_at = datetime(2024, 1, 1, 10, 0, 0)

        file2 = sample_file_document.model_copy()
        file2.filepath = "/docs/file2.pdf"
        file2.filename = "file2.pdf"
        file2.file_hash = "hash2" + "0" * 58
        file2.detected_at = datetime(2024, 1, 2, 10, 0, 0)  # Later date

        created_file1 = in_memory_repository.create_document(file1)
        created_file2 = in_memory_repository.create_document(file2)

        # Act
        files = in_memory_repository.list_documents()

        # Assert
        assert len(files) == 2
        # Most recent (latest detected_at) should be first
        assert files[0].id == created_file2.id
        assert files[1].id == created_file1.id

    def test_i_can_list_empty_documents(self, in_memory_repository):
        """Test listing file documents from empty collection."""
        # Act
        files = in_memory_repository.list_documents()

        # Assert
        assert files == []

    def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document listing."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        files = in_memory_repository.list_documents()

        # Assert
        assert files == []


class TestFileDocumentRepositoryUpdate:
    """Tests for file document update functionality."""

    def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                                sample_update_data):
        """Test successful file document update."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)

        # Act
        updated_file = in_memory_repository.update_document(str(created_file.id), sample_update_data)

        # Assert
        assert updated_file is not None
        assert updated_file.extraction_method == sample_update_data["extraction_method"]
        assert updated_file.metadata == sample_update_data["metadata"]
        assert updated_file.file_size == sample_update_data["file_size"]
        assert updated_file.id == created_file.id
        assert updated_file.filename == created_file.filename  # Unchanged fields remain
        assert updated_file.filepath == created_file.filepath

    def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
        """Test updating file document with partial data."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)
        partial_update = {"file_size": 999999}

        # Act
        updated_file = in_memory_repository.update_document(str(created_file.id), partial_update)

        # Assert
        assert updated_file is not None
        assert updated_file.file_size == 999999
        assert updated_file.filename == created_file.filename  # Should remain unchanged
        assert updated_file.metadata == created_file.metadata  # Should remain unchanged

    def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
        """Test that None values are filtered out from update data."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)
        update_with_none = {"file_size": 777777, "metadata": None}

        # Act
        updated_file = in_memory_repository.update_document(str(created_file.id), update_with_none)

        # Assert
        assert updated_file is not None
        assert updated_file.file_size == 777777
        assert updated_file.metadata == created_file.metadata  # Should remain unchanged (None filtered out)

    def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
        """Test updating file document with empty data returns current document."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)
        empty_update = {}

        # Act
        result = in_memory_repository.update_document(str(created_file.id), empty_update)

        # Assert
        assert result is not None
        assert result.filename == created_file.filename
        assert result.filepath == created_file.filepath
        assert result.metadata == created_file.metadata

    def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
        """Test that updating with invalid ID returns None."""
        # Act
        result = in_memory_repository.update_document("invalid_id", sample_update_data)

        # Assert
        assert result is None

    def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
        """Test that updating nonexistent file document returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = in_memory_repository.update_document(nonexistent_id, sample_update_data)

        # Assert
        assert result is None

    def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                         sample_update_data, mocker):
        """Test handling of PyMongo errors during file document update."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                            side_effect=PyMongoError("Database error"))

        # Act
        result = in_memory_repository.update_document(str(created_file.id), sample_update_data)

        # Assert
        assert result is None


class TestFileDocumentRepositoryDeletion:
    """Tests for file document deletion functionality."""

    def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
        """Test successful file document deletion."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)

        # Act
        deletion_result = in_memory_repository.delete_document(str(created_file.id))

        # Assert
        assert deletion_result is True

        # Verify document is actually deleted
        found_file = in_memory_repository.find_document_by_id(str(created_file.id))
        assert found_file is None

    def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act
        result = in_memory_repository.delete_document("invalid_id")

        # Assert
        assert result is False

    def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
        """Test that deleting nonexistent file document returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = in_memory_repository.delete_document(nonexistent_id)

        # Assert
        assert result is False

    def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """Test handling of PyMongo errors during file document deletion."""
        # Arrange
        created_file = in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))

        # Act
        result = in_memory_repository.delete_document(str(created_file.id))

        # Assert
        assert result is False


class TestFileDocumentRepositoryUtilities:
    """Tests for utility methods."""

    def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
        """Test counting file documents."""
        # Arrange
        initial_count = in_memory_repository.count_documents()
        in_memory_repository.create_document(sample_file_document)

        # Act
        final_count = in_memory_repository.count_documents()

        # Assert
        assert final_count == initial_count + 1

    def test_i_can_count_zero_documents(self, in_memory_repository):
        """Test counting file documents in empty collection."""
        # Act
        count = in_memory_repository.count_documents()

        # Assert
        assert count == 0

    def test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document counting."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'count_documents',
                            side_effect=PyMongoError("Database error"))

        # Act
        count = in_memory_repository.count_documents()

        # Assert
        assert count == 0


class TestMatchingMethods:
    """Tests for matching method classes."""

    def test_i_can_create_fuzzy_matching_with_default_threshold(self):
        """Test creating FuzzyMatching with default threshold."""
        # Act
        fuzzy = FuzzyMatching()

        # Assert
        assert fuzzy.threshold == 0.6

    def test_i_can_create_fuzzy_matching_with_custom_threshold(self):
        """Test creating FuzzyMatching with custom threshold."""
        # Act
        fuzzy = FuzzyMatching(threshold=0.8)

        # Assert
        assert fuzzy.threshold == 0.8

    def test_i_can_create_subsequence_matching(self):
        """Test creating SubsequenceMatching."""
        # Act
        subsequence = SubsequenceMatching()

        # Assert
        assert isinstance(subsequence, MatchMethodBase)
        assert isinstance(subsequence, SubsequenceMatching)
496 tests/repositories/test_job_repository.py Normal file
@@ -0,0 +1,496 @@
"""
Test suite for the synchronous JobRepository.

This module contains comprehensive tests for all JobRepository methods
using mongomock for in-memory MongoDB testing.
"""
from datetime import datetime

import pytest
from bson import ObjectId
from mongomock.mongo_client import MongoClient
from pymongo.errors import PyMongoError

from app.database.repositories.job_repository import JobRepository
from app.exceptions.job_exceptions import JobRepositoryError
from app.models.job import ProcessingJob, ProcessingStatus
from app.models.types import PyObjectId

@pytest.fixture
def in_memory_repository():
    """Create an in-memory JobRepository for testing."""
    client = MongoClient()
    db = client.test_database
    repo = JobRepository(db)
    repo.initialize()
    return repo


@pytest.fixture
def sample_document_id():
    """Sample document ObjectId for testing."""
    return PyObjectId()


@pytest.fixture
def sample_task_id():
    """Sample Celery task ID for testing."""
    return "celery-task-12345-abcde"


@pytest.fixture
def multiple_sample_jobs():
    """Multiple ProcessingJob objects for testing."""
    doc_id_1 = ObjectId()
    doc_id_2 = ObjectId()
    base_time = datetime.utcnow()

    return [
        ProcessingJob(
            document_id=doc_id_1,
            status=ProcessingStatus.PENDING,
            task_id="task-1",
            created_at=base_time,
            started_at=None,
            completed_at=None,
            error_message=None
        ),
        ProcessingJob(
            document_id=doc_id_2,
            status=ProcessingStatus.PROCESSING,
            task_id="task-2",
            created_at=base_time,
            started_at=base_time,
            completed_at=None,
            error_message=None
        ),
        ProcessingJob(
            document_id=doc_id_1,
            status=ProcessingStatus.COMPLETED,
            task_id="task-3",
            created_at=base_time,
            started_at=base_time,
            completed_at=base_time,
            error_message=None
        )
    ]

class TestJobRepositoryInitialization:
    """Tests for repository initialization."""

    def test_i_can_initialize_repository(self):
        """Test repository initialization."""
        # Arrange
        client = MongoClient()  # synchronous mongomock client, matching the synchronous repository
        db = client.test_database
        repo = JobRepository(db)

        # Act
        initialized_repo = repo.initialize()

        # Assert
        assert initialized_repo is repo
        assert repo.db is not None
        assert repo.collection is not None

class TestJobRepositoryCreation:
    """Tests for job creation functionality."""

    def test_i_can_create_job_with_task_id(self, in_memory_repository, sample_document_id, sample_task_id):
        """Test successful job creation with task ID."""
        # Act
        created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)

        # Assert
        assert created_job is not None
        assert created_job.document_id == sample_document_id
        assert created_job.task_id == sample_task_id
        assert created_job.status == ProcessingStatus.PENDING
        assert created_job.created_at is not None
        assert created_job.started_at is None
        assert created_job.completed_at is None
        assert created_job.error_message is None
        assert created_job.id is not None
        assert isinstance(created_job.id, ObjectId)

    def test_i_can_create_job_without_task_id(self, in_memory_repository, sample_document_id):
        """Test successful job creation without task ID."""
        # Act
        created_job = in_memory_repository.create_job(sample_document_id)

        # Assert
        assert created_job is not None
        assert created_job.document_id == sample_document_id
        assert created_job.task_id is None
        assert created_job.status == ProcessingStatus.PENDING
        assert created_job.created_at is not None
        assert created_job.started_at is None
        assert created_job.completed_at is None
        assert created_job.error_message is None
        assert created_job.id is not None
        assert isinstance(created_job.id, ObjectId)

    def test_i_cannot_create_duplicate_job_for_document(self, in_memory_repository, sample_document_id,
                                                        sample_task_id):
        """Test that creating a job with a duplicate document_id raises JobRepositoryError."""
        # Arrange
        in_memory_repository.create_job(sample_document_id, sample_task_id)

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.create_job(sample_document_id, "different-task-id")

        assert "create_job" in str(exc_info.value)

    def test_i_cannot_create_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
        """Test handling of PyMongo errors during job creation."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.create_job(sample_document_id)

        assert "create_job" in str(exc_info.value)


class TestJobRepositoryFinding:
    """Tests for job finding functionality."""

    def test_i_can_find_job_by_valid_id(self, in_memory_repository, sample_document_id, sample_task_id):
        """Test finding job by valid ObjectId."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)

        # Act
        found_job = in_memory_repository.find_job_by_id(created_job.id)

        # Assert
        assert found_job is not None
        assert found_job.id == created_job.id
        assert found_job.document_id == created_job.document_id
        assert found_job.task_id == created_job.task_id
        assert found_job.status == created_job.status

    def test_i_cannot_find_job_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent ObjectId returns None."""
        # Arrange
        nonexistent_id = PyObjectId()

        # Act
        found_job = in_memory_repository.find_job_by_id(nonexistent_id)

        # Assert
        assert found_job is None

    def test_i_cannot_find_job_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during job finding."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.find_job_by_id(PyObjectId())

        assert "get_job_by_id" in str(exc_info.value)

    def test_i_can_find_jobs_by_document_id(self, in_memory_repository, sample_document_id, sample_task_id):
        """Test finding jobs by document ID."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id, sample_task_id)

        # Act
        found_jobs = in_memory_repository.find_jobs_by_document_id(sample_document_id)

        # Assert
        assert len(found_jobs) == 1
        assert found_jobs[0].id == created_job.id
        assert found_jobs[0].document_id == sample_document_id

    def test_i_can_find_empty_jobs_list_for_nonexistent_document(self, in_memory_repository):
        """Test that nonexistent document ID returns empty list."""
        # Arrange
        nonexistent_id = ObjectId()

        # Act
        found_jobs = in_memory_repository.find_jobs_by_document_id(nonexistent_id)

        # Assert
        assert found_jobs == []

    def test_i_cannot_find_jobs_by_document_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during finding jobs by document ID."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.find_jobs_by_document_id(PyObjectId())

        assert "get_jobs_by_file_id" in str(exc_info.value)

@pytest.mark.parametrize("status", [
|
||||||
|
ProcessingStatus.PENDING,
|
||||||
|
ProcessingStatus.PROCESSING,
|
||||||
|
ProcessingStatus.COMPLETED
|
||||||
|
])
|
||||||
|
def test_i_can_find_jobs_by_pending_status(self, in_memory_repository, sample_document_id, status):
|
||||||
|
"""Test finding jobs by PENDING status."""
|
||||||
|
# Arrange
|
||||||
|
created_job = in_memory_repository.create_job(sample_document_id)
|
||||||
|
in_memory_repository.update_job_status(created_job.id, status)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
found_jobs = in_memory_repository.get_jobs_by_status(status)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(found_jobs) == 1
|
||||||
|
assert found_jobs[0].id == created_job.id
|
||||||
|
assert found_jobs[0].status == status
|
||||||
|
|
||||||
|
    def test_i_can_find_jobs_by_failed_status(self, in_memory_repository, sample_document_id):
        """Test finding jobs by FAILED status."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)
        in_memory_repository.update_job_status(created_job.id, ProcessingStatus.FAILED, "Test error")

        # Act
        found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED)

        # Assert
        assert len(found_jobs) == 1
        assert found_jobs[0].id == created_job.id
        assert found_jobs[0].status == ProcessingStatus.FAILED
        assert found_jobs[0].error_message == "Test error"

    def test_i_can_find_empty_jobs_list_for_unused_status(self, in_memory_repository):
        """Test that unused status returns empty list."""
        # Act
        found_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED)

        # Assert
        assert found_jobs == []

    def test_i_cannot_find_jobs_by_status_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during finding jobs by status."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING)

        assert "get_jobs_by_status" in str(exc_info.value)


class TestJobRepositoryStatusUpdate:
    """Tests for job status update functionality."""

    def test_i_can_update_job_status_to_processing(self, in_memory_repository, sample_document_id):
        """Test updating job status to PROCESSING with started_at timestamp."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)

        # Act
        updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.PROCESSING)

        # Assert
        assert updated_job is not None
        assert updated_job.id == created_job.id
        assert updated_job.status == ProcessingStatus.PROCESSING
        assert updated_job.started_at is not None
        assert updated_job.completed_at is None
        assert updated_job.error_message is None

    def test_i_can_update_job_status_to_completed(self, in_memory_repository, sample_document_id):
        """Test updating job status to COMPLETED with completed_at timestamp."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)
        in_memory_repository.update_job_status(created_job.id, ProcessingStatus.PROCESSING)

        # Act
        updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.COMPLETED)

        # Assert
        assert updated_job is not None
        assert updated_job.id == created_job.id
        assert updated_job.status == ProcessingStatus.COMPLETED
        assert updated_job.started_at is not None
        assert updated_job.completed_at is not None
        assert updated_job.error_message is None

    def test_i_can_update_job_status_to_failed_with_error(self, in_memory_repository, sample_document_id):
        """Test updating job status to FAILED with error message and completed_at timestamp."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)
        error_message = "Processing failed due to invalid format"

        # Act
        updated_job = in_memory_repository.update_job_status(
            created_job.id, ProcessingStatus.FAILED, error_message
        )

        # Assert
        assert updated_job is not None
        assert updated_job.id == created_job.id
        assert updated_job.status == ProcessingStatus.FAILED
        assert updated_job.completed_at is not None
        assert updated_job.error_message == error_message

    def test_i_can_update_job_status_to_failed_without_error(self, in_memory_repository, sample_document_id):
        """Test updating job status to FAILED without error message."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)

        # Act
        updated_job = in_memory_repository.update_job_status(created_job.id, ProcessingStatus.FAILED)

        # Assert
        assert updated_job is not None
        assert updated_job.id == created_job.id
        assert updated_job.status == ProcessingStatus.FAILED
        assert updated_job.completed_at is not None
        assert updated_job.error_message is None

    def test_i_cannot_update_nonexistent_job_status(self, in_memory_repository):
        """Test that updating nonexistent job returns None."""
        # Arrange
        nonexistent_id = ObjectId()

        # Act
        result = in_memory_repository.update_job_status(nonexistent_id, ProcessingStatus.COMPLETED)

        # Assert
        assert result is None

    def test_i_cannot_update_job_status_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
        """Test handling of PyMongo errors during job status update."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)
        mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                            side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.update_job_status(created_job.id, ProcessingStatus.COMPLETED)

        assert "update_job_status" in str(exc_info.value)


class TestJobRepositoryDeletion:
    """Tests for job deletion functionality."""

    def test_i_can_delete_existing_job(self, in_memory_repository, sample_document_id):
        """Test successful job deletion."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)

        # Act
        deletion_result = in_memory_repository.delete_job(created_job.id)

        # Assert
        assert deletion_result is True

        # Verify job is actually deleted
        found_job = in_memory_repository.find_job_by_id(created_job.id)
        assert found_job is None

    def test_i_cannot_delete_nonexistent_job(self, in_memory_repository):
        """Test that deleting nonexistent job returns False."""
        # Arrange
        nonexistent_id = ObjectId()

        # Act
        result = in_memory_repository.delete_job(nonexistent_id)

        # Assert
        assert result is False

    def test_i_cannot_delete_job_with_pymongo_error(self, in_memory_repository, sample_document_id, mocker):
        """Test handling of PyMongo errors during job deletion."""
        # Arrange
        created_job = in_memory_repository.create_job(sample_document_id)
        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(JobRepositoryError) as exc_info:
            in_memory_repository.delete_job(created_job.id)

        assert "delete_job" in str(exc_info.value)


class TestJobRepositoryComplexScenarios:
    """Tests for complex job repository scenarios."""

    def test_i_can_handle_complete_job_lifecycle(self, in_memory_repository, sample_document_id, sample_task_id):
        """Test complete job lifecycle from creation to completion."""
        # Create job
        job = in_memory_repository.create_job(sample_document_id, sample_task_id)
        assert job.status == ProcessingStatus.PENDING
        assert job.started_at is None
        assert job.completed_at is None

        # Start processing
        job = in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING)
        assert job.status == ProcessingStatus.PROCESSING
        assert job.started_at is not None
        assert job.completed_at is None

        # Complete job
        job = in_memory_repository.update_job_status(job.id, ProcessingStatus.COMPLETED)
        assert job.status == ProcessingStatus.COMPLETED
        assert job.started_at is not None
        assert job.completed_at is not None
        assert job.error_message is None

    def test_i_can_handle_job_failure_scenario(self, in_memory_repository, sample_document_id, sample_task_id):
        """Test job failure scenario with error message."""
        # Create and start job
        job = in_memory_repository.create_job(sample_document_id, sample_task_id)
        job = in_memory_repository.update_job_status(job.id, ProcessingStatus.PROCESSING)

        # Fail job with error
        error_msg = "File format not supported"
        job = in_memory_repository.update_job_status(job.id, ProcessingStatus.FAILED, error_msg)

        # Assert failure state
        assert job.status == ProcessingStatus.FAILED
        assert job.started_at is not None
        assert job.completed_at is not None
        assert job.error_message == error_msg

    def test_i_can_handle_multiple_documents_with_different_statuses(self, in_memory_repository):
        """Test managing multiple jobs for different documents with various statuses."""
        # Create jobs for different documents
        doc1 = PyObjectId()
        doc2 = PyObjectId()
        doc3 = PyObjectId()

        job1 = in_memory_repository.create_job(doc1, "task-1")
        job2 = in_memory_repository.create_job(doc2, "task-2")
        job3 = in_memory_repository.create_job(doc3, "task-3")

        # Update to different statuses
        in_memory_repository.update_job_status(job1.id, ProcessingStatus.PROCESSING)
        in_memory_repository.update_job_status(job2.id, ProcessingStatus.COMPLETED)
        in_memory_repository.update_job_status(job3.id, ProcessingStatus.FAILED, "Error occurred")

        # Verify status queries
        pending_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.PENDING)
        processing_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.PROCESSING)
        completed_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.COMPLETED)
        failed_jobs = in_memory_repository.get_jobs_by_status(ProcessingStatus.FAILED)

        assert len(pending_jobs) == 0
        assert len(processing_jobs) == 1
        assert len(completed_jobs) == 1
        assert len(failed_jobs) == 1

        assert processing_jobs[0].id == job1.id
        assert completed_jobs[0].id == job2.id
        assert failed_jobs[0].id == job3.id
@@ -1,29 +1,26 @@
|
|||||||
"""
|
"""
|
||||||
Test suite for UserRepository with async/await support.
|
Test suite for UserRepository with async/support.
|
||||||
|
|
||||||
This module contains comprehensive tests for all UserRepository methods
|
This module contains comprehensive tests for all UserRepository methods
|
||||||
using mongomock-motor for in-memory MongoDB testing.
|
using mongomock-motor for in-memory MongoDB testing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import pytest_asyncio
|
|
||||||
from bson import ObjectId
|
from bson import ObjectId
|
||||||
|
from mongomock.mongo_client import MongoClient
|
||||||
from pymongo.errors import DuplicateKeyError
|
from pymongo.errors import DuplicateKeyError
|
||||||
from mongomock_motor import AsyncMongoMockClient
|
|
||||||
|
|
||||||
from app.database.repositories.user_repository import UserRepository
|
from app.database.repositories.user_repository import UserRepository
|
||||||
from app.models.user import UserCreate, UserUpdate, UserInDB
|
from app.models.user import UserCreate, UserUpdate
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest.fixture
|
||||||
async def in_memory_repository():
|
def in_memory_repository():
|
||||||
"""Create an in-memory UserRepository for testing."""
|
"""Create an in-memory UserRepository for testing."""
|
||||||
client = AsyncMongoMockClient()
|
client = MongoClient()
|
||||||
db = client.test_database
|
db = client.test_database
|
||||||
repo = UserRepository(db)
|
repo = UserRepository(db)
|
||||||
await repo.initialize()
|
repo.initialize()
|
||||||
return repo
|
return repo
|
||||||
|
|
||||||
|
|
||||||
@@ -51,11 +48,10 @@ def sample_user_update():
class TestUserRepositoryCreation:
    """Tests for user creation functionality."""

-    @pytest.mark.asyncio
-    async def test_i_can_create_user(self, in_memory_repository, sample_user_create):
+    def test_i_can_create_user(self, in_memory_repository, sample_user_create):
        """Test successful user creation."""
        # Act
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)

        # Assert
        assert created_user is not None
@@ -68,15 +64,14 @@ class TestUserRepositoryCreation:
        assert created_user.updated_at is not None
        assert created_user.hashed_password != sample_user_create.password  # Should be hashed

-    @pytest.mark.asyncio
-    async def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create):
+    def test_i_cannot_create_user_with_duplicate_username(self, in_memory_repository, sample_user_create):
        """Test that creating user with duplicate username raises DuplicateKeyError."""
        # Arrange
-        await in_memory_repository.create_user(sample_user_create)
+        in_memory_repository.create_user(sample_user_create)

        # Act & Assert
        with pytest.raises(DuplicateKeyError) as exc_info:
-            await in_memory_repository.create_user(sample_user_create)
+            in_memory_repository.create_user(sample_user_create)

        assert "already exists" in str(exc_info.value)

@@ -84,14 +79,13 @@ class TestUserRepositoryCreation:
class TestUserRepositoryFinding:
    """Tests for user finding functionality."""

-    @pytest.mark.asyncio
-    async def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create):
+    def test_i_can_find_user_by_id(self, in_memory_repository, sample_user_create):
        """Test finding user by valid ID."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)

        # Act
-        found_user = await in_memory_repository.find_user_by_id(str(created_user.id))
+        found_user = in_memory_repository.find_user_by_id(str(created_user.id))

        # Assert
        assert found_user is not None
@@ -99,69 +93,63 @@ class TestUserRepositoryFinding:
        assert found_user.username == created_user.username
        assert found_user.email == created_user.email

-    @pytest.mark.asyncio
-    async def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository):
+    def test_i_cannot_find_user_by_invalid_id(self, in_memory_repository):
        """Test that invalid ObjectId returns None."""
        # Act
-        found_user = await in_memory_repository.find_user_by_id("invalid_id")
+        found_user = in_memory_repository.find_user_by_id("invalid_id")

        # Assert
        assert found_user is None

-    @pytest.mark.asyncio
-    async def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository):
+    def test_i_cannot_find_user_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
-        found_user = await in_memory_repository.find_user_by_id(nonexistent_id)
+        found_user = in_memory_repository.find_user_by_id(nonexistent_id)

        # Assert
        assert found_user is None

-    @pytest.mark.asyncio
-    async def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create):
+    def test_i_can_find_user_by_username(self, in_memory_repository, sample_user_create):
        """Test finding user by username."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)

        # Act
-        found_user = await in_memory_repository.find_user_by_username(sample_user_create.username)
+        found_user = in_memory_repository.find_user_by_username(sample_user_create.username)

        # Assert
        assert found_user is not None
        assert found_user.username == created_user.username
        assert found_user.id == created_user.id

-    @pytest.mark.asyncio
-    async def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository):
+    def test_i_cannot_find_user_by_nonexistent_username(self, in_memory_repository):
        """Test that nonexistent username returns None."""
        # Act
-        found_user = await in_memory_repository.find_user_by_username("nonexistent")
+        found_user = in_memory_repository.find_user_by_username("nonexistent")

        # Assert
        assert found_user is None

-    @pytest.mark.asyncio
-    async def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create):
+    def test_i_can_find_user_by_email(self, in_memory_repository, sample_user_create):
        """Test finding user by email."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)

        # Act
-        found_user = await in_memory_repository.find_user_by_email(str(sample_user_create.email))
+        found_user = in_memory_repository.find_user_by_email(str(sample_user_create.email))

        # Assert
        assert found_user is not None
        assert found_user.email == created_user.email
        assert found_user.id == created_user.id

-    @pytest.mark.asyncio
-    async def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository):
+    def test_i_cannot_find_user_by_nonexistent_email(self, in_memory_repository):
        """Test that nonexistent email returns None."""
        # Act
-        found_user = await in_memory_repository.find_user_by_email("nonexistent@example.com")
+        found_user = in_memory_repository.find_user_by_email("nonexistent@example.com")

        # Assert
        assert found_user is None
@@ -170,15 +158,14 @@ class TestUserRepositoryFinding:
class TestUserRepositoryUpdate:
    """Tests for user update functionality."""

-    @pytest.mark.asyncio
-    async def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update):
+    def test_i_can_update_user(self, in_memory_repository, sample_user_create, sample_user_update):
        """Test successful user update."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)
        original_updated_at = created_user.updated_at

        # Act
-        updated_user = await in_memory_repository.update_user(str(created_user.id), sample_user_update)
+        updated_user = in_memory_repository.update_user(str(created_user.id), sample_user_update)

        # Assert
        assert updated_user is not None
@@ -187,24 +174,22 @@ class TestUserRepositoryUpdate:
        assert updated_user.role == sample_user_update.role
        assert updated_user.id == created_user.id

-    @pytest.mark.asyncio
-    async def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update):
+    def test_i_cannot_update_user_with_invalid_id(self, in_memory_repository, sample_user_update):
        """Test that updating with invalid ID returns None."""
        # Act
-        result = await in_memory_repository.update_user("invalid_id", sample_user_update)
+        result = in_memory_repository.update_user("invalid_id", sample_user_update)

        # Assert
        assert result is None

-    @pytest.mark.asyncio
-    async def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create):
+    def test_i_can_update_user_with_partial_data(self, in_memory_repository, sample_user_create):
        """Test updating user with partial data."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)
        partial_update = UserUpdate(username="newusername")

        # Act
-        updated_user = await in_memory_repository.update_user(str(created_user.id), partial_update)
+        updated_user = in_memory_repository.update_user(str(created_user.id), partial_update)

        # Assert
        assert updated_user is not None
@@ -212,15 +197,14 @@ class TestUserRepositoryUpdate:
        assert updated_user.email == created_user.email  # Should remain unchanged
        assert updated_user.role == created_user.role  # Should remain unchanged

-    @pytest.mark.asyncio
-    async def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create):
+    def test_i_can_update_user_with_empty_data(self, in_memory_repository, sample_user_create):
        """Test updating user with empty data returns current user."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)
        empty_update = UserUpdate()

        # Act
-        result = await in_memory_repository.update_user(str(created_user.id), empty_update)
+        result = in_memory_repository.update_user(str(created_user.id), empty_update)

        # Assert
        assert result is not None
@@ -231,39 +215,36 @@ class TestUserRepositoryUpdate:
class TestUserRepositoryDeletion:
    """Tests for user deletion functionality."""

-    @pytest.mark.asyncio
-    async def test_i_can_delete_user(self, in_memory_repository, sample_user_create):
+    def test_i_can_delete_user(self, in_memory_repository, sample_user_create):
        """Test successful user deletion."""
        # Arrange
-        created_user = await in_memory_repository.create_user(sample_user_create)
+        created_user = in_memory_repository.create_user(sample_user_create)

        # Act
-        deletion_result = await in_memory_repository.delete_user(str(created_user.id))
+        deletion_result = in_memory_repository.delete_user(str(created_user.id))

        # Assert
        assert deletion_result is True

        # Verify user is actually deleted
-        found_user = await in_memory_repository.find_user_by_id(str(created_user.id))
+        found_user = in_memory_repository.find_user_by_id(str(created_user.id))
        assert found_user is None

-    @pytest.mark.asyncio
-    async def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository):
+    def test_i_cannot_delete_user_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act
-        result = await in_memory_repository.delete_user("invalid_id")
+        result = in_memory_repository.delete_user("invalid_id")

        # Assert
        assert result is False

-    @pytest.mark.asyncio
-    async def test_i_cannot_delete_nonexistent_user(self, in_memory_repository):
+    def test_i_cannot_delete_nonexistent_user(self, in_memory_repository):
        """Test that deleting nonexistent user returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
-        result = await in_memory_repository.delete_user(nonexistent_id)
+        result = in_memory_repository.delete_user(nonexistent_id)

        # Assert
        assert result is False
@@ -272,30 +253,27 @@ class TestUserRepositoryDeletion:
class TestUserRepositoryUtilities:
    """Tests for utility methods."""

-    @pytest.mark.asyncio
-    async def test_i_can_count_users(self, in_memory_repository, sample_user_create):
+    def test_i_can_count_users(self, in_memory_repository, sample_user_create):
        """Test counting users."""
        # Arrange
-        initial_count = await in_memory_repository.count_users()
-        await in_memory_repository.create_user(sample_user_create)
+        initial_count = in_memory_repository.count_users()
+        in_memory_repository.create_user(sample_user_create)

        # Act
-        final_count = await in_memory_repository.count_users()
+        final_count = in_memory_repository.count_users()

        # Assert
        assert final_count == initial_count + 1

-    @pytest.mark.asyncio
-    async def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create):
+    def test_i_can_check_user_exists(self, in_memory_repository, sample_user_create):
        """Test checking if user exists."""
        # Arrange
-        await in_memory_repository.create_user(sample_user_create)
+        in_memory_repository.create_user(sample_user_create)

        # Act
-        exists = await in_memory_repository.user_exists(sample_user_create.username)
-        not_exists = await in_memory_repository.user_exists("nonexistent")
+        exists = in_memory_repository.user_exists(sample_user_create.username)
+        not_exists = in_memory_repository.user_exists("nonexistent")

        # Assert
        assert exists is True
        assert not_exists is False
0 tests/services/__init__.py Normal file
570 tests/services/test_document_service.py Normal file
@@ -0,0 +1,570 @@
"""
Unit tests for DocumentService using in-memory MongoDB.

Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import os
from datetime import datetime
from unittest.mock import patch

import pytest
from bson import ObjectId
from mongomock.mongo_client import MongoClient

from app.models.document import FileType
from app.services.document_service import DocumentService


@pytest.fixture(autouse=True)
def cleanup_test_folder():
    """Remove the on-disk test folder before and after each test."""
    import shutil
    shutil.rmtree("test_folder", ignore_errors=True)
    yield
    shutil.rmtree("test_folder", ignore_errors=True)


@pytest.fixture
def in_memory_database():
    """Create an in-memory database for testing."""
    client = MongoClient()
    return client.test_database


@pytest.fixture
def document_service(in_memory_database):
    """Create DocumentService with in-memory repositories."""
    service = DocumentService(in_memory_database, objects_folder="test_folder")
    return service


@pytest.fixture
def sample_file_bytes():
    """Sample file content as bytes."""
    return b"This is a test PDF content"


@pytest.fixture
def sample_text_bytes():
    """Sample text file content as bytes."""
    return b"This is a test text file content"


@pytest.fixture
def sample_file_hash():
    """Expected SHA256 hash for sample file bytes."""
    import hashlib
    return hashlib.sha256(b"This is a test PDF content").hexdigest()


def validate_file_saved(document_service, file_hash, file_bytes):
    """Assert that the content for file_hash was written to the objects folder."""
    # Verify file is saved to disk
    target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
    assert os.path.exists(target_file_path)

    with open(target_file_path, "rb") as f:
        content = f.read()
    assert content == file_bytes
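The `validate_file_saved` helper above pins down the on-disk layout the service is expected to use: content-addressed storage where the first 24 hex characters of the SHA-256 hash name a subdirectory and the full hash names the file. A self-contained sketch of that path computation (`object_path` is an illustrative name, not a function from the codebase):

```python
import hashlib
import os


def object_path(objects_folder: str, file_bytes: bytes) -> str:
    """Content-addressed path, assuming the <hash[:24]>/<hash> layout above."""
    file_hash = hashlib.sha256(file_bytes).hexdigest()
    return os.path.join(objects_folder, file_hash[:24], file_hash)


# Identical bytes always map to the same path, which is what makes the
# deduplication tests in TestCreateDocument below work.
assert object_path("test_folder", b"x") == object_path("test_folder", b"x")
```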
class TestCreateDocument:
    """Tests for create_document method."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content doesn't exist yet."""
        # Setup mocks
        fixed_time = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify document created in database
        doc_in_db = document_service.document_repository.find_document_by_id(result.id)
        assert doc_in_db is not None
        assert doc_in_db.id == result.id
        assert doc_in_db.filename == result.filename
        assert doc_in_db.filepath == result.filepath
        assert doc_in_db.file_type == result.file_type
        assert doc_in_db.detected_at == fixed_time
        assert doc_in_db.file_hash == result.file_hash

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, sample_file_bytes)

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content already exists (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create first document
        first_doc = document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Create second document with same content
        second_doc = document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify both documents exist but share same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

    def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that empty file path raises ValueError."""
        with pytest.raises(ValueError):
            document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, b"")
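Taken together, the creation tests specify the orchestration: validate the path, detect the file type from the extension, hash the bytes, write the content once per hash, and insert one document record per path. A rough sketch under those assumptions follows; only `_detect_file_type`, `objects_folder`, and `document_repository` are attested by the tests, and the repository call at the end is a hypothetical illustration:

```python
import hashlib
import os
from datetime import datetime


def create_document_sketch(service, file_path: str, file_bytes: bytes):
    """Illustrative flow only; the real logic lives in DocumentService."""
    if not file_path:
        raise ValueError("File path must not be empty")
    file_type = service._detect_file_type(file_path)  # raises ValueError for .xyz etc.
    file_hash = hashlib.sha256(file_bytes).hexdigest()

    # Write-once, content-addressed storage: a second document with the
    # same bytes reuses the existing object file (deduplication).
    target = os.path.join(service.objects_folder, file_hash[:24], file_hash)
    if not os.path.exists(target):
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, "wb") as f:
            f.write(file_bytes)

    # Hypothetical repository call; field names mirror the assertions above.
    return service.document_repository.create_document(
        filename=os.path.basename(file_path),
        filepath=file_path,
        file_type=file_type,
        file_hash=file_hash,
        detected_at=datetime.now(),
    )
```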
class TestGetMethods:
    """Tests for document retrieval methods."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by ID."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = document_service.get_document_by_id(created_doc.id)

        # Verify
        assert result is not None
        assert result.id == created_doc.id
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = document_service.get_document_by_hash(created_doc.file_hash)

        # Verify
        assert result is not None
        assert result.file_hash == created_doc.file_hash
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file path."""
        # Setup
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"

        # Create a document first
        created_doc = document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = document_service.get_document_by_filepath(test_path)

        # Verify
        assert result is not None
        assert result.filepath == test_path
        assert result.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document with associated content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = document_service.get_document_content_by_hash(created_doc.file_hash)

        # Verify
        assert result == sample_file_bytes

    def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Test that nonexistent document returns None."""
        # Execute with random ObjectId
        result = document_service.get_document_by_id(ObjectId())

        # Verify
        assert result is None

    def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Test that nonexistent document hash returns None."""
        # Execute
        result = document_service.get_document_by_hash("nonexistent_hash")

        # Verify
        assert result is None


class TestPaginationAndCounting:
    """Tests for document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document listing with pagination parameters."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create multiple documents
        for i in range(5):
            document_service.create_document(
                f"/test/test{i}.pdf",
                sample_file_bytes + bytes(str(i), 'utf-8'),  # Make each file unique
                "utf-8"
            )

        # Execute with pagination
        result = document_service.list_documents(skip=1, limit=2)

        # Verify
        assert len(result) == 2

        # Test counting
        total_count = document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document counting."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Initially should be 0
        initial_count = document_service.count_documents()
        assert initial_count == 0

        # Create some documents
        for i in range(3):
            document_service.create_document(
                f"/test/test{i}.txt",
                sample_file_bytes + bytes(str(i), 'utf-8'),
                "utf-8"
            )

        # Execute
        final_count = document_service.count_documents()

        # Verify
        assert final_count == 3


class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating document metadata."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"metadata": {"page_count": 5}}
        result = document_service.update_document(created_doc.id, update_data)

        # Verify
        assert result is not None
        assert result.metadata.get("page_count") == 5
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash == created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == update_data['metadata']

    def test_i_can_update_document_content(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that updating content replaces the stored bytes and hash."""
        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"file_bytes": b"this is an updated file content"}
        result = document_service.update_document(created_doc.id, update_data)

        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash != created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == created_doc.metadata

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, b"this is an updated file content")

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document with orphaned content cleanup."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify content exists
        validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)

        # Execute deletion
        result = document_service.delete_document(created_doc.id)

        # Verify document and content are deleted
        assert result is True

        deleted_doc = document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        # Validate content is deleted (full hash as filename, hash prefix as directory)
        file_hash = created_doc.file_hash
        target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
        assert not os.path.exists(target_file_path)

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document without removing shared content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create two documents with same content
        doc1 = document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )

        doc2 = document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # They should share the same hash
        assert doc1.file_hash == doc2.file_hash

        # Delete first document
        result = document_service.delete_document(doc1.id)
        assert result is True

        # Verify first document is deleted but content still exists
        deleted_doc = document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None

        remaining_doc = document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None

        validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)


class TestHashCalculation:
    """Tests for file hash calculation utility."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Test that file hash calculation is consistent."""
        test_bytes = b"Test content for hashing"

        # Calculate hash multiple times
        hash1 = document_service._calculate_file_hash(test_bytes)
        hash2 = document_service._calculate_file_hash(test_bytes)

        # Should be identical
        assert hash1 == hash2
        assert len(hash1) == 64  # SHA256 produces 64-character hex string

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Test that different content produces different hashes."""
        content1 = b"First content"
        content2 = b"Second content"

        hash1 = document_service._calculate_file_hash(content1)
        hash2 = document_service._calculate_file_hash(content2)

        assert hash1 != hash2


class TestFileTypeDetection:
    """Tests for file type detection."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """Test PDF file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.pdf")
        assert file_type == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """Test text file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.txt")
        assert file_type == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """Test DOCX file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.docx")
        assert file_type == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """Test unsupported file type raises ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type("/path/to/document.xyz")
518 tests/services/test_job_service.py Normal file
@@ -0,0 +1,518 @@
|
"""
|
||||||
|
Unit tests for JobService using in-memory MongoDB.
|
||||||
|
|
||||||
|
Tests the business logic operations with real MongoDB operations
|
||||||
|
using mongomock for better integration testing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from bson import ObjectId
|
||||||
|
from mongomock.mongo_client import MongoClient
|
||||||
|
|
||||||
|
from app.exceptions.job_exceptions import InvalidStatusTransitionError
|
||||||
|
from app.models.job import ProcessingStatus
|
||||||
|
from app.models.types import PyObjectId
|
||||||
|
from app.services.job_service import JobService
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def in_memory_database():
|
||||||
|
"""Create an in-memory database for testing."""
|
||||||
|
client = MongoClient()
|
||||||
|
return client.test_database
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def job_service(in_memory_database):
|
||||||
|
"""Create JobService with in-memory repositories."""
|
||||||
|
service = JobService(in_memory_database).initialize()
|
||||||
|
return service
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_document_id():
|
||||||
|
"""Sample file ObjectId."""
|
||||||
|
return PyObjectId()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_task_id():
|
||||||
|
"""Sample Celery task UUID."""
|
||||||
|
return "550e8400-e29b-41d4-a716-446655440000"
|
||||||
|
|
||||||
|
|
||||||
|
class TestCreateJob:
|
||||||
|
"""Tests for create_job method."""
|
||||||
|
|
||||||
|
def test_i_can_create_job_with_task_id(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test creating job with task ID."""
|
||||||
|
# Execute
|
||||||
|
result = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
|
||||||
|
# Verify job creation
|
||||||
|
assert result is not None
|
||||||
|
assert result.document_id == sample_document_id
|
||||||
|
assert result.task_id == sample_task_id
|
||||||
|
assert result.status == ProcessingStatus.PENDING
|
||||||
|
assert result.created_at is not None
|
||||||
|
assert result.started_at is None
|
||||||
|
assert result.error_message is None
|
||||||
|
|
||||||
|
# Verify job exists in database
|
||||||
|
job_in_db = job_service.get_job_by_id(result.id)
|
||||||
|
assert job_in_db is not None
|
||||||
|
assert job_in_db.id == result.id
|
||||||
|
assert job_in_db.document_id == sample_document_id
|
||||||
|
assert job_in_db.task_id == sample_task_id
|
||||||
|
assert job_in_db.status == ProcessingStatus.PENDING
|
||||||
|
|
||||||
|
def test_i_can_create_job_without_task_id(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id
|
||||||
|
):
|
||||||
|
"""Test creating job without task ID."""
|
||||||
|
# Execute
|
||||||
|
result = job_service.create_job(sample_document_id)
|
||||||
|
|
||||||
|
# Verify job creation
|
||||||
|
assert result is not None
|
||||||
|
assert result.document_id == sample_document_id
|
||||||
|
assert result.task_id is None
|
||||||
|
assert result.status == ProcessingStatus.PENDING
|
||||||
|
assert result.created_at is not None
|
||||||
|
assert result.started_at is None
|
||||||
|
assert result.error_message is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetJobMethods:
|
||||||
|
"""Tests for job retrieval methods."""
|
||||||
|
|
||||||
|
def test_i_can_get_job_by_id(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test retrieving job by ID."""
|
||||||
|
# Create a job first
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
result = job_service.get_job_by_id(created_job.id)
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
assert result is not None
|
||||||
|
assert result.id == created_job.id
|
||||||
|
assert result.document_id == created_job.document_id
|
||||||
|
assert result.task_id == created_job.task_id
|
||||||
|
assert result.status == created_job.status
|
||||||
|
|
||||||
|
def test_i_can_get_jobs_by_status(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id
|
||||||
|
):
|
||||||
|
"""Test retrieving jobs by status."""
|
||||||
|
# Create jobs with different statuses
|
||||||
|
pending_job = job_service.create_job(sample_document_id, "pending-task")
|
||||||
|
|
||||||
|
processing_job = job_service.create_job(ObjectId(), "processing-task")
|
||||||
|
job_service.mark_job_as_started(processing_job.id)
|
||||||
|
|
||||||
|
completed_job = job_service.create_job(ObjectId(), "completed-task")
|
||||||
|
job_service.mark_job_as_started(completed_job.id)
|
||||||
|
job_service.mark_job_as_completed(completed_job.id)
|
||||||
|
|
||||||
|
# Execute - get pending jobs
|
||||||
|
pending_results = job_service.get_jobs_by_status(ProcessingStatus.PENDING)
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
assert len(pending_results) == 1
|
||||||
|
assert pending_results[0].id == pending_job.id
|
||||||
|
assert pending_results[0].status == ProcessingStatus.PENDING
|
||||||
|
|
||||||
|
# Execute - get processing jobs
|
||||||
|
processing_results = job_service.get_jobs_by_status(ProcessingStatus.PROCESSING)
|
||||||
|
assert len(processing_results) == 1
|
||||||
|
assert processing_results[0].status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
# Execute - get completed jobs
|
||||||
|
completed_results = job_service.get_jobs_by_status(ProcessingStatus.COMPLETED)
|
||||||
|
assert len(completed_results) == 1
|
||||||
|
assert completed_results[0].status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
|
||||||
|
class TestUpdateStatus:
|
||||||
|
"""Tests for mark_job_as_started method."""
|
||||||
|
|
||||||
|
def test_i_can_mark_pending_job_as_started(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test marking pending job as started (PENDING → PROCESSING)."""
|
||||||
|
# Create a pending job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
assert created_job.status == ProcessingStatus.PENDING
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
result = job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Verify status transition
|
||||||
|
assert result is not None
|
||||||
|
assert result.id == created_job.id
|
||||||
|
assert result.status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
# Verify in database
|
||||||
|
updated_job = job_service.get_job_by_id(created_job.id)
|
||||||
|
assert updated_job.status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
def test_i_cannot_mark_processing_job_as_started(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that processing job cannot be marked as started."""
|
||||||
|
# Create and start a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Try to start it again
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.PROCESSING
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
def test_i_cannot_mark_completed_job_as_started(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that completed job cannot be marked as started."""
|
||||||
|
# Create, start, and complete a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Try to start it again
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.COMPLETED
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
def test_i_cannot_mark_failed_job_as_started(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that failed job cannot be marked as started."""
|
||||||
|
# Create, start, and fail a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_failed(created_job.id, "Test error")
|
||||||
|
|
||||||
|
# Try to start it again
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.FAILED
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.PROCESSING
|
||||||
|
|
||||||
|
def test_i_can_mark_processing_job_as_completed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test marking processing job as completed (PROCESSING → COMPLETED)."""
|
||||||
|
# Create and start a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
started_job = job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
result = job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Verify status transition
|
||||||
|
assert result is not None
|
||||||
|
assert result.id == created_job.id
|
||||||
|
assert result.status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
# Verify in database
|
||||||
|
updated_job = job_service.get_job_by_id(created_job.id)
|
||||||
|
assert updated_job.status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
def test_i_cannot_mark_pending_job_as_completed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that pending job cannot be marked as completed."""
|
||||||
|
# Create a pending job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
|
||||||
|
# Try to complete it directly
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.PENDING
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
def test_i_cannot_mark_completed_job_as_completed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that completed job cannot be marked as completed again."""
|
||||||
|
# Create, start, and complete a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Try to complete it again
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.COMPLETED
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
def test_i_cannot_mark_failed_job_as_completed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that failed job cannot be marked as completed."""
|
||||||
|
# Create, start, and fail a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_failed(created_job.id, "Test error")
|
||||||
|
|
||||||
|
# Try to complete it
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.FAILED
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.COMPLETED
|
||||||
|
|
||||||
|
def test_i_can_mark_processing_job_as_failed_with_error_message(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test marking processing job as failed with error message."""
|
||||||
|
# Create and start a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
started_job = job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
error_message = "Processing failed due to invalid file format"
|
||||||
|
|
||||||
|
# Execute
|
||||||
|
result = job_service.mark_job_as_failed(created_job.id, error_message)
|
||||||
|
|
||||||
|
# Verify status transition
|
||||||
|
assert result is not None
|
||||||
|
assert result.id == created_job.id
|
||||||
|
assert result.status == ProcessingStatus.FAILED
|
||||||
|
assert result.error_message == error_message
|
||||||
|
|
||||||
|
# Verify in database
|
||||||
|
updated_job = job_service.get_job_by_id(created_job.id)
|
||||||
|
assert updated_job.status == ProcessingStatus.FAILED
|
||||||
|
assert updated_job.error_message == error_message
|
||||||
|
|
||||||
|
def test_i_can_mark_processing_job_as_failed_without_error_message(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test marking processing job as failed without error message."""
|
||||||
|
# Create and start a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
|
||||||
|
# Execute without error message
|
||||||
|
result = job_service.mark_job_as_failed(created_job.id)
|
||||||
|
|
||||||
|
# Verify status transition
|
||||||
|
assert result is not None
|
||||||
|
assert result.status == ProcessingStatus.FAILED
|
||||||
|
assert result.error_message is None
|
||||||
|
|
||||||
|
def test_i_cannot_mark_pending_job_as_failed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that pending job cannot be marked as failed."""
|
||||||
|
# Create a pending job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
|
||||||
|
# Try to fail it directly
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_failed(created_job.id, "Test error")
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.PENDING
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.FAILED
|
||||||
|
|
||||||
|
def test_i_cannot_mark_completed_job_as_failed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that completed job cannot be marked as failed."""
|
||||||
|
# Create, start, and complete a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_completed(created_job.id)
|
||||||
|
|
||||||
|
# Try to fail it
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
job_service.mark_job_as_failed(created_job.id, "Test error")
|
||||||
|
|
||||||
|
# Verify exception details
|
||||||
|
assert exc_info.value.current_status == ProcessingStatus.COMPLETED
|
||||||
|
assert exc_info.value.target_status == ProcessingStatus.FAILED
|
||||||
|
|
||||||
|
def test_i_cannot_mark_failed_job_as_failed(
|
||||||
|
self,
|
||||||
|
job_service,
|
||||||
|
sample_document_id,
|
||||||
|
sample_task_id
|
||||||
|
):
|
||||||
|
"""Test that failed job cannot be marked as failed again."""
|
||||||
|
# Create, start, and fail a job
|
||||||
|
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||||
|
job_service.mark_job_as_started(created_job.id)
|
||||||
|
job_service.mark_job_as_failed(created_job.id, "First error")
|
||||||
|
|
||||||
|
# Try to fail it again
|
||||||
|
with pytest.raises(InvalidStatusTransitionError) as exc_info:
|
||||||
|
            job_service.mark_job_as_failed(created_job.id, "Second error")

        # Verify exception details
        assert exc_info.value.current_status == ProcessingStatus.FAILED
        assert exc_info.value.target_status == ProcessingStatus.FAILED


class TestDeleteJob:
    """Tests for delete_job method."""

    def test_i_can_delete_existing_job(
        self,
        job_service,
        sample_document_id,
        sample_task_id
    ):
        """Test deleting an existing job."""
        # Create a job
        created_job = job_service.create_job(sample_document_id, sample_task_id)

        # Verify job exists
        job_before_delete = job_service.get_job_by_id(created_job.id)
        assert job_before_delete is not None

        # Execute deletion
        result = job_service.delete_job(created_job.id)

        # Verify deletion
        assert result is True

        # Verify job no longer exists
        deleted_job = job_service.get_job_by_id(created_job.id)
        assert deleted_job is None

    def test_i_cannot_delete_nonexistent_job(
        self,
        job_service
    ):
        """Test deleting a nonexistent job returns False."""
        # Execute deletion with random ObjectId
        result = job_service.delete_job(ObjectId())

        # Verify
        assert result is False


class TestStatusTransitionValidation:
    """Tests for status transition validation across different scenarios."""

    def test_valid_job_lifecycle_flow(
        self,
        job_service,
        sample_document_id,
        sample_task_id
    ):
        """Test complete valid job lifecycle: PENDING → PROCESSING → COMPLETED."""
        # Create job (PENDING)
        job = job_service.create_job(sample_document_id, sample_task_id)
        assert job.status == ProcessingStatus.PENDING

        # Start job (PENDING → PROCESSING)
        started_job = job_service.mark_job_as_started(job.id)
        assert started_job.status == ProcessingStatus.PROCESSING

        # Complete job (PROCESSING → COMPLETED)
        completed_job = job_service.mark_job_as_completed(job.id)
        assert completed_job.status == ProcessingStatus.COMPLETED

    def test_valid_job_failure_flow(
        self,
        job_service,
        sample_document_id,
        sample_task_id
    ):
        """Test valid job failure: PENDING → PROCESSING → FAILED."""
        # Create job (PENDING)
        job = job_service.create_job(sample_document_id, sample_task_id)
        assert job.status == ProcessingStatus.PENDING

        # Start job (PENDING → PROCESSING)
        started_job = job_service.mark_job_as_started(job.id)
        assert started_job.status == ProcessingStatus.PROCESSING

        # Fail job (PROCESSING → FAILED)
        failed_job = job_service.mark_job_as_failed(job.id, "Test failure")
        assert failed_job.status == ProcessingStatus.FAILED
        assert failed_job.error_message == "Test failure"

    def test_job_operations_with_empty_database(
        self,
        job_service
    ):
        """Test job operations when database is empty."""
        # Try to get nonexistent job
        result = job_service.get_job_by_id(ObjectId())
        assert result is None

        # Try to get jobs by status when none exist
        pending_jobs = job_service.get_jobs_by_status(ProcessingStatus.PENDING)
        assert pending_jobs == []

        # Try to delete nonexistent job
        delete_result = job_service.delete_job(ObjectId())
        assert delete_result is False
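The lifecycle tests above pin down a small state machine: PENDING → PROCESSING, then either COMPLETED or FAILED, with terminal states rejecting further changes. A minimal sketch of the guard these tests imply is shown below; the ProcessingStatus names and the InvalidStatusTransitionError attributes current_status/target_status are taken from the assertions, while the enum string values and the transition table are assumptions, not the actual service code.

from enum import Enum


class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class InvalidStatusTransitionError(Exception):
    """Raised when a job is moved to a status its current status does not allow."""

    def __init__(self, current_status: ProcessingStatus, target_status: ProcessingStatus):
        self.current_status = current_status
        self.target_status = target_status
        super().__init__(f"Cannot transition from {current_status} to {target_status}")


# Allowed transitions implied by the lifecycle tests above (assumed table).
ALLOWED_TRANSITIONS = {
    ProcessingStatus.PENDING: {ProcessingStatus.PROCESSING},
    ProcessingStatus.PROCESSING: {ProcessingStatus.COMPLETED, ProcessingStatus.FAILED},
    ProcessingStatus.COMPLETED: set(),  # terminal
    ProcessingStatus.FAILED: set(),     # terminal
}


def validate_transition(current: ProcessingStatus, target: ProcessingStatus) -> None:
    """Raise InvalidStatusTransitionError if the move is not allowed."""
    if target not in ALLOWED_TRANSITIONS[current]:
        raise InvalidStatusTransitionError(current, target)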
@@ -1,187 +0,0 @@
"""
Unit tests for MongoDB database connection module.

Tests the database connection functionality with mocking
to avoid requiring an actual MongoDB instance during tests.
"""

import pytest
from unittest.mock import Mock, patch, MagicMock
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError

from app.database.connection import (
    create_mongodb_client,
    get_database,
    close_database_connection,
    get_mongodb_client,
    test_database_connection
)


def test_i_can_get_database_connection():
    """Test successful database connection creation."""
    mock_client = Mock()
    mock_database = Mock()

    # Configure the mock to support dictionary-like access
    mock_client.__getitem__ = Mock(return_value=mock_database)

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
            with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
                # Reset global variables
                import app.database.connection
                app.database.connection._client = None
                app.database.connection._database = None

                result = get_database()

                assert result == mock_database
                mock_client.admin.command.assert_called_with('ping')
                # Verify that __getitem__ was called with the database name
                mock_client.__getitem__.assert_called_with("testdb")


def test_i_cannot_connect_to_invalid_mongodb_url():
    """Test fail-fast behavior with invalid MongoDB URL."""
    mock_client = Mock()
    mock_client.admin.command.side_effect = ConnectionFailure("Connection failed")

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017"):
            with pytest.raises(SystemExit) as exc_info:
                create_mongodb_client()

            assert exc_info.value.code == 1


def test_i_cannot_connect_with_server_selection_timeout():
    """Test fail-fast behavior with server selection timeout."""
    mock_client = Mock()
    mock_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout")

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017"):
            with pytest.raises(SystemExit) as exc_info:
                create_mongodb_client()

            assert exc_info.value.code == 1


def test_i_cannot_connect_with_unexpected_error():
    """Test fail-fast behavior with unexpected connection error."""
    with patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error")):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017"):
            with pytest.raises(SystemExit) as exc_info:
                create_mongodb_client()

            assert exc_info.value.code == 1


def test_i_can_get_database_singleton():
    """Test that get_database returns the same instance (singleton pattern)."""
    mock_client = Mock()
    mock_database = Mock()
    mock_client.__getitem__ = Mock(return_value=mock_database)

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
            with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
                # Reset global variables
                import app.database.connection
                app.database.connection._client = None
                app.database.connection._database = None

                # First call
                db1 = get_database()
                # Second call
                db2 = get_database()

                assert db1 is db2
                # MongoClient should be called only once
                assert mock_client.admin.command.call_count == 1


def test_i_can_close_database_connection():
    """Test closing database connection."""
    mock_client = Mock()
    mock_database = Mock()
    mock_client.__getitem__ = Mock(return_value=mock_database)

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
            with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
                # Reset global variables
                import app.database.connection
                app.database.connection._client = None
                app.database.connection._database = None

                # Create connection
                get_database()

                # Close connection
                close_database_connection()

                mock_client.close.assert_called_once()
                assert app.database.connection._client is None
                assert app.database.connection._database is None


def test_i_can_get_mongodb_client():
    """Test getting raw MongoDB client instance."""
    mock_client = Mock()
    mock_database = Mock()
    mock_client.__getitem__ = Mock(return_value=mock_database)

    with patch('app.database.connection.MongoClient', return_value=mock_client):
        with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
            with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
                # Reset global variables
                import app.database.connection
                app.database.connection._client = None
                app.database.connection._database = None

                # Create connection first
                get_database()

                # Get client
                result = get_mongodb_client()

                assert result == mock_client


def test_i_can_get_none_mongodb_client_when_not_connected():
    """Test getting MongoDB client returns None when not connected."""
    # Reset global variables
    import app.database.connection
    app.database.connection._client = None
    app.database.connection._database = None

    result = get_mongodb_client()
    assert result is None


def test_i_can_test_database_connection_success():
    """Test database connection health check - success case."""
    mock_database = Mock()
    mock_database.command.return_value = True

    with patch('app.database.connection.get_database', return_value=mock_database):
        result = test_database_connection()

        assert result is True
        mock_database.command.assert_called_with('ping')


def test_i_can_close_connection_when_no_client():
    """Test closing connection when no client exists (should not raise error)."""
    # Reset global variables
    import app.database.connection
    app.database.connection._client = None
    app.database.connection._database = None

    # Should not raise any exception
    close_database_connection()

    assert app.database.connection._client is None
    assert app.database.connection._database is None
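These tests pin down a module-level singleton with fail-fast startup: `_client`/`_database` globals, a ping on creation, and `sys.exit(1)` on any connection error. A sketch of the `app/database/connection.py` shape they assume follows; the function names and globals come from the tests, while the settings import location and body details are assumptions, not the actual module.

import sys

from pymongo import MongoClient

# Assumed location of the config helpers the tests patch on this module.
from app.config.settings import get_mongodb_url, get_mongodb_database_name

_client = None
_database = None


def create_mongodb_client():
    """Create a MongoClient and verify connectivity, exiting on failure (fail-fast)."""
    try:
        client = MongoClient(get_mongodb_url())
        client.admin.command('ping')  # forces a round-trip to the server
        return client
    except Exception:
        # Connection failures, timeouts, and unexpected errors all exit with code 1,
        # matching the three fail-fast tests above.
        sys.exit(1)


def get_database():
    """Return the singleton database handle, creating the client on first use."""
    global _client, _database
    if _database is None:
        _client = create_mongodb_client()
        _database = _client[get_mongodb_database_name()]
    return _database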
@@ -1,311 +0,0 @@
"""
Test suite for DocumentContentRepository with async/await support.

This module contains comprehensive tests for all DocumentContentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""

import pytest
import hashlib
from datetime import datetime

import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError
from mongomock_motor import AsyncMongoMockClient

from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import DocumentContent


@pytest_asyncio.fixture
async def in_memory_repository():
    """Create an in-memory DocumentContentRepository for testing."""
    client = AsyncMongoMockClient()
    db = client.test_database
    repo = DocumentContentRepository(db)
    await repo.initialize()
    return repo


@pytest.fixture
def sample_document_content():
    """Sample DocumentContent data for testing."""
    content = "This is sample document content for testing purposes."
    file_hash = hashlib.sha256(content.encode()).hexdigest()

    return DocumentContent(
        file_hash=file_hash,
        content=content,
        encoding="utf-8",
        file_size=len(content.encode()),
        mime_type="text/plain"
    )


@pytest.fixture
def another_document_content():
    """Another sample DocumentContent data for testing."""
    content = "This is another sample document with different content."
    file_hash = hashlib.sha256(content.encode()).hexdigest()

    return DocumentContent(
        file_hash=file_hash,
        content=content,
        encoding="utf-8",
        file_size=len(content.encode()),
        mime_type="text/plain"
    )


class TestDocumentContentRepositoryCreation:
    """Tests for document content creation functionality."""

    @pytest.mark.asyncio
    async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content):
        """Test successful document content creation."""
        # Act
        created_content = await in_memory_repository.create_document_content(sample_document_content)

        # Assert
        assert created_content is not None
        assert created_content.file_hash == sample_document_content.file_hash
        assert created_content.content == sample_document_content.content
        assert created_content.encoding == sample_document_content.encoding
        assert created_content.file_size == sample_document_content.file_size
        assert created_content.mime_type == sample_document_content.mime_type
        assert created_content.id is not None

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository,
                                                                             sample_document_content):
        """Test that creating document content with duplicate file_hash raises DuplicateKeyError."""
        # Arrange
        await in_memory_repository.create_document_content(sample_document_content)

        # Act & Assert
        with pytest.raises(DuplicateKeyError) as exc_info:
            await in_memory_repository.create_document_content(sample_document_content)

        assert "already exists" in str(exc_info.value)


class TestDocumentContentRepositoryFinding:
    """Tests for document content finding functionality."""

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content):
        """Test finding document content by valid ID."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)

        # Act
        found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id))

        # Assert
        assert found_content is not None
        assert found_content.id == created_content.id
        assert found_content.file_hash == created_content.file_hash
        assert found_content.content == created_content.content

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository):
        """Test that invalid ObjectId returns None."""
        # Act
        found_content = await in_memory_repository.find_document_content_by_id("invalid_id")

        # Assert
        assert found_content is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        found_content = await in_memory_repository.find_document_content_by_id(nonexistent_id)

        # Assert
        assert found_content is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content):
        """Test finding document content by file hash."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)

        # Act
        found_content = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash)

        # Assert
        assert found_content is not None
        assert found_content.file_hash == created_content.file_hash
        assert found_content.id == created_content.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository):
        """Test that nonexistent file hash returns None."""
        # Act
        found_content = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash")

        # Assert
        assert found_content is None


class TestDocumentContentRepositoryUpdate:
    """Tests for document content update functionality."""

    @pytest.mark.asyncio
    async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content):
        """Test successful document content update."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)
        update_data = {
            "content": "Updated content for testing",
            "encoding": "utf-16",
            "mime_type": "text/html"
        }

        # Act
        updated_content = await in_memory_repository.update_document_content(str(created_content.id), update_data)

        # Assert
        assert updated_content is not None
        assert updated_content.content == update_data["content"]
        assert updated_content.encoding == update_data["encoding"]
        assert updated_content.mime_type == update_data["mime_type"]
        assert updated_content.id == created_content.id
        assert updated_content.file_hash == created_content.file_hash  # Should remain unchanged

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository):
        """Test that updating with invalid ID returns None."""
        # Act
        result = await in_memory_repository.update_document_content("invalid_id", {"content": "test"})

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content):
        """Test updating document content with partial data."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)
        partial_update = {"encoding": "iso-8859-1"}

        # Act
        updated_content = await in_memory_repository.update_document_content(str(created_content.id), partial_update)

        # Assert
        assert updated_content is not None
        assert updated_content.encoding == "iso-8859-1"
        assert updated_content.content == created_content.content  # Should remain unchanged
        assert updated_content.mime_type == created_content.mime_type  # Should remain unchanged

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content):
        """Test updating document content with empty data returns current content."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)
        empty_update = {}

        # Act
        result = await in_memory_repository.update_document_content(str(created_content.id), empty_update)

        # Assert
        assert result is not None
        assert result.content == created_content.content
        assert result.encoding == created_content.encoding
        assert result.mime_type == created_content.mime_type


class TestDocumentContentRepositoryDeletion:
    """Tests for document content deletion functionality."""

    @pytest.mark.asyncio
    async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content):
        """Test successful document content deletion."""
        # Arrange
        created_content = await in_memory_repository.create_document_content(sample_document_content)

        # Act
        deletion_result = await in_memory_repository.delete_document_content(str(created_content.id))

        # Assert
        assert deletion_result is True

        # Verify content is actually deleted
        found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id))
        assert found_content is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act
        result = await in_memory_repository.delete_document_content("invalid_id")

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository):
        """Test that deleting nonexistent document content returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.delete_document_content(nonexistent_id)

        # Assert
        assert result is False


class TestDocumentContentRepositoryUtilities:
    """Tests for utility methods."""

    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content):
        """Test checking if document content exists by file hash."""
        # Arrange
        await in_memory_repository.create_document_content(sample_document_content)

        # Act
        exists = await in_memory_repository.content_exists(sample_document_content.file_hash)
        not_exists = await in_memory_repository.content_exists("nonexistent_hash")

        # Assert
        assert exists is True
        assert not_exists is False

    @pytest.mark.asyncio
    async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content,
                                                another_document_content):
        """Test listing document contents with pagination."""
        # Arrange
        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        # Act
        all_contents = await in_memory_repository.list_document_contents()
        limited_contents = await in_memory_repository.list_document_contents(skip=0, limit=1)

        # Assert
        assert len(all_contents) == 2
        assert len(limited_contents) == 1
        assert all(isinstance(content, DocumentContent) for content in all_contents)

    @pytest.mark.asyncio
    async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content,
                                                 another_document_content):
        """Test counting document contents."""
        # Arrange
        initial_count = await in_memory_repository.count_document_contents()
        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        # Act
        final_count = await in_memory_repository.count_document_contents()

        # Assert
        assert final_count == initial_count + 2
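The fixtures above address every DocumentContent by the SHA-256 of its raw bytes, and the duplicate test implies a unique index on file_hash. A minimal sketch of that content-addressing convention follows; the helper name and the index call are illustrative, not the repository's actual code.

import hashlib


def compute_file_hash(data: bytes) -> str:
    """Content address used by the fixtures above: hex SHA-256 of the raw bytes."""
    return hashlib.sha256(data).hexdigest()


# The DuplicateKeyError behaviour implies a unique index created once at
# initialize(), e.g. (Motor-style, illustrative):
#     await collection.create_index("file_hash", unique=True)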
@@ -1,566 +0,0 @@
"""
Test suite for FileDocumentRepository with async/await support.

This module contains comprehensive tests for all FileDocumentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""

import pytest
from datetime import datetime
from typing import Dict, Any

import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from mongomock_motor import AsyncMongoMockClient

from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import FileDocument, FileType


@pytest_asyncio.fixture
async def in_memory_repository():
    """Create an in-memory FileDocumentRepository for testing."""
    client = AsyncMongoMockClient()
    db = client.test_database
    repo = FileDocumentRepository(db)
    await repo.initialize()
    return repo


@pytest.fixture
def sample_file_document():
    """Sample FileDocument data for testing."""
    return FileDocument(
        filename="test_document.pdf",
        filepath="/path/to/test_document.pdf",
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        file_type=FileType("pdf"),
        detected_at=datetime.now(),
    )


@pytest.fixture
def sample_update_data():
    """Sample update data for testing."""
    return {
        "metadata": {"tags": ["updated", "document"]},
        "file_type": FileType("txt"),
    }


@pytest.fixture
def multiple_sample_documents():
    """Multiple FileDocument objects for list/search testing."""
    base_time = datetime.now()
    return [
        FileDocument(
            filename="document1.pdf",
            filepath="/path/to/document1.pdf",
            file_hash="hash1" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            filename="similar_document.pdf",
            filepath="/path/to/similar_document.pdf",
            file_hash="hash2" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            filename="completely_different.txt",
            filepath="/path/to/completely_different.txt",
            file_hash="hash3" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        )
    ]


class TestFileDocumentRepositoryInitialization:
    """Tests for repository initialization."""

    @pytest.mark.asyncio
    async def test_i_can_initialize_repository(self):
        """Test repository initialization."""
        # Arrange
        client = AsyncMongoMockClient()
        db = client.test_database
        repo = FileDocumentRepository(db)
        await repo.initialize()

        # Act & Assert (should not raise any exception)
        assert repo.db is not None
        assert repo.collection is not None
        # TODO: check that the indexes are created


class TestFileDocumentRepositoryCreation:
    """Tests for file document creation functionality."""

    @pytest.mark.asyncio
    async def test_i_can_create_document(self, in_memory_repository, sample_file_document):
        """Test successful file document creation."""
        # Act
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Assert
        assert created_doc is not None
        assert created_doc.filename == sample_file_document.filename
        assert created_doc.filepath == sample_file_document.filepath
        assert created_doc.file_hash == sample_file_document.file_hash
        assert created_doc.file_type == sample_file_document.file_type
        assert created_doc.id is not None
        assert isinstance(created_doc.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document):
        """Test creating document with _id set to None (should be removed)."""
        # Arrange
        sample_file_document.id = None

        # Act
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Assert
        assert created_doc is not None
        assert created_doc.id is not None
        assert isinstance(created_doc.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document):
        """Test that creating a document with a duplicate filepath raises DuplicateKeyError."""
        # Arrange
        await in_memory_repository.create_document(sample_file_document)
        duplicate_doc = FileDocument(
            filename="different_name.pdf",
            filepath=sample_file_document.filepath,
            file_hash="different_hash" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=datetime.now()
        )

        # Act & Assert
        with pytest.raises(DuplicateKeyError) as exc_info:
            await in_memory_repository.create_document(duplicate_doc)

        assert "already exists" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """Test handling of PyMongo errors during document creation."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(ValueError) as exc_info:
            await in_memory_repository.create_document(sample_file_document)

        assert "Failed to create file document" in str(exc_info.value)


class TestFileDocumentRepositoryFinding:
    """Tests for file document finding functionality."""

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
        """Test finding document by valid ObjectId."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))

        # Assert
        assert found_doc is not None
        assert found_doc.id == created_doc.id
        assert found_doc.filename == created_doc.filename
        assert found_doc.file_hash == created_doc.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
        """Test that invalid ObjectId returns None."""
        # Act
        found_doc = await in_memory_repository.find_document_by_id("invalid_id")

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        found_doc = await in_memory_repository.find_document_by_id(nonexistent_id)

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document):
        """Test finding document by file hash."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)

        # Assert
        assert found_doc is not None
        assert found_doc.file_hash == created_doc.file_hash
        assert found_doc.id == created_doc.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository):
        """Test that nonexistent hash returns None."""
        # Act
        found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash")

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
        """Test finding document by exact filepath."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)

        # Assert
        assert found_doc is not None
        assert found_doc.filepath == created_doc.filepath
        assert found_doc.id == created_doc.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
        """Test that nonexistent filepath returns None."""
        # Act
        found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf")

        # Assert
        assert found_doc is None


class TestFileDocumentRepositoryFuzzySearch:
    """Tests for fuzzy search functionality by filename."""

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents):
        """Test finding documents with exact filename match."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act
        found_docs = await in_memory_repository.find_document_by_name("document1.pdf")

        # Assert
        assert len(found_docs) == 1
        assert found_docs[0].filename == "document1.pdf"

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents):
        """Test finding documents with fuzzy matching using default threshold."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act
        found_docs = await in_memory_repository.find_document_by_name("document")

        # Assert
        assert len(found_docs) >= 2  # Should find document1.pdf and similar_document.pdf
        filenames = [doc.filename for doc in found_docs]
        assert "document1.pdf" in filenames
        assert "similar_document.pdf" in filenames

    @pytest.mark.asyncio
    async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during name search."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        found_docs = await in_memory_repository.find_document_by_name("test")

        # Assert
        assert found_docs == []


class TestFileDocumentRepositoryListing:
    """Tests for document listing functionality."""

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_documents):
        """Test listing documents with default pagination."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert len(docs) == len(multiple_sample_documents)
        assert all(isinstance(doc, FileDocument) for doc in docs)

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents):
        """Test listing documents with custom pagination."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act
        docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2)
        docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2)

        # Assert
        assert len(docs_page1) == 2
        assert len(docs_page2) == 1  # Only 3 total documents

        # Ensure no overlap between pages
        page1_ids = [doc.id for doc in docs_page1]
        page2_ids = [doc.id for doc in docs_page2]
        assert len(set(page1_ids).intersection(set(page2_ids))) == 0

    @pytest.mark.asyncio
    async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document):
        """Test that documents are sorted by detected_at in descending order."""
        # Arrange
        from datetime import timedelta

        # Create documents with different timestamps
        doc1 = sample_file_document.model_copy()
        doc1.filename = "oldest.pdf"
        doc1.filepath = f"/path/to/{doc1.filename}"
        doc1.file_hash = "hash1" + "0" * 58
        doc1.detected_at = datetime.now() - timedelta(hours=2)

        doc2 = sample_file_document.model_copy()
        doc2.filename = "newest.pdf"
        doc2.filepath = f"/path/to/{doc2.filename}"
        doc2.file_hash = "hash2" + "0" * 58
        doc2.detected_at = datetime.now()

        await in_memory_repository.create_document(doc1)
        await in_memory_repository.create_document(doc2)

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert len(docs) == 2
        assert docs[0].filename == "newest.pdf"  # Most recent first
        assert docs[1].filename == "oldest.pdf"

    @pytest.mark.asyncio
    async def test_i_can_list_empty_documents(self, in_memory_repository):
        """Test listing documents from empty collection."""
        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert docs == []

    @pytest.mark.asyncio
    async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during document listing."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert docs == []


class TestFileDocumentRepositoryUpdate:
    """Tests for document update functionality."""

    @pytest.mark.asyncio
    async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                                      sample_update_data):
        """Test successful document update."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)

        # Assert
        assert updated_doc is not None
        assert updated_doc.file_type == sample_update_data["file_type"]
        assert updated_doc.id == created_doc.id
        assert updated_doc.filename == created_doc.filename  # Unchanged fields remain

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
        """Test updating document with partial data."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        partial_update = {"file_type": FileType("txt")}

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update)

        # Assert
        assert updated_doc is not None
        assert updated_doc.file_type == FileType("txt")
        assert updated_doc.filename == created_doc.filename  # Should remain unchanged
        assert updated_doc.filepath == created_doc.filepath  # Should remain unchanged

    @pytest.mark.asyncio
    async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
        """Test that None values are filtered out from update data."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None}

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none)

        # Assert
        assert updated_doc is not None
        assert updated_doc.metadata == {"tags": ["updated", "document"]}
        assert updated_doc.file_type == created_doc.file_type  # Should remain unchanged (None filtered out)

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
        """Test updating document with empty data returns current document."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        empty_update = {}

        # Act
        result = await in_memory_repository.update_document(str(created_doc.id), empty_update)

        # Assert
        assert result is not None
        assert result.filename == created_doc.filename
        assert result.file_hash == created_doc.file_hash
        assert result.metadata == created_doc.metadata

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
        """Test that updating with invalid ID returns None."""
        # Act
        result = await in_memory_repository.update_document("invalid_id", sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
        """Test that updating nonexistent document returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                               sample_update_data, mocker):
        """Test handling of PyMongo errors during document update."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                            side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)

        # Assert
        assert result is None


class TestFileDocumentRepositoryDeletion:
    """Tests for document deletion functionality."""

    @pytest.mark.asyncio
    async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
        """Test successful document deletion."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        deletion_result = await in_memory_repository.delete_document(str(created_doc.id))

        # Assert
        assert deletion_result is True

        # Verify document is actually deleted
        found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act
        result = await in_memory_repository.delete_document("invalid_id")

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
        """Test that deleting nonexistent document returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.delete_document(nonexistent_id)

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """Test handling of PyMongo errors during document deletion."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.delete_document(str(created_doc.id))

        # Assert
        assert result is False


class TestFileDocumentRepositoryUtilities:
    """Tests for utility methods."""

    @pytest.mark.asyncio
    async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
        """Test counting documents."""
        # Arrange
        initial_count = await in_memory_repository.count_documents()
        await in_memory_repository.create_document(sample_file_document)

        # Act
        final_count = await in_memory_repository.count_documents()

        # Assert
        assert final_count == initial_count + 1

    @pytest.mark.asyncio
    async def test_i_can_count_zero_documents(self, in_memory_repository):
        """Test counting documents in empty collection."""
        # Act
        count = await in_memory_repository.count_documents()

        # Assert
        assert count == 0
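The fuzzy-search tests only require that a partial query like "document" matches document1.pdf and similar_document.pdf, and that driver errors degrade to an empty list. A case-insensitive regex scan is the simplest implementation satisfying them; the sketch below assumes that strategy, and the real find_document_by_name may use a different matcher or threshold.

import re
from typing import List

from pymongo.errors import PyMongoError


async def find_document_by_name(collection, name: str) -> List[dict]:
    """Return documents whose filename contains `name`, case-insensitively."""
    try:
        pattern = re.escape(name)  # treat the query as a literal, not a regex
        cursor = collection.find({"filename": {"$regex": pattern, "$options": "i"}})
        return await cursor.to_list(length=None)
    except PyMongoError:
        # Mirror the tests: database errors surface as an empty result list.
        return []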
@@ -1,697 +0,0 @@
|
|||||||
"""
|
|
||||||
Unit tests for DocumentService using in-memory MongoDB.
|
|
||||||
|
|
||||||
Tests the orchestration logic with real MongoDB operations
|
|
||||||
using mongomock for better integration testing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pytest_asyncio
|
|
||||||
from unittest.mock import Mock, patch
|
|
||||||
from datetime import datetime
|
|
||||||
from bson import ObjectId
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from mongomock_motor import AsyncMongoMockClient
|
|
||||||
|
|
||||||
from app.services.document_service import DocumentService
|
|
||||||
from app.database.repositories.document_repository import FileDocumentRepository
|
|
||||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
|
||||||
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
|
|
||||||
from app.models.types import PyObjectId
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def in_memory_file_repository():
|
|
||||||
"""Create an in-memory FileDocumentRepository for testing."""
|
|
||||||
client = AsyncMongoMockClient()
|
|
||||||
db = client.test_database
|
|
||||||
repo = FileDocumentRepository(db)
|
|
||||||
await repo.initialize()
|
|
||||||
return repo
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def in_memory_content_repository():
|
|
||||||
"""Create an in-memory DocumentContentRepository for testing."""
|
|
||||||
client = AsyncMongoMockClient()
|
|
||||||
db = client.test_database
|
|
||||||
repo = DocumentContentRepository(db)
|
|
||||||
await repo.initialize()
|
|
||||||
return repo
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def in_memory_database():
|
|
||||||
"""Create an in-memory database for testing."""
|
|
||||||
client = AsyncMongoMockClient()
|
|
||||||
return client.test_database
|
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
|
||||||
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
|
|
||||||
"""Create DocumentService with in-memory repositories."""
|
|
||||||
with patch('app.services.document_service.get_database', return_value=in_memory_database):
|
|
||||||
service = DocumentService()
|
|
||||||
service.file_repository = in_memory_file_repository
|
|
||||||
service.content_repository = in_memory_content_repository
|
|
||||||
return service
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_file_bytes():
|
|
||||||
"""Sample file content as bytes."""
|
|
||||||
return b"This is a test PDF content"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_text_bytes():
|
|
||||||
"""Sample text file content as bytes."""
|
|
||||||
return b"This is a test text file content"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_file_hash():
|
|
||||||
"""Expected SHA256 hash for sample file bytes."""
|
|
||||||
import hashlib
|
|
||||||
return hashlib.sha256(b"This is a test PDF content").hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_file_document():
|
|
||||||
"""Sample FileDocument for testing."""
|
|
||||||
return FileDocument(
|
|
||||||
id=ObjectId(),
|
|
||||||
filename="test.pdf",
|
|
||||||
filepath="/test/test.pdf",
|
|
||||||
file_type=FileType.PDF,
|
|
||||||
extraction_method=None,
|
|
||||||
metadata={},
|
|
||||||
detected_at=datetime(2024, 1, 15, 10, 30, 0),
|
|
||||||
file_hash="test_hash"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestCreateDocument:
|
|
||||||
"""Tests for create_document method."""
|
|
||||||
|
|
||||||
@patch('app.services.document_service.magic.from_buffer')
|
|
||||||
@patch('app.services.document_service.datetime')
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_i_can_create_document_with_new_content(
|
|
||||||
self,
|
|
||||||
mock_datetime,
|
|
||||||
mock_magic,
|
|
||||||
document_service,
|
|
||||||
sample_file_bytes
|
|
||||||
):
|
|
||||||
"""Test creating document when content doesn't exist yet."""
|
|
||||||
# Setup mocks
|
|
||||||
fixed_time = datetime(2024, 1, 15, 10, 30, 0)
|
|
||||||
mock_datetime.utcnow.return_value = fixed_time
|
|
||||||
mock_magic.return_value = "application/pdf"
|
|
||||||
|
|
||||||
# Execute
|
|
||||||
result = await document_service.create_document(
|
|
||||||
"/test/test.pdf",
|
|
||||||
sample_file_bytes,
|
|
||||||
"utf-8"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Verify document creation
|
|
||||||
assert result is not None
|
|
||||||
assert result.filename == "test.pdf"
|
|
||||||
assert result.filepath == "/test/test.pdf"
|
|
||||||
assert result.file_type == FileType.PDF
|
|
||||||
assert result.detected_at == fixed_time
|
|
||||||
assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)
|
|
||||||
|
|
||||||
# Verify content was created
|
|
||||||
content = await document_service.content_repository.find_document_content_by_file_hash(
|
|
||||||
result.file_hash
|
|
||||||
)
|
|
||||||
assert content is not None
|
|
||||||
assert content.file_hash == result.file_hash
|
|
||||||
assert content.file_size == len(sample_file_bytes)
|
|
||||||
assert content.mime_type == "application/pdf"
|
|
||||||
assert content.encoding == "utf-8"
|
|
||||||
|
|
||||||
@patch('app.services.document_service.magic.from_buffer')
|
|
||||||
@patch('app.services.document_service.datetime')
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_i_can_create_document_with_existing_content(
|
|
||||||
self,
|
|
||||||
mock_datetime,
|
|
||||||
mock_magic,
|
|
||||||
document_service,
|
|
||||||
sample_file_bytes
|
|
||||||
):
|
|
||||||
"""Test creating document when content already exists (deduplication)."""
|
|
||||||
# Setup mocks
|
|
||||||
fixed_time = datetime(2024, 1, 15, 10, 30, 0)
|
|
||||||
mock_datetime.utcnow.return_value = fixed_time
|
|
||||||
mock_magic.return_value = "application/pdf"
|
|
||||||
|
|
||||||
# Create first document
|
|
||||||
first_doc = await document_service.create_document(
|
|
||||||
"/test/first.pdf",
|
|
||||||
sample_file_bytes,
|
|
||||||
"utf-8"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create second document with same content
|
|
||||||
second_doc = await document_service.create_document(
|
|
||||||
"/test/second.pdf",
|
|
||||||
sample_file_bytes,
|
|
||||||
"utf-8"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Verify both documents exist but share same hash
|
|
||||||
assert first_doc.file_hash == second_doc.file_hash
|
|
||||||
assert first_doc.filename != second_doc.filename
|
|
||||||
assert first_doc.filepath != second_doc.filepath
|
|
||||||
|
|
||||||
# Verify only one content document exists
|
|
||||||
all_content = await document_service.content_repository.list_document_content()
|
|
||||||
content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
|
|
||||||
assert len(content_for_hash) == 1
|
|
||||||
|
|
||||||
    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Test creating documents with different text encodings."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Test with different encodings
        encodings = ["utf-8", "latin-1", "ascii"]

        for i, encoding in enumerate(encodings):
            result = await document_service.create_document(
                f"/test/test{i}.txt",
                sample_text_bytes,
                encoding
            )

            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT

            # Verify content has correct encoding
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that empty file path raises ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content.file_size == 0
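
# ---------------------------------------------------------------------------
# Reference sketch (not part of the test suite): an illustrative outline of the
# deduplicating create_document flow the tests above exercise. Only the method
# names already used in the tests (content_exists, _detect_file_type,
# _calculate_file_hash) are confirmed; the storage steps are assumptions.
# ---------------------------------------------------------------------------
async def create_document_outline(service, filepath: str, file_bytes: bytes, encoding: str):
    """Illustrative only: one shared content row per hash, one document per path."""
    if not filepath:
        raise ValueError("File path must not be empty")
    file_type = service._detect_file_type(filepath)       # raises ValueError on ".xyz" etc.
    file_hash = service._calculate_file_hash(file_bytes)  # stable digest of the bytes
    if not await service.content_exists(file_hash):
        # First sighting of these bytes: store the content once, keyed by hash,
        # with file_size=len(file_bytes) and the caller-supplied encoding.
        ...
    # Always record per-file metadata (filepath, filename, file_type, file_hash),
    # even when the underlying content is shared with another document.
    ...
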
class TestGetMethods:
    """Tests for document retrieval methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by ID."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_id(created_doc.id)

        # Verify
        assert result is not None
        assert result.id == created_doc.id
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_hash(created_doc.file_hash)

        # Verify
        assert result is not None
        assert result.file_hash == created_doc.file_hash
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file path."""
        # Setup
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_filepath(test_path)

        # Verify
        assert result is not None
        assert result.filepath == test_path
        assert result.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document with associated content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_with_content(created_doc.id)

        # Verify
        assert result is not None
        document, content = result
        assert document.id == created_doc.id
        assert content is not None
        assert content.file_hash == created_doc.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Test that nonexistent document returns None."""
        # Execute with random ObjectId
        result = await document_service.get_document_by_id(ObjectId())

        # Verify
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Test that nonexistent document hash returns None."""
        # Execute
        result = await document_service.get_document_by_hash("nonexistent_hash")

        # Verify
        assert result is None
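
# Reference sketch (illustrative, not the service implementation): the tests
# above pin down get_document_with_content as returning a (document, content)
# pair, and the lookups as returning None for unknown ids and hashes.
async def get_document_with_content_outline(service, document_id):
    document = await service.get_document_by_id(document_id)
    if document is None:
        return None
    content = await service.content_repository.find_document_content_by_file_hash(
        document.file_hash
    )
    return document, content
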
class TestPaginationAndCounting:
    """Tests for document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document listing with pagination parameters."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create multiple documents
        for i in range(5):
            await document_service.create_document(
                f"/test/test{i}.pdf",
                sample_file_bytes + bytes(str(i), 'utf-8'),  # Make each file unique
                "utf-8"
            )

        # Execute with pagination
        result = await document_service.list_documents(skip=1, limit=2)

        # Verify
        assert len(result) == 2

        # Test counting
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document counting."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Initially should be 0
        initial_count = await document_service.count_documents()
        assert initial_count == 0

        # Create some documents
        for i in range(3):
            await document_service.create_document(
                f"/test/test{i}.txt",
                sample_file_bytes + bytes(str(i), 'utf-8'),
                "utf-8"
            )

        # Execute
        final_count = await document_service.count_documents()

        # Verify
        assert final_count == 3
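
# Reference sketch: how list_documents(skip=..., limit=...) and count_documents()
# would typically map onto a pymongo collection (an assumption; the repository
# internals are not shown in this diff).
def paginate_documents(collection, skip: int = 0, limit: int = 100):
    # pymongo applies skip before limit, matching the skip=1, limit=2 test above.
    return list(collection.find().skip(skip).limit(limit)), collection.count_documents({})
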
class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating document metadata."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"metadata": {"page_count": 5}}
        result = await document_service.update_document(created_doc.id, update_data)

        # Verify
        assert result is not None
        assert result.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document with orphaned content cleanup."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify content exists
        content_before = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert content_before is not None

        # Execute deletion
        result = await document_service.delete_document(created_doc.id)

        # Verify document and content are deleted
        assert result is True

        deleted_doc = await document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        content_after = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert content_after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document without removing shared content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create two documents with same content
        doc1 = await document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )

        doc2 = await document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # They should share the same hash
        assert doc1.file_hash == doc2.file_hash

        # Delete first document
        result = await document_service.delete_document(doc1.id)
        assert result is True

        # Verify first document is deleted but content still exists
        deleted_doc = await document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None

        remaining_doc = await document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None

        content = await document_service.content_repository.find_document_content_by_file_hash(
            doc2.file_hash
        )
        assert content is not None
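
# Reference sketch (illustrative): the orphan-cleanup rule the two deletion
# tests above pin down. Content is deleted only when no remaining document
# references its hash; the repository calls marked "hypothetical" are assumed
# names, not confirmed by this diff.
async def delete_document_outline(service, document_id) -> bool:
    document = await service.get_document_by_id(document_id)
    if document is None:
        return False
    await service.document_repository.delete_document(document_id)      # hypothetical
    others = await service.document_repository.count_by_file_hash(      # hypothetical
        document.file_hash
    )
    if others == 0:
        # Orphaned: no other document shares these bytes, so drop the content row.
        await service.content_repository.delete_by_file_hash(document.file_hash)  # hypothetical
    return True
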
class TestUtilityMethods:
    """Tests for utility methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test checking if content exists by hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Initially content doesn't exist
        test_hash = "nonexistent_hash"
        exists_before = await document_service.content_exists(test_hash)
        assert exists_before is False

        # Create a document
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Now content should exist
        exists_after = await document_service.content_exists(created_doc.file_hash)
        assert exists_after is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating extracted document content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Update content
        new_content = "Updated extracted content"
        result = await document_service.update_document_content(
            created_doc.file_hash,
            new_content
        )

        # Verify update
        assert result is not None
        assert result.content == new_content

        # Verify persistence
        updated_content = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert updated_content.content == new_content
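
# Reference sketch (illustrative): content_exists as exercised above amounts to
# a presence check against the shared content collection.
async def content_exists_outline(service, file_hash: str) -> bool:
    content = await service.content_repository.find_document_content_by_file_hash(
        file_hash
    )
    return content is not None
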
class TestHashCalculation:
    """Tests for file hash calculation utility."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Test that file hash calculation is consistent."""
        test_bytes = b"Test content for hashing"

        # Calculate hash multiple times
        hash1 = document_service._calculate_file_hash(test_bytes)
        hash2 = document_service._calculate_file_hash(test_bytes)

        # Should be identical
        assert hash1 == hash2
        assert len(hash1) == 64  # SHA256 produces 64-character hex string

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Test that different content produces different hashes."""
        content1 = b"First content"
        content2 = b"Second content"

        hash1 = document_service._calculate_file_hash(content1)
        hash2 = document_service._calculate_file_hash(content2)

        assert hash1 != hash2
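
# Reference sketch: the 64-character assertion above is consistent with a
# SHA-256 hex digest, so _calculate_file_hash is assumed to be equivalent to:
import hashlib


def calculate_file_hash_outline(file_bytes: bytes) -> str:
    # Deterministic for identical bytes; yields 64 hexadecimal characters.
    return hashlib.sha256(file_bytes).hexdigest()
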
class TestFileTypeDetection:
    """Tests for file type detection."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """Test PDF file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.pdf")
        assert file_type == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """Test text file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.txt")
        assert file_type == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """Test DOCX file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.docx")
        assert file_type == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """Test unsupported file type raises ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type("/path/to/document.xyz")
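
# Reference sketch (illustrative): extension-based detection consistent with the
# tests above and with the FileType(...) construction used in get_doc below; the
# real mapping may support more types.
import os


def detect_file_type_outline(filepath: str) -> FileType:
    extension = os.path.splitext(filepath)[1].lower().lstrip(".")
    try:
        return FileType(extension)  # e.g. FileType("pdf") -> FileType.PDF
    except ValueError:
        raise ValueError(f"Unsupported file type: .{extension}") from None
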
0
tests/utils/__init__.py
Normal file
@@ -14,6 +14,8 @@ def get_doc(filename: str = None):
         file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
         file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"),
         detected_at=datetime.now(),
+        file_size=1024,
+        mime_type="application/pdf"
     )