Refactored DocumentService to save document in the filesystem. Fixed docker application
This commit is contained in:
243
Readme.md
243
Readme.md
@@ -103,17 +103,22 @@ MyDocManager/
|
||||
│ │ │ ├── models/
|
||||
│ │ │ │ ├── __init__.py
|
||||
│ │ │ │ ├── user.py # User Pydantic models
|
||||
│ │ │ │ └── auth.py # Auth Pydantic models
|
||||
│ │ │ │ ├── auth.py # Auth Pydantic models
|
||||
│ │ │ │ ├── document.py # Document Pydantic models
|
||||
│ │ │ │ ├── job.py # Job Processing Pydantic models
|
||||
│ │ │ │ └── types.py # PyObjectId and other useful types
|
||||
│ │ │ ├── database/
|
||||
│ │ │ │ ├── __init__.py
|
||||
│ │ │ │ ├── connection.py # MongoDB connection
|
||||
│ │ │ │ └── repositories/
|
||||
│ │ │ │ ├── __init__.py
|
||||
│ │ │ │ └── user_repository.py # User CRUD operations
|
||||
│ │ │ │ ├── user_repository.py # User CRUD operations
|
||||
│ │ │ │ └── document_repository.py # Document CRUD operations
|
||||
│ │ │ ├── services/
|
||||
│ │ │ │ ├── __init__.py
|
||||
│ │ │ │ ├── auth_service.py # JWT & password logic
|
||||
│ │ │ │ ├── user_service.py # User business logic
|
||||
│ │ │ │ ├── document_service.py # Document business logic
|
||||
│ │ │ │ └── init_service.py # Admin creation at startup
|
||||
│ │ │ ├── api/
|
||||
│ │ │ │ ├── __init__.py
|
||||
@@ -125,7 +130,7 @@ MyDocManager/
|
||||
│ │ │ └── utils/
|
||||
│ │ │ ├── __init__.py
|
||||
│ │ │ ├── security.py # Password utilities
|
||||
│ │ │ └── exceptions.py # Custom exceptions
|
||||
│ │ │ └── document_matching.py # Fuzzy matching Algorithms
|
||||
│ ├── worker/
|
||||
│ │ ├── Dockerfile
|
||||
│ │ ├── requirements.txt
|
||||
@@ -224,78 +229,76 @@ On first startup, the application automatically creates a default admin user:
|
||||
|
||||
#### Files Collection
|
||||
|
||||
Stores file metadata and extracted content:
|
||||
Stores file metadata and extracted content using Pydantic models:
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "ObjectId",
|
||||
"filename": "document.pdf",
|
||||
"filepath": "/watched_files/document.pdf",
|
||||
"file_type": "pdf",
|
||||
"extraction_method": "direct_text", // direct_text, ocr, hybrid
|
||||
"metadata": {
|
||||
"page_count": 15, // for PDFs
|
||||
"word_count": 250, // for text files
|
||||
"image_dimensions": { // for images
|
||||
"width": 1920,
|
||||
"height": 1080
|
||||
}
|
||||
},
|
||||
"detected_at": "2024-01-15T10:29:00Z",
|
||||
"file_hash": "sha256_hash_value"
|
||||
}
|
||||
```
|
||||
#### Document Contents Collection
|
||||
```python
|
||||
class FileDocument(BaseModel):
|
||||
"""
|
||||
Model for file documents stored in the 'files' collection.
|
||||
|
||||
Stores actual file content and technical metadata:
|
||||
```json
|
||||
{
|
||||
"_id": "ObjectId",
|
||||
"file_hash": "sha256_hash_value",
|
||||
"content": "extracted text content...",
|
||||
"encoding": "utf-8",
|
||||
"file_size": 2048576,
|
||||
"mime_type": "application/pdf"
|
||||
}
|
||||
Represents a file detected in the watched directory with its
|
||||
metadata and extracted content.
|
||||
"""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
filename: str = Field(..., description="Original filename")
|
||||
filepath: str = Field(..., description="Full path to the file")
|
||||
file_type: FileType = Field(..., description="Type of the file")
|
||||
extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
|
||||
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
@field_validator('filepath')
|
||||
@classmethod
|
||||
def validate_filepath(cls, v: str) -> str:
|
||||
"""Validate filepath format."""
|
||||
if not v.strip():
|
||||
raise ValueError("Filepath cannot be empty")
|
||||
return v.strip()
|
||||
|
||||
@field_validator('filename')
|
||||
@classmethod
|
||||
def validate_filename(cls, v: str) -> str:
|
||||
"""Validate filename format."""
|
||||
if not v.strip():
|
||||
raise ValueError("Filename cannot be empty")
|
||||
return v.strip()
|
||||
```
|
||||
|
||||
#### Processing Jobs Collection
|
||||
|
||||
Tracks processing status and lifecycle:
|
||||
|
||||
```json
|
||||
{
|
||||
"_id": "ObjectId",
|
||||
"file_id": "reference_to_files_collection",
|
||||
"status": "completed",
|
||||
// pending, processing, completed, failed
|
||||
"task_id": "celery_task_uuid",
|
||||
"created_at": "2024-01-15T10:29:00Z",
|
||||
"started_at": "2024-01-15T10:29:30Z",
|
||||
"completed_at": "2024-01-15T10:30:00Z",
|
||||
"error_message": null
|
||||
}
|
||||
```python
|
||||
class ProcessingJob(BaseModel):
|
||||
"""
|
||||
Model for processing jobs stored in the 'processing_jobs' collection.
|
||||
|
||||
Tracks the lifecycle and status of document processing tasks.
|
||||
"""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_id: PyObjectId = Field(..., description="Reference to file document")
|
||||
status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
|
||||
task_id: Optional[str] = Field(default=None, description="Celery task UUID")
|
||||
created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
|
||||
started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
|
||||
completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
|
||||
error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
|
||||
|
||||
@field_validator('error_message')
|
||||
@classmethod
|
||||
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
|
||||
"""Clean up error message."""
|
||||
if v is not None:
|
||||
return v.strip() if v.strip() else None
|
||||
return v
|
||||
```
|
||||
|
||||
### Data Storage Strategy
|
||||
|
||||
- **Choice**: Three separate collections for files, content, and processing status
|
||||
- **Rationale**: Normalization prevents content duplication when multiple files have identical content
|
||||
- **Benefits**:
|
||||
- Content deduplication via SHA256 hash
|
||||
- Better query performance for metadata vs content searches
|
||||
- Clear separation of concerns between file metadata, content, and processing lifecycle
|
||||
- Multiple files can reference the same content (e.g., identical copies in different locations)
|
||||
|
||||
### Content Storage Location
|
||||
|
||||
- **Choice**: Store extracted content in separate `document_contents` collection
|
||||
- **Rationale**: Content normalization and deduplication
|
||||
- **Benefits**:
|
||||
- Single content storage per unique file hash
|
||||
- Multiple file entries can reference same content
|
||||
- Efficient storage for duplicate files
|
||||
|
||||
### Supported File Types (Initial Implementation)
|
||||
|
||||
- **Text Files** (`.txt`): Direct content reading
|
||||
@@ -306,7 +309,7 @@ Tracks processing status and lifecycle:
|
||||
|
||||
#### Watchdog Implementation
|
||||
|
||||
- **Choice**: Dedicated observer thread (Option A)
|
||||
- **Choice**: Dedicated observer thread
|
||||
- **Rationale**: Standard approach, clean separation of concerns
|
||||
- **Implementation**: Watchdog observer runs in separate thread from FastAPI
|
||||
|
||||
@@ -327,17 +330,17 @@ Tracks processing status and lifecycle:
|
||||
|
||||
#### Content Storage Location
|
||||
|
||||
- **Choice**: Store extracted content in `files` collection
|
||||
- **Rationale**: Content is intrinsic property of the file
|
||||
- **Benefits**: Single query to get file + content, simpler data model
|
||||
- **Choice**: Store files in the file system, using the SHA256 hash as filename
|
||||
- **Rationale**: MongoDB is not meant for large files, better performance. Files remain in the file system for easy
|
||||
access.
|
||||
|
||||
### Implementation Order
|
||||
|
||||
1. ✅ Pydantic models for MongoDB collections
|
||||
2. ✅ Repository layer for data access (files + processing_jobs)
|
||||
3. ✅ Celery tasks for document processing
|
||||
4. ✅ Watchdog file monitoring implementation
|
||||
5. ✅ FastAPI integration and startup coordination
|
||||
2. IN PROGRESS : Repository layer for data access (files + processing_jobs)
|
||||
3. TODO : Celery tasks for document processing
|
||||
4. TODO : Watchdog file monitoring implementation
|
||||
5. TODO : FastAPI integration and startup coordination
|
||||
|
||||
### Processing Pipeline Features
|
||||
|
||||
@@ -347,87 +350,6 @@ Tracks processing status and lifecycle:
|
||||
- **Extensible Metadata**: Flexible metadata storage per file type
|
||||
- **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches
|
||||
|
||||
## Document Service Architecture
|
||||
|
||||
### Service Overview
|
||||
|
||||
The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories.
|
||||
|
||||
### Service Design
|
||||
|
||||
- **Architecture Pattern**: Service orchestration with separate repositories
|
||||
- **Transaction Support**: MongoDB ACID transactions for data consistency
|
||||
- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash
|
||||
- **Error Handling**: MongoDB standard exceptions with transaction rollback
|
||||
|
||||
### Document Service (`document_service.py`)
|
||||
|
||||
Orchestrates operations between file and content repositories while maintaining data consistency.
|
||||
|
||||
#### Core Functionality
|
||||
|
||||
##### `create_document(file_path: str, file_bytes: bytes, encoding: str)`
|
||||
|
||||
Creates a new document with automatic attribute calculation and content deduplication.
|
||||
|
||||
**Automatic Calculations:**
|
||||
- `file_hash`: SHA256 hash of file bytes
|
||||
- `file_type`: Detection based on file extension
|
||||
- `mime_type`: Detection via `python-magic` library
|
||||
- `file_size`: Length of provided bytes
|
||||
- `detected_at`: Current timestamp
|
||||
- `metadata`: Empty dictionary (reserved for future extension)
|
||||
|
||||
**Deduplication Logic:**
|
||||
1. Calculate SHA256 hash of file content
|
||||
2. Check if `DocumentContent` with this hash already exists
|
||||
3. If EXISTS: Create only `FileDocument` referencing existing content
|
||||
4. If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction
|
||||
|
||||
**Transaction Flow:**
|
||||
```
|
||||
BEGIN TRANSACTION
|
||||
IF content_exists(file_hash):
|
||||
CREATE FileDocument with content reference
|
||||
ELSE:
|
||||
CREATE DocumentContent
|
||||
CREATE FileDocument with content reference
|
||||
COMMIT TRANSACTION
|
||||
```
|
||||
|
||||
#### Available Methods
|
||||
|
||||
- `create_document(file_path, file_bytes, encoding)`: Create with deduplication
|
||||
- `get_document_by_id(document_id)`: Retrieve by document ID
|
||||
- `get_document_by_hash(file_hash)`: Retrieve by file hash
|
||||
- `get_document_by_filepath(filepath)`: Retrieve by file path
|
||||
- `list_documents(skip, limit)`: Paginated document listing
|
||||
- `count_documents()`: Total document count
|
||||
- `update_document(document_id, update_data)`: Update document metadata
|
||||
- `delete_document(document_id)`: Remove document and orphaned content
|
||||
|
||||
### Repository Dependencies
|
||||
|
||||
The document service coordinates two existing repositories:
|
||||
|
||||
#### File Repository (`file_repository.py`)
|
||||
- `create_document()`, `find_document_by_id()`, `find_document_by_hash()`
|
||||
- `find_document_by_filepath()`, `find_document_by_name()`
|
||||
- `list_documents()`, `count_documents()`
|
||||
- `update_document()`, `delete_document()`
|
||||
|
||||
#### Document Content Repository (`document_content_repository.py`)
|
||||
- `create_document_content()`, `find_document_content_by_id()`
|
||||
- `find_document_content_by_file_hash()`, `content_exists()`
|
||||
- `update_document_content()`, `delete_document_content()`
|
||||
- `list_document_contents()`, `count_document_contents()`
|
||||
|
||||
### Dependencies
|
||||
|
||||
- `python-magic`: MIME type detection
|
||||
- `hashlib`: SHA256 hashing (standard library)
|
||||
- `pymongo`: MongoDB transactions support
|
||||
|
||||
## Key Implementation Notes
|
||||
|
||||
### Python Standards
|
||||
@@ -483,21 +405,14 @@ The document service coordinates two existing repositories:
|
||||
|
||||
### Next Implementation Steps
|
||||
|
||||
1. ✅ Create docker-compose.yml with all services => Done
|
||||
2. ✅ Define user management and authentication architecture => Done
|
||||
3. ✅ Implement user models and authentication services =>
|
||||
1. models/user.py => Done
|
||||
2. models/auth.py => Done
|
||||
3. database/repositories/user_repository.py => Done
|
||||
4. ✅ Add automatic admin user creation if it does not exists => Done
|
||||
5. **IN PROGRESS**: Implement file processing pipeline =>
|
||||
1. **IN PROGRESS**: Implement file processing pipeline =>
|
||||
1. Create Pydantic models for files and processing_jobs collections
|
||||
2. Implement repository layer for file and processing job data access
|
||||
3. Create Celery tasks for document processing (.txt, .pdf, .docx)
|
||||
4. Implement Watchdog file monitoring with dedicated observer
|
||||
5. Integrate file watcher with FastAPI startup
|
||||
6. Create protected API routes for user management
|
||||
7. Build React monitoring interface with authentication
|
||||
2. Create protected API routes for user management
|
||||
3. Build React monitoring interface with authentication
|
||||
|
||||
## Annexes
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ services:
|
||||
MONGO_INITDB_ROOT_PASSWORD: password123
|
||||
MONGO_INITDB_DATABASE: mydocmanager
|
||||
volumes:
|
||||
- mongodb-data:/data/db
|
||||
- ./volumes/db:/data/db
|
||||
networks:
|
||||
- mydocmanager-network
|
||||
|
||||
@@ -38,6 +38,7 @@ services:
|
||||
volumes:
|
||||
- ./src/file-processor:/app
|
||||
- ./volumes/watched_files:/watched_files
|
||||
- ./volumes/objects:/objects
|
||||
depends_on:
|
||||
- redis
|
||||
- mongodb
|
||||
|
||||
@@ -51,6 +51,15 @@ def get_jwt_secret_key() -> str:
|
||||
raise ValueError("JWT_SECRET environment variable must be set in production")
|
||||
return secret
|
||||
|
||||
def get_objects_folder() -> str:
    """
    Get the objects storage folder path from environment variables.

    This is the directory where raw file content is stored on the
    filesystem (using the file's SHA256 hash as the filename).

    Returns:
        str: Objects folder path; defaults to "/objects" when the
        OBJECTS_FOLDER environment variable is not set.
    """
    # Fixed docstring: previous version said "Vault path", which does not
    # match what this function actually reads (OBJECTS_FOLDER).
    return os.getenv("OBJECTS_FOLDER", "/objects")
|
||||
|
||||
|
||||
def get_jwt_algorithm() -> str:
|
||||
"""
|
||||
|
||||
@@ -10,7 +10,7 @@ from typing import Optional
|
||||
from pymongo import MongoClient
|
||||
from pymongo.database import Database
|
||||
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
||||
from app.config.settings import get_mongodb_url, get_mongodb_database_name
|
||||
|
||||
# Global variables for singleton pattern
|
||||
@@ -18,7 +18,7 @@ _client: Optional[MongoClient] = None
|
||||
_database: Optional[Database] = None
|
||||
|
||||
|
||||
def create_mongodb_client() -> MongoClient:
|
||||
def create_mongodb_client() -> AsyncIOMotorClient:
|
||||
"""
|
||||
Create MongoDB client with connection validation.
|
||||
|
||||
@@ -32,7 +32,7 @@ def create_mongodb_client() -> MongoClient:
|
||||
|
||||
try:
|
||||
# Create client with short timeout for fail-fast behavior
|
||||
client = MongoClient(
|
||||
client = AsyncIOMotorClient(
|
||||
mongodb_url,
|
||||
serverSelectionTimeoutMS=5000, # 5 seconds timeout
|
||||
connectTimeoutMS=5000,
|
||||
@@ -107,6 +107,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
|
||||
return _client
|
||||
|
||||
|
||||
def get_extra_args(session):
    """
    Build optional keyword arguments for MongoDB collection calls.

    Intended to be splatted into driver methods, e.g.
    ``collection.insert_one(doc, **get_extra_args(session))``, so that the
    ``session`` keyword is only forwarded when a session actually exists.

    Args:
        session: MongoDB client session, or None when no transaction is active.

    Returns:
        dict: ``{"session": session}`` when a session is provided,
        otherwise an empty dict (the key is omitted entirely rather than
        passed as ``session=None``, preserving the original call shape).
    """
    return {} if session is None else {"session": session}
|
||||
|
||||
|
||||
def test_database_connection() -> bool:
|
||||
"""
|
||||
Test if database connection is working.
|
||||
|
||||
@@ -1,214 +0,0 @@
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from bson import ObjectId
|
||||
|
||||
from app.models.document import DocumentContent
|
||||
|
||||
|
||||
class DocumentContentRepository:
    """
    Repository class for document content CRUD operations in MongoDB.

    This class handles all database operations related to document content,
    following the repository pattern with dependency injection and async/await.
    """

    def __init__(self, database: AsyncIOMotorDatabase):
        """
        Initialize repository with database dependency.

        Index creation is asynchronous and therefore cannot run here;
        call ``await repo.initialize()`` after instantiation.

        Args:
            database (AsyncIOMotorDatabase): MongoDB database instance
        """
        self.db = database
        self.collection: AsyncIOMotorCollection = database.document_contents
        # Bug fix: previous code called self._ensure_indexes() here without
        # awaiting it, which only created a never-awaited coroutine and never
        # built the index. Index setup now happens solely in initialize().

    async def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.

        Returns:
            DocumentContentRepository: self, enabling
            ``repo = await DocumentContentRepository(db).initialize()``
            (consistent with the other repositories in this project).
        """
        await self._ensure_indexes()
        return self

    async def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on file_hash field to prevent duplicates.
        """
        try:
            await self.collection.create_index("file_hash", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

    async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
        """
        Create a new document content in the database.

        Args:
            document_content (DocumentContent): Document content data

        Returns:
            DocumentContent: Created document content with database ID

        Raises:
            DuplicateKeyError: If file_hash already exists
            ValueError: If document content creation fails due to validation
        """
        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

        # Remove _id if it's None to let MongoDB generate it
        if document_dict.get("_id") is None:
            document_dict.pop("_id", None)

        try:
            result = await self.collection.insert_one(document_dict)
            document_dict["_id"] = result.inserted_id
            return DocumentContent(**document_dict)
        except DuplicateKeyError as e:
            # Re-raise with a clearer message, chaining the original cause.
            raise DuplicateKeyError(
                f"Document content with file_hash '{document_content.file_hash}' already exists: {e}"
            ) from e
        except PyMongoError as e:
            raise ValueError(f"Failed to create document content: {e}") from e

    async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
        """
        Find document content by ID.

        Args:
            document_id (str): Document content ID to search for

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            # Best-effort lookup: treat driver errors as "not found".
            return None

    async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Find document content by file hash.

        Args:
            file_hash (str): File hash to search for

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            document_doc = await self.collection.find_one({"file_hash": file_hash})
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if document content exists by file hash.

        Args:
            file_hash (str): File hash to check

        Returns:
            bool: True if document content exists, False otherwise
        """
        try:
            count = await self.collection.count_documents({"file_hash": file_hash})
            return count > 0
        except PyMongoError:
            return False

    async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
        """
        Update document content information.

        Args:
            document_id (str): Document content ID to update
            update_data (dict): Updated document content data

        Returns:
            DocumentContent or None: Updated document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            # Remove None values and _id from update data
            clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}

            # Nothing to change: return the current document unchanged.
            if not clean_update_data:
                return await self.find_document_content_by_id(document_id)

            result = await self.collection.find_one_and_update(
                {"_id": ObjectId(document_id)},
                {"$set": clean_update_data},
                return_document=True
            )

            if result:
                return DocumentContent(**result)
            return None

        except PyMongoError:
            return None

    async def delete_document_content(self, document_id: str) -> bool:
        """
        Delete document content from database.

        Args:
            document_id (str): Document content ID to delete

        Returns:
            bool: True if document content was deleted, False otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return False

            result = await self.collection.delete_one({"_id": ObjectId(document_id)})
            return result.deleted_count > 0
        except PyMongoError:
            return False

    async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
        """
        List document contents with pagination.

        Args:
            skip (int): Number of document contents to skip (default: 0)
            limit (int): Maximum number of document contents to return (default: 100)

        Returns:
            List[DocumentContent]: List of document contents, newest first
            (sorted by _id descending).
        """
        try:
            cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
            document_docs = await cursor.to_list(length=limit)
            return [DocumentContent(**document_doc) for document_doc in document_docs]
        except PyMongoError:
            return []

    async def count_document_contents(self) -> int:
        """
        Count total number of document contents.

        Returns:
            int: Total number of document contents in database
        """
        try:
            return await self.collection.count_documents({})
        except PyMongoError:
            return 0
|
||||
@@ -9,6 +9,8 @@ from typing import Optional, List
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
|
||||
|
||||
from app.database.connection import get_extra_args
|
||||
from app.models.document import FileDocument
|
||||
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
||||
|
||||
@@ -37,7 +39,7 @@ class FileDocumentRepository:
|
||||
def __init__(self, database: AsyncIOMotorDatabase):
|
||||
"""Initialize file repository with database connection."""
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = self.db.files
|
||||
self.collection: AsyncIOMotorCollection = self.db.documents
|
||||
self._ensure_indexes()
|
||||
|
||||
async def initialize(self):
|
||||
@@ -47,6 +49,7 @@ class FileDocumentRepository:
|
||||
Should be called after repository instantiation to setup database indexes.
|
||||
"""
|
||||
await self._ensure_indexes()
|
||||
return self
|
||||
|
||||
async def _ensure_indexes(self):
|
||||
"""
|
||||
@@ -60,26 +63,27 @@ class FileDocumentRepository:
|
||||
# Index might already exist, ignore error
|
||||
pass
|
||||
|
||||
async def create_document(self, file_data: FileDocument) -> FileDocument:
|
||||
async def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
|
||||
"""
|
||||
Create a new file document in database.
|
||||
|
||||
Args:
|
||||
file_data (FileDocument): File document data to create
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
FileDocument: Created file document with database ID
|
||||
FileDocument: Created document with database ID
|
||||
|
||||
Raises:
|
||||
ValueError: If file creation fails due to validation
|
||||
DuplicateKeyError: If file with same hash already exists
|
||||
DuplicateKeyError: If a document with same hash already exists
|
||||
"""
|
||||
try:
|
||||
file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
|
||||
if "_id" in file_dict and file_dict["_id"] is None:
|
||||
del file_dict["_id"]
|
||||
|
||||
result = await self.collection.insert_one(file_dict)
|
||||
result = await self.collection.insert_one(file_dict, **get_extra_args(session))
|
||||
file_data.id = result.inserted_id
|
||||
return file_data
|
||||
|
||||
@@ -204,13 +208,14 @@ class FileDocumentRepository:
|
||||
except PyMongoError:
|
||||
return 0
|
||||
|
||||
async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
|
||||
async def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
|
||||
"""
|
||||
Update file document with new data.
|
||||
|
||||
Args:
|
||||
file_id (str): File document ID to update
|
||||
update_data (dict): Fields to update
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
FileDocument or None: Updated file document if successful, None otherwise
|
||||
@@ -228,7 +233,8 @@ class FileDocumentRepository:
|
||||
result = await self.collection.find_one_and_update(
|
||||
{"_id": ObjectId(file_id)},
|
||||
{"$set": clean_update_data},
|
||||
return_document=True
|
||||
return_document=True,
|
||||
**get_extra_args(session)
|
||||
)
|
||||
|
||||
if result:
|
||||
@@ -238,12 +244,13 @@ class FileDocumentRepository:
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
async def delete_document(self, file_id: str) -> bool:
|
||||
async def delete_document(self, file_id: str, session=None) -> bool:
|
||||
"""
|
||||
Delete file document from database.
|
||||
|
||||
Args:
|
||||
file_id (str): File document ID to delete
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
bool: True if file was deleted, False otherwise
|
||||
@@ -252,7 +259,7 @@ class FileDocumentRepository:
|
||||
if not ObjectId.is_valid(file_id):
|
||||
return False
|
||||
|
||||
result = await self.collection.delete_one({"_id": ObjectId(file_id)})
|
||||
result = await self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
|
||||
return result.deleted_count > 0
|
||||
|
||||
except PyMongoError:
|
||||
|
||||
@@ -32,7 +32,6 @@ class UserRepository:
|
||||
"""
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = database.users
|
||||
self._ensure_indexes()
|
||||
|
||||
async def initialize(self):
|
||||
"""
|
||||
@@ -41,6 +40,7 @@ class UserRepository:
|
||||
Should be called after repository instantiation to setup database indexes.
|
||||
"""
|
||||
await self._ensure_indexes()
|
||||
return self
|
||||
|
||||
async def _ensure_indexes(self):
|
||||
"""
|
||||
|
||||
@@ -7,10 +7,11 @@ This service provides API endpoints for health checks and task dispatching.
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
import redis
|
||||
from celery import Celery
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.database.connection import test_database_connection, get_database
|
||||
from app.database.repositories.user_repository import UserRepository
|
||||
@@ -39,12 +40,11 @@ async def lifespan(app: FastAPI):
|
||||
database = get_database()
|
||||
|
||||
# Initialize repositories and services
|
||||
user_repository = UserRepository(database)
|
||||
user_service = UserService(user_repository)
|
||||
user_service = await UserService(database).initialize()
|
||||
init_service = InitializationService(user_service)
|
||||
|
||||
# Run initialization tasks
|
||||
initialization_result = init_service.initialize_application()
|
||||
initialization_result = await init_service.initialize_application()
|
||||
|
||||
if initialization_result["initialization_success"]:
|
||||
logger.info("Application startup completed successfully")
|
||||
@@ -56,6 +56,7 @@ async def lifespan(app: FastAPI):
|
||||
logger.error(f" - {error}")
|
||||
|
||||
except Exception as e:
|
||||
raise e
|
||||
logger.error(f"Critical error during application startup: {str(e)}")
|
||||
# You might want to decide if the app should continue or exit here
|
||||
# For now, we log the error but continue
|
||||
@@ -119,6 +120,7 @@ async def create_user(
|
||||
):
|
||||
return user_service.create_user(user_data)
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""
|
||||
|
||||
@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
|
||||
HYBRID = "hybrid"
|
||||
|
||||
|
||||
class ProcessingStatus(str, Enum):
    """Status values for processing jobs.

    Inherits from str so values serialize directly in Pydantic models
    and MongoDB documents.
    """

    PENDING = "pending"        # job created, not yet picked up by a worker
    PROCESSING = "processing"  # worker currently handling the job
    COMPLETED = "completed"    # processing finished successfully
    FAILED = "failed"          # processing ended with an error
|
||||
|
||||
|
||||
class FileDocument(BaseModel):
|
||||
"""
|
||||
Model for file documents stored in the 'files' collection.
|
||||
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
|
||||
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
@field_validator('filepath')
|
||||
@classmethod
|
||||
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
|
||||
if not v.strip():
|
||||
raise ValueError("Filename cannot be empty")
|
||||
return v.strip()
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
populate_by_name = True
|
||||
arbitrary_types_allowed = True
|
||||
json_encoders = {ObjectId: str}
|
||||
|
||||
|
||||
class DocumentContent(BaseModel):
|
||||
"""Model for document content."""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
content: str = Field(..., description="File content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
|
||||
class ProcessingJob(BaseModel):
|
||||
"""
|
||||
Model for processing jobs stored in the 'processing_jobs' collection.
|
||||
|
||||
Tracks the lifecycle and status of document processing tasks.
|
||||
"""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_id: PyObjectId = Field(..., description="Reference to file document")
|
||||
status: ProcessingStatus = Field(
|
||||
default=ProcessingStatus.PENDING,
|
||||
description="Current processing status"
|
||||
)
|
||||
task_id: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Celery task UUID"
|
||||
)
|
||||
created_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when job was created"
|
||||
)
|
||||
started_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when processing started"
|
||||
)
|
||||
completed_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when processing completed"
|
||||
)
|
||||
error_message: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Error message if processing failed"
|
||||
)
|
||||
|
||||
@field_validator('error_message')
|
||||
@classmethod
|
||||
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
|
||||
"""Clean up error message."""
|
||||
if v is not None:
|
||||
return v.strip() if v.strip() else None
|
||||
return v
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
populate_by_name = True
|
||||
arbitrary_types_allowed = True
|
||||
json_encoders = {ObjectId: str}
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
from bson import ObjectId
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
class ProcessingStatus(str, Enum):
    """Status values for processing jobs.

    Subclasses ``str`` so members serialize directly as their string
    values when stored in MongoDB or rendered to JSON.
    """

    PENDING = "pending"        # job created, not yet picked up by a worker
    PROCESSING = "processing"  # a worker is currently processing the job
    COMPLETED = "completed"    # processing finished successfully
    FAILED = "failed"          # processing raised an error (details in ProcessingJob.error_message)
|
||||
|
||||
|
||||
class ProcessingJob(BaseModel):
    """
    Model for processing jobs stored in the 'processing_jobs' collection.

    Tracks the lifecycle and status of document processing tasks:
    creation, start/completion timestamps, the Celery task id, and the
    error message when processing fails.
    """

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    file_id: PyObjectId = Field(..., description="Reference to file document")
    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")

    @field_validator('error_message')
    @classmethod
    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
        """Clean up error message: strip whitespace, collapse blank strings to None."""
        if v is not None:
            return v.strip() if v.strip() else None
        return v

    class Config:
        """Pydantic configuration, kept consistent with the other document models.

        Without this config the ``ObjectId`` import above is unused and the
        ``_id`` alias cannot be populated by field name.
        """
        populate_by_name = True          # allow constructing with either 'id' or '_id'
        arbitrary_types_allowed = True   # PyObjectId/ObjectId are not native pydantic types
        json_encoders = {ObjectId: str}  # serialize ObjectId values as plain strings
|
||||
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import magic
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClientSession
|
||||
import magic
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.database.connection import get_database
|
||||
from app.config.settings import get_objects_folder
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
DocumentContent,
|
||||
FileType,
|
||||
ProcessingStatus
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
@@ -34,13 +31,25 @@ class DocumentService:
|
||||
and their content while ensuring data consistency through transactions.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the document service with repository dependencies."""
|
||||
self.db = get_database()
|
||||
self.file_repository = FileDocumentRepository(self.db)
|
||||
self.content_repository = DocumentContentRepository(self.db)
|
||||
def __init__(self, database, objects_folder: str = None):
|
||||
"""
|
||||
Initialize the document service with repository dependencies.
|
||||
|
||||
def _calculate_file_hash(self, file_bytes: bytes) -> str:
|
||||
Args:
|
||||
database: Database instance
|
||||
objects_folder: folder to store files by their hash
|
||||
"""
|
||||
|
||||
self.db = database
|
||||
self.document_repository = FileDocumentRepository(self.db)
|
||||
self.objects_folder = objects_folder or get_objects_folder()
|
||||
|
||||
async def initialize(self):
|
||||
await self.document_repository.initialize()
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def _calculate_file_hash(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Calculate SHA256 hash of file content.
|
||||
|
||||
@@ -52,7 +61,8 @@ class DocumentService:
|
||||
"""
|
||||
return hashlib.sha256(file_bytes).hexdigest()
|
||||
|
||||
def _detect_file_type(self, file_path: str) -> FileType:
|
||||
@staticmethod
|
||||
def _detect_file_type(file_path: str) -> FileType:
|
||||
"""
|
||||
Detect file type from file extension.
|
||||
|
||||
@@ -72,7 +82,8 @@ class DocumentService:
|
||||
except ValueError:
|
||||
raise ValueError(f"Unsupported file type: {extension}")
|
||||
|
||||
def _detect_mime_type(self, file_bytes: bytes) -> str:
|
||||
@staticmethod
|
||||
def _detect_mime_type(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Detect MIME type from file content.
|
||||
|
||||
@@ -84,6 +95,25 @@ class DocumentService:
|
||||
"""
|
||||
return magic.from_buffer(file_bytes, mime=True)
|
||||
|
||||
def _get_document_path(self, file_hash: str) -> str:
    """
    Build the filesystem path where the content for *file_hash* is stored.

    Objects are stored in a sub-folder named after the first 24 hex
    characters of the hash, so content is sharded across directories
    rather than accumulating in a single flat folder.

    :param file_hash: SHA256 hex digest of the file content
    :return: path under ``self.objects_folder`` for this content
    """
    return os.path.join(self.objects_folder, file_hash[:24], file_hash)
|
||||
|
||||
def save_content_if_needed(self, file_hash: str, content: bytes) -> None:
    """
    Persist raw file content under its hash, skipping if already stored.

    The objects store is content-addressed: identical files share a single
    on-disk copy keyed by their SHA256 hash, so an existing file means the
    bytes are already saved.

    :param file_hash: SHA256 hex digest of *content*
    :param content: raw file bytes to persist
    """
    target_path = self._get_document_path(file_hash)
    if os.path.exists(target_path):
        # Same hash implies same bytes; nothing to write.
        return

    # exist_ok=True avoids the check-then-create race of the previous
    # exists()/makedirs() pair when two workers store the same hash.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    with open(target_path, "wb") as f:
        f.write(content)
|
||||
|
||||
async def create_document(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -115,50 +145,32 @@ class DocumentService:
|
||||
mime_type = self._detect_mime_type(file_bytes)
|
||||
file_size = len(file_bytes)
|
||||
filename = Path(file_path).name
|
||||
detected_at = datetime.utcnow()
|
||||
detected_at = datetime.now()
|
||||
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
try:
|
||||
# Check if content already exists
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash, session=session
|
||||
)
|
||||
try:
|
||||
self.save_content_if_needed(file_hash, file_bytes)
|
||||
|
||||
# Create DocumentContent if it doesn't exist
|
||||
if not existing_content:
|
||||
content_data = DocumentContent(
|
||||
file_hash=file_hash,
|
||||
content="", # Will be populated by processing workers
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
await self.content_repository.create_document_content(
|
||||
content_data, session=session
|
||||
)
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash,
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash
|
||||
)
|
||||
created_file = await self.document_repository.create_document(file_data)
|
||||
|
||||
created_file = await self.file_repository.create_document(
|
||||
file_data, session=session
|
||||
)
|
||||
return created_file
|
||||
|
||||
return created_file
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
|
||||
async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -170,7 +182,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_id(document_id)
|
||||
return await self.document_repository.find_document_by_id(str(document_id))
|
||||
|
||||
async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -182,7 +194,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_hash(file_hash)
|
||||
return await self.document_repository.find_document_by_hash(file_hash)
|
||||
|
||||
async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -194,32 +206,15 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_filepath(filepath)
|
||||
return await self.document_repository.find_document_by_filepath(filepath)
|
||||
|
||||
async def get_document_with_content(
|
||||
self,
|
||||
document_id: PyObjectId
|
||||
) -> Optional[Tuple[FileDocument, DocumentContent]]:
|
||||
"""
|
||||
Retrieve a document with its associated content.
|
||||
|
||||
Args:
|
||||
document_id: Document ObjectId
|
||||
|
||||
Returns:
|
||||
Tuple of (FileDocument, DocumentContent) if found, None otherwise
|
||||
"""
|
||||
document = await self.get_document_by_id(document_id)
|
||||
if not document:
|
||||
async def get_document_content_by_hash(self, file_hash):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if not os.path.exists(target_path):
|
||||
return None
|
||||
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash
|
||||
)
|
||||
if not content:
|
||||
return None
|
||||
|
||||
return (document, content)
|
||||
with open(target_path, "rb") as f:
|
||||
return f.read()
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
@@ -236,7 +231,7 @@ class DocumentService:
|
||||
Returns:
|
||||
List of FileDocument instances
|
||||
"""
|
||||
return await self.file_repository.list_documents(skip=skip, limit=limit)
|
||||
return await self.document_repository.list_documents(skip=skip, limit=limit)
|
||||
|
||||
async def count_documents(self) -> int:
|
||||
"""
|
||||
@@ -245,7 +240,7 @@ class DocumentService:
|
||||
Returns:
|
||||
Total document count
|
||||
"""
|
||||
return await self.file_repository.count_documents()
|
||||
return await self.document_repository.count_documents()
|
||||
|
||||
async def update_document(
|
||||
self,
|
||||
@@ -262,7 +257,12 @@ class DocumentService:
|
||||
Returns:
|
||||
Updated FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.update_document(document_id, update_data)
|
||||
if "file_bytes" in update_data:
|
||||
file_hash = self._calculate_file_hash(update_data["file_bytes"])
|
||||
update_data["file_hash"] = file_hash
|
||||
self.save_content_if_needed(file_hash, update_data["file_bytes"])
|
||||
|
||||
return await self.document_repository.update_document(document_id, update_data)
|
||||
|
||||
async def delete_document(self, document_id: PyObjectId) -> bool:
|
||||
"""
|
||||
@@ -281,100 +281,31 @@ class DocumentService:
|
||||
Raises:
|
||||
PyMongoError: If database operation fails
|
||||
"""
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
# Start transaction
|
||||
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.document_repository.find_document_by_id(document_id)
|
||||
if not document:
|
||||
return False
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.document_repository.delete_document(document_id)
|
||||
if not deleted:
|
||||
return False
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.file_repository.find_document_by_id(
|
||||
document_id, session=session
|
||||
)
|
||||
if not document:
|
||||
return False
|
||||
os.remove(self._get_document_path(document.file_hash))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.file_repository.delete_document(
|
||||
document_id, session=session
|
||||
)
|
||||
if not deleted:
|
||||
return False
|
||||
return True
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.file_repository.find_document_by_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
if content:
|
||||
await self.content_repository.delete_document_content(
|
||||
content.id, session=session
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
async def content_exists(self, file_hash: str) -> bool:
|
||||
"""
|
||||
Check if content with given hash exists.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
True if content exists, False otherwise
|
||||
"""
|
||||
return await self.content_repository.content_exists(file_hash)
|
||||
|
||||
async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Retrieve content by file hash.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
DocumentContent if found, None otherwise
|
||||
"""
|
||||
return await self.content_repository.find_document_content_by_file_hash(file_hash)
|
||||
|
||||
async def update_document_content(
|
||||
self,
|
||||
file_hash: str,
|
||||
content: str,
|
||||
encoding: str = "utf-8"
|
||||
) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Update the extracted content for a document.
|
||||
|
||||
This method is typically called by processing workers to store
|
||||
the extracted text content.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
content: Extracted text content
|
||||
encoding: Character encoding
|
||||
|
||||
Returns:
|
||||
Updated DocumentContent if found, None otherwise
|
||||
"""
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash
|
||||
)
|
||||
if not existing_content:
|
||||
return None
|
||||
|
||||
update_data = {
|
||||
"content": content,
|
||||
"encoding": encoding
|
||||
}
|
||||
|
||||
return await self.content_repository.update_document_content(
|
||||
existing_content.id, update_data
|
||||
)
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
@@ -33,7 +33,7 @@ class InitializationService:
|
||||
self.user_service = user_service
|
||||
|
||||
|
||||
def ensure_admin_user_exists(self) -> Optional[UserInDB]:
|
||||
async def ensure_admin_user_exists(self) -> Optional[UserInDB]:
|
||||
"""
|
||||
Ensure default admin user exists in the system.
|
||||
|
||||
@@ -49,7 +49,7 @@ class InitializationService:
|
||||
logger.info("Checking if admin user exists...")
|
||||
|
||||
# Check if any admin user already exists
|
||||
if self._admin_user_exists():
|
||||
if await self._admin_user_exists():
|
||||
logger.info("Admin user already exists, skipping creation")
|
||||
return None
|
||||
|
||||
@@ -64,7 +64,7 @@ class InitializationService:
|
||||
role=UserRole.ADMIN
|
||||
)
|
||||
|
||||
created_user = self.user_service.create_user(admin_data)
|
||||
created_user = await self.user_service.create_user(admin_data)
|
||||
logger.info(f"Default admin user created successfully with ID: {created_user.id}")
|
||||
logger.warning(
|
||||
"Default admin user created with username 'admin' and password 'admin'. "
|
||||
@@ -77,7 +77,7 @@ class InitializationService:
|
||||
logger.error(f"Failed to create default admin user: {str(e)}")
|
||||
raise Exception(f"Admin user creation failed: {str(e)}")
|
||||
|
||||
def _admin_user_exists(self) -> bool:
|
||||
async def _admin_user_exists(self) -> bool:
|
||||
"""
|
||||
Check if any admin user exists in the system.
|
||||
|
||||
@@ -86,7 +86,7 @@ class InitializationService:
|
||||
"""
|
||||
try:
|
||||
# Get all users and check if any have admin role
|
||||
users = self.user_service.list_users(limit=1000) # Reasonable limit for admin check
|
||||
users = await self.user_service.list_users(limit=1000) # Reasonable limit for admin check
|
||||
|
||||
for user in users:
|
||||
if user.role == UserRole.ADMIN and user.is_active:
|
||||
@@ -99,7 +99,7 @@ class InitializationService:
|
||||
# In case of error, assume admin exists to avoid creating duplicates
|
||||
return True
|
||||
|
||||
def initialize_application(self) -> dict:
|
||||
async def initialize_application(self) -> dict:
|
||||
"""
|
||||
Perform all application initialization tasks.
|
||||
|
||||
@@ -119,7 +119,7 @@ class InitializationService:
|
||||
|
||||
try:
|
||||
# Ensure admin user exists
|
||||
created_admin = self.ensure_admin_user_exists()
|
||||
created_admin = await self.ensure_admin_user_exists()
|
||||
if created_admin:
|
||||
initialization_summary["admin_user_created"] = True
|
||||
|
||||
|
||||
@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
|
||||
"""
|
||||
|
||||
from typing import Optional, List
|
||||
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
|
||||
from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
|
||||
from app.models.auth import UserRole
|
||||
from app.database.repositories.user_repository import UserRepository
|
||||
from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
|
||||
from app.services.auth_service import AuthService
|
||||
|
||||
|
||||
@@ -22,17 +22,22 @@ class UserService:
|
||||
authentication, and data management with proper validation.
|
||||
"""
|
||||
|
||||
def __init__(self, user_repository: UserRepository):
|
||||
def __init__(self, database):
|
||||
"""
|
||||
Initialize user service with repository dependency.
|
||||
|
||||
Args:
|
||||
user_repository (UserRepository): Repository for user data operations
|
||||
"""
|
||||
self.user_repository = user_repository
|
||||
self.db = database
|
||||
self.user_repository = UserRepository(self.db)
|
||||
self.auth_service = AuthService()
|
||||
|
||||
def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
|
||||
async def initialize(self):
|
||||
await self.user_repository.initialize()
|
||||
return self
|
||||
|
||||
async def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
|
||||
"""
|
||||
Create a new user with business logic validation.
|
||||
|
||||
@@ -55,11 +60,11 @@ class UserService:
|
||||
raise ValueError(f"User with email '{user_data.email}' already exists")
|
||||
|
||||
try:
|
||||
return self.user_repository.create_user(user_data)
|
||||
return await self.user_repository.create_user(user_data)
|
||||
except DuplicateKeyError:
|
||||
raise ValueError(f"User with username '{user_data.username}' already exists")
|
||||
|
||||
def get_user_by_username(self, username: str) -> Optional[UserInDB]:
|
||||
async def get_user_by_username(self, username: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Retrieve user by username.
|
||||
|
||||
@@ -69,9 +74,9 @@ class UserService:
|
||||
Returns:
|
||||
UserInDB or None: User if found, None otherwise
|
||||
"""
|
||||
return self.user_repository.find_user_by_username(username)
|
||||
return await self.user_repository.find_user_by_username(username)
|
||||
|
||||
def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
|
||||
async def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Retrieve user by ID.
|
||||
|
||||
@@ -81,9 +86,9 @@ class UserService:
|
||||
Returns:
|
||||
UserInDB or None: User if found, None otherwise
|
||||
"""
|
||||
return self.user_repository.find_user_by_id(user_id)
|
||||
return await self.user_repository.find_user_by_id(user_id)
|
||||
|
||||
def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
|
||||
async def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Authenticate user with username and password.
|
||||
|
||||
@@ -106,7 +111,7 @@ class UserService:
|
||||
|
||||
return user
|
||||
|
||||
def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
|
||||
async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
|
||||
"""
|
||||
Update user information.
|
||||
|
||||
@@ -132,9 +137,9 @@ class UserService:
|
||||
if existing_user and str(existing_user.id) != user_id:
|
||||
raise ValueError(f"Email '{user_update.email}' is already taken")
|
||||
|
||||
return self.user_repository.update_user(user_id, user_update)
|
||||
return await self.user_repository.update_user(user_id, user_update)
|
||||
|
||||
def delete_user(self, user_id: str) -> bool:
|
||||
async def delete_user(self, user_id: str) -> bool:
|
||||
"""
|
||||
Delete user from system.
|
||||
|
||||
@@ -146,7 +151,7 @@ class UserService:
|
||||
"""
|
||||
return self.user_repository.delete_user(user_id)
|
||||
|
||||
def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
|
||||
async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
|
||||
"""
|
||||
List users with pagination.
|
||||
|
||||
@@ -157,18 +162,18 @@ class UserService:
|
||||
Returns:
|
||||
List[UserInDB]: List of users
|
||||
"""
|
||||
return self.user_repository.list_users(skip=skip, limit=limit)
|
||||
return await self.user_repository.list_users(skip=skip, limit=limit)
|
||||
|
||||
def count_users(self) -> int:
|
||||
async def count_users(self) -> int:
|
||||
"""
|
||||
Count total number of users.
|
||||
|
||||
Returns:
|
||||
int: Total number of users in system
|
||||
"""
|
||||
return self.user_repository.count_users()
|
||||
return await self.user_repository.count_users()
|
||||
|
||||
def user_exists(self, username: str) -> bool:
|
||||
async def user_exists(self, username: str) -> bool:
|
||||
"""
|
||||
Check if user exists by username.
|
||||
|
||||
@@ -178,4 +183,4 @@ class UserService:
|
||||
Returns:
|
||||
bool: True if user exists, False otherwise
|
||||
"""
|
||||
return self.user_repository.user_exists(username)
|
||||
return await self.user_repository.user_exists(username)
|
||||
|
||||
0
tests/database/__init__.py
Normal file
0
tests/database/__init__.py
Normal file
0
tests/models/__init__.py
Normal file
0
tests/models/__init__.py
Normal file
0
tests/repositories/__init__.py
Normal file
0
tests/repositories/__init__.py
Normal file
672
tests/repositories/test_document_repository.py
Normal file
672
tests/repositories/test_document_repository.py
Normal file
@@ -0,0 +1,672 @@
|
||||
"""
|
||||
Test suite for FileDocumentRepository with async/await support.
|
||||
|
||||
This module contains comprehensive tests for all FileDocumentRepository methods
|
||||
using mongomock-motor for in-memory MongoDB testing.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import pytest_asyncio
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from mongomock_motor import AsyncMongoMockClient
|
||||
|
||||
from app.database.repositories.document_repository import (
|
||||
FileDocumentRepository,
|
||||
MatchMethodBase,
|
||||
SubsequenceMatching,
|
||||
FuzzyMatching
|
||||
)
|
||||
from app.models.document import FileDocument, FileType, ExtractionMethod
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_repository():
    """Provide a FileDocumentRepository backed by an in-memory mock MongoDB."""
    mock_client = AsyncMongoMockClient()
    repository = FileDocumentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_document():
    """Build one fully-populated FileDocument used across the test suite."""
    attributes = dict(
        filename="sample_document.pdf",
        filepath="/home/user/documents/sample_document.pdf",
        file_type=FileType.PDF,
        extraction_method=ExtractionMethod.OCR,
        metadata={"pages": 5, "language": "en", "author": "John Doe"},
        detected_at=datetime.now(),
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        encoding="utf-8",
        file_size=1024000,
        mime_type="application/pdf",
    )
    return FileDocument(**attributes)
|
||||
|
||||
|
||||
@pytest.fixture
def sample_update_data():
    """Fields used to exercise document update operations."""
    return dict(
        extraction_method=ExtractionMethod.HYBRID,
        metadata={"pages": 10, "language": "fr", "updated": True},
        file_size=2048000,
    )
|
||||
|
||||
|
||||
@pytest.fixture
def multiple_sample_files():
    """Three FileDocument objects (TXT, PDF, DOCX) for list/search testing."""
    detected = datetime.now()
    # (filename, file_type, extraction_method, metadata, hash prefix, size, mime)
    specs = [
        ("first_doc.txt", FileType.TXT, ExtractionMethod.DIRECT_TEXT,
         {"words": 500}, "hash1", 5000, "text/plain"),
        ("second_document.pdf", FileType.PDF, ExtractionMethod.OCR,
         {"pages": 8}, "hash2", 10000, "application/pdf"),
        ("third_file.docx", FileType.DOCX, ExtractionMethod.HYBRID,
         {"paragraphs": 15}, "hash3", 15000,
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
    ]
    return [
        FileDocument(
            filename=name,
            filepath=f"/docs/{name}",
            file_type=ftype,
            extraction_method=method,
            metadata=meta,
            detected_at=detected,
            file_hash=hash_prefix + "0" * 58,  # pad to 64-char hash length
            encoding="utf-8",
            file_size=size,
            mime_type=mime,
        )
        for name, ftype, method, meta, hash_prefix, size, mime in specs
    ]
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryInitialization:
    """Tests for repository initialization."""

    @pytest.mark.asyncio
    async def test_i_can_initialize_repository(self):
        """Initializing a fresh repository wires up its db and collection handles."""
        # Arrange: repository on a brand-new in-memory database.
        mock_client = AsyncMongoMockClient()
        repository = FileDocumentRepository(mock_client.test_database)
        await repository.initialize()

        # Act & Assert (should not raise any exception)
        assert repository.db is not None
        assert repository.collection is not None
        # TODO : check that the indexes are created
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryCreation:
    """Tests for file document creation functionality."""

    @pytest.mark.asyncio
    async def test_i_can_create_file_document(self, in_memory_repository, sample_file_document):
        """Test successful file document creation."""
        # Act
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Assert
        # Every persisted field should round-trip unchanged, and the
        # repository must return a document carrying a real ObjectId.
        assert created_file is not None
        assert created_file.filename == sample_file_document.filename
        assert created_file.filepath == sample_file_document.filepath
        assert created_file.file_type == sample_file_document.file_type
        assert created_file.extraction_method == sample_file_document.extraction_method
        assert created_file.metadata == sample_file_document.metadata
        assert created_file.file_hash == sample_file_document.file_hash
        assert created_file.file_size == sample_file_document.file_size
        assert created_file.mime_type == sample_file_document.mime_type
        assert created_file.id is not None
        assert isinstance(created_file.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document):
        """Test creating file document with _id set to None (should be removed)."""
        # Arrange
        sample_file_document.id = None

        # Act
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Assert
        # A null _id must be dropped before insert so MongoDB generates one.
        assert created_file is not None
        assert created_file.id is not None
        assert isinstance(created_file.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_cannot_create_duplicate_file_document(self, in_memory_repository, sample_file_document):
        """Test that creating file document with duplicate filepath raises DuplicateKeyError."""
        # Arrange
        await in_memory_repository.create_document(sample_file_document)
        # Same filepath as the existing document, everything else different —
        # filepath alone is what should trigger the duplicate rejection
        # (presumably enforced by a unique index; see repository.initialize).
        duplicate_file = FileDocument(
            filename="different_name.pdf",
            filepath=sample_file_document.filepath,  # Same filepath
            file_type=FileType.PDF,
            extraction_method=ExtractionMethod.OCR,
            metadata={"different": "metadata"},
            detected_at=datetime.now(),
            file_hash="different_hash_123456789012345678901234567890123456789012345678",
            encoding="utf-8",
            file_size=2000,
            mime_type="application/pdf"
        )

        # Act & Assert
        with pytest.raises(DuplicateKeyError) as exc_info:
            await in_memory_repository.create_document(duplicate_file)

        assert "already exists" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository,
                                                                    sample_file_document, mocker):
        """Test handling of PyMongo errors during file document creation."""
        # Arrange
        # Force the underlying insert to fail so the repository's error
        # translation (PyMongoError -> ValueError) can be observed.
        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))

        # Act & Assert
        with pytest.raises(ValueError) as exc_info:
            await in_memory_repository.create_document(sample_file_document)

        assert "Failed to create file document" in str(exc_info.value)
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryFinding:
    """Tests for file document finding functionality.

    Covers the three lookup keys the repository exposes — ObjectId, file
    hash, and filepath — plus the not-found, malformed-id, and driver-error
    paths.  Every lookup method is expected to return ``None`` on failure
    rather than raise.
    """

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
        """Test finding file document by valid ObjectId."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Act
        # The id is passed as a string; the repository presumably converts it
        # back to ObjectId internally — confirmed by the round-trip assert below.
        found_file = await in_memory_repository.find_document_by_id(str(created_file.id))

        # Assert
        assert found_file is not None
        assert found_file.id == created_file.id
        assert found_file.filename == created_file.filename
        assert found_file.filepath == created_file.filepath

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
        """Test that invalid ObjectId returns None."""
        # Act: "invalid_id" is not a valid 24-hex ObjectId string.
        found_file = await in_memory_repository.find_document_by_id("invalid_id")

        # Assert
        assert found_file is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange: a freshly generated ObjectId cannot exist in the collection.
        nonexistent_id = str(ObjectId())

        # Act
        found_file = await in_memory_repository.find_document_by_id(nonexistent_id)

        # Assert
        assert found_file is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document):
        """Test finding file document by file hash."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_file = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)

        # Assert
        assert found_file is not None
        assert found_file.file_hash == created_file.file_hash
        assert found_file.id == created_file.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository):
        """Test that nonexistent file hash returns None."""
        # Act
        found_file = await in_memory_repository.find_document_by_hash("nonexistent_hash")

        # Assert
        assert found_file is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
        """Test finding file document by filepath."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_file = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)

        # Assert
        assert found_file is not None
        assert found_file.filepath == created_file.filepath
        assert found_file.id == created_file.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
        """Test that nonexistent filepath returns None."""
        # Act
        found_file = await in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf")

        # Assert
        assert found_file is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document finding."""
        # Arrange: make the underlying find_one blow up so the repository's
        # error handling (swallow and return None) is exercised.
        mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))

        # Act
        found_file = await in_memory_repository.find_document_by_hash("test_hash")

        # Assert
        assert found_file is None
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryNameMatching:
    """Tests for file document name matching functionality.

    Exercises ``find_document_by_name`` with an explicit ``FuzzyMatching``
    strategy, an explicit ``SubsequenceMatching`` strategy, and the
    repository's default strategy, plus the driver-error path (which must
    return an empty list, not raise).
    """

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files):
        """Test finding file documents by filename using fuzzy matching."""
        # Arrange
        for file_doc in multiple_sample_files:
            await in_memory_repository.create_document(file_doc)

        # Act
        # A lowered threshold (0.5 vs the 0.6 default) widens the match set.
        fuzzy_method = FuzzyMatching(threshold=0.5)
        found_files = await in_memory_repository.find_document_by_name("document", fuzzy_method)

        # Assert
        assert len(found_files) >= 1
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
        # Should find files with "document" in the name
        found_filenames = [f.filename for f in found_files]
        assert any("document" in fname.lower() for fname in found_filenames)

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository,
                                                                          multiple_sample_files):
        """Test finding file documents by filename using subsequence matching."""
        # Arrange
        for file_doc in multiple_sample_files:
            await in_memory_repository.create_document(file_doc)

        # Act
        subsequence_method = SubsequenceMatching()
        found_files = await in_memory_repository.find_document_by_name("doc", subsequence_method)

        # Assert
        assert len(found_files) >= 1
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files):
        """Test finding file documents by filename with default matching method."""
        # Arrange
        for file_doc in multiple_sample_files:
            await in_memory_repository.create_document(file_doc)

        # Act
        found_files = await in_memory_repository.find_document_by_name("first")

        # Assert
        # NOTE(review): `len(found_files) >= 0` is always true, so this test
        # only verifies the call succeeds and returns FileDocument instances.
        # Consider asserting an expected match count against the fixture data.
        assert len(found_files) >= 0
        assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)

    @pytest.mark.asyncio
    async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during document name matching."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        found_files = await in_memory_repository.find_document_by_name("test")

        # Assert
        assert found_files == []
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryListing:
    """Tests for file document listing functionality.

    Verifies default and custom skip/limit pagination, the descending
    ``detected_at`` sort order, the empty-collection case, and the
    driver-error path (empty list, no exception).
    """

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files):
        """Test listing file documents with default pagination."""
        # Arrange
        for file_doc in multiple_sample_files:
            await in_memory_repository.create_document(file_doc)

        # Act
        files = await in_memory_repository.list_documents()

        # Assert: default limit is large enough to return every fixture file.
        assert len(files) == len(multiple_sample_files)
        assert all(isinstance(file_doc, FileDocument) for file_doc in files)

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files):
        """Test listing file documents with custom pagination."""
        # Arrange
        for file_doc in multiple_sample_files:
            await in_memory_repository.create_document(file_doc)

        # Act
        files_page1 = await in_memory_repository.list_documents(skip=0, limit=2)
        files_page2 = await in_memory_repository.list_documents(skip=2, limit=2)

        # Assert
        # Assumes the multiple_sample_files fixture provides exactly 3 files.
        assert len(files_page1) == 2
        assert len(files_page2) == 1  # Only 3 total files

        # Ensure no overlap between pages
        page1_ids = [file_doc.id for file_doc in files_page1]
        page2_ids = [file_doc.id for file_doc in files_page2]
        assert len(set(page1_ids).intersection(set(page2_ids))) == 0

    @pytest.mark.asyncio
    async def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document):
        """Test that file documents are sorted by detected_at in descending order."""
        # Arrange: two copies of the fixture differing only in unique fields
        # (filepath, filename, hash) and in detected_at, which drives the sort.
        file1 = sample_file_document.model_copy()
        file1.filepath = "/docs/file1.pdf"
        file1.filename = "file1.pdf"
        file1.file_hash = "hash1" + "0" * 58
        file1.detected_at = datetime(2024, 1, 1, 10, 0, 0)

        file2 = sample_file_document.model_copy()
        file2.filepath = "/docs/file2.pdf"
        file2.filename = "file2.pdf"
        file2.file_hash = "hash2" + "0" * 58
        file2.detected_at = datetime(2024, 1, 2, 10, 0, 0)  # Later date

        created_file1 = await in_memory_repository.create_document(file1)
        created_file2 = await in_memory_repository.create_document(file2)

        # Act
        files = await in_memory_repository.list_documents()

        # Assert
        assert len(files) == 2
        # Most recent (latest detected_at) should be first
        assert files[0].id == created_file2.id
        assert files[1].id == created_file1.id

    @pytest.mark.asyncio
    async def test_i_can_list_empty_documents(self, in_memory_repository):
        """Test listing file documents from empty collection."""
        # Act
        files = await in_memory_repository.list_documents()

        # Assert
        assert files == []

    @pytest.mark.asyncio
    async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document listing."""
        # Arrange
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        files = await in_memory_repository.list_documents()

        # Assert
        assert files == []
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryUpdate:
    """Tests for file document update functionality.

    Verifies full and partial updates, None-value filtering, the empty-dict
    no-op case, and the failure paths (malformed id, missing document,
    driver error) — all of which must return ``None`` rather than raise.
    """

    @pytest.mark.asyncio
    async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                                      sample_update_data):
        """Test successful file document update."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Act
        updated_file = await in_memory_repository.update_document(str(created_file.id), sample_update_data)

        # Assert: updated fields take the new values...
        assert updated_file is not None
        assert updated_file.extraction_method == sample_update_data["extraction_method"]
        assert updated_file.metadata == sample_update_data["metadata"]
        assert updated_file.file_size == sample_update_data["file_size"]
        # ...while identity and untouched fields are preserved.
        assert updated_file.id == created_file.id
        assert updated_file.filename == created_file.filename  # Unchanged fields remain
        assert updated_file.filepath == created_file.filepath

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
        """Test updating file document with partial data."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)
        partial_update = {"file_size": 999999}

        # Act
        updated_file = await in_memory_repository.update_document(str(created_file.id), partial_update)

        # Assert
        assert updated_file is not None
        assert updated_file.file_size == 999999
        assert updated_file.filename == created_file.filename  # Should remain unchanged
        assert updated_file.metadata == created_file.metadata  # Should remain unchanged

    @pytest.mark.asyncio
    async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
        """Test that None values are filtered out from update data."""
        # Arrange: metadata=None must NOT overwrite the stored metadata.
        created_file = await in_memory_repository.create_document(sample_file_document)
        update_with_none = {"file_size": 777777, "metadata": None}

        # Act
        updated_file = await in_memory_repository.update_document(str(created_file.id), update_with_none)

        # Assert
        assert updated_file is not None
        assert updated_file.file_size == 777777
        assert updated_file.metadata == created_file.metadata  # Should remain unchanged (None filtered out)

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
        """Test updating file document with empty data returns current document."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)
        empty_update = {}

        # Act: an empty update is a no-op that still returns the document.
        result = await in_memory_repository.update_document(str(created_file.id), empty_update)

        # Assert
        assert result is not None
        assert result.filename == created_file.filename
        assert result.filepath == created_file.filepath
        assert result.metadata == created_file.metadata

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
        """Test that updating with invalid ID returns None."""
        # Act
        result = await in_memory_repository.update_document("invalid_id", sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
        """Test that updating nonexistent file document returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                               sample_update_data, mocker):
        """Test handling of PyMongo errors during file document update."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                            side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.update_document(str(created_file.id), sample_update_data)

        # Assert
        assert result is None
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryDeletion:
    """Tests for file document deletion functionality.

    ``delete_document`` returns a boolean rather than raising: True on a
    successful delete, False for malformed ids, missing documents, and
    driver errors.
    """

    @pytest.mark.asyncio
    async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
        """Test successful file document deletion."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)

        # Act
        deletion_result = await in_memory_repository.delete_document(str(created_file.id))

        # Assert
        assert deletion_result is True

        # Verify document is actually deleted
        found_file = await in_memory_repository.find_document_by_id(str(created_file.id))
        assert found_file is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act: "invalid_id" cannot be parsed as an ObjectId.
        result = await in_memory_repository.delete_document("invalid_id")

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
        """Test that deleting nonexistent file document returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.delete_document(nonexistent_id)

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """Test handling of PyMongo errors during file document deletion."""
        # Arrange
        created_file = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.delete_document(str(created_file.id))

        # Assert
        assert result is False
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryUtilities:
    """Tests for utility methods (document counting)."""

    @pytest.mark.asyncio
    async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
        """Test counting file documents."""
        # Arrange: snapshot the count, then insert one document.
        count_before = await in_memory_repository.count_documents()
        await in_memory_repository.create_document(sample_file_document)

        # Act
        count_after = await in_memory_repository.count_documents()

        # Assert: exactly one more document than before.
        assert count_after == count_before + 1

    @pytest.mark.asyncio
    async def test_i_can_count_zero_documents(self, in_memory_repository):
        """Test counting file documents in empty collection."""
        # Act & Assert: a fresh repository holds nothing.
        assert await in_memory_repository.count_documents() == 0

    @pytest.mark.asyncio
    async def test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during file document counting."""
        # Arrange: simulate a driver failure on count_documents.
        mocker.patch.object(in_memory_repository.collection, 'count_documents', side_effect=PyMongoError("Database error"))

        # Act
        total = await in_memory_repository.count_documents()

        # Assert: errors degrade to a zero count instead of raising.
        assert total == 0
|
||||
|
||||
|
||||
class TestMatchingMethods:
    """Tests for matching method classes."""

    def test_i_can_create_fuzzy_matching_with_default_threshold(self):
        """Test creating FuzzyMatching with default threshold."""
        # Act & Assert: the default similarity threshold is 0.6.
        matcher = FuzzyMatching()
        assert matcher.threshold == 0.6

    def test_i_can_create_fuzzy_matching_with_custom_threshold(self):
        """Test creating FuzzyMatching with custom threshold."""
        # Act & Assert: an explicit threshold is stored as given.
        matcher = FuzzyMatching(threshold=0.8)
        assert matcher.threshold == 0.8

    def test_i_can_create_subsequence_matching(self):
        """Test creating SubsequenceMatching."""
        # Act
        matcher = SubsequenceMatching()

        # Assert: it is a concrete strategy implementing the matching base.
        assert isinstance(matcher, MatchMethodBase)
        assert isinstance(matcher, SubsequenceMatching)
|
||||
0
tests/services/__init__.py
Normal file
0
tests/services/__init__.py
Normal file
587
tests/services/test_document_service.py
Normal file
587
tests/services/test_document_service.py
Normal file
@@ -0,0 +1,587 @@
|
||||
"""
|
||||
Unit tests for DocumentService using in-memory MongoDB.
|
||||
|
||||
Tests the orchestration logic with real MongoDB operations
|
||||
using mongomock for better integration testing.
|
||||
"""
|
||||
import os
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from bson import ObjectId
|
||||
from mongomock_motor import AsyncMongoMockClient
|
||||
|
||||
from app.models.document import FileType
|
||||
from app.services.document_service import DocumentService
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup_test_folder():
    """Remove the on-disk objects folder before AND after every test.

    The previous version only cleaned up before each test, which left the
    "test_folder" directory behind after the last test of the session.
    Converting to a yield fixture keeps the pre-test guarantee (every test
    starts from a clean filesystem state) and adds teardown so no residue
    survives the run.
    """
    import shutil  # local import keeps the fixture self-contained

    shutil.rmtree("test_folder", ignore_errors=True)  # fresh state going in
    yield
    shutil.rmtree("test_folder", ignore_errors=True)  # no residue going out
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide a throwaway mongomock-backed database for a single test."""
    # A fresh mock client per test guarantees no state leaks between tests.
    mock_client = AsyncMongoMockClient()
    return mock_client.test_database
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def document_service(in_memory_database):
    """Build a DocumentService wired to the in-memory database.

    File content is persisted under the "test_folder" directory on disk.
    """
    return DocumentService(in_memory_database, objects_folder="test_folder")
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_bytes():
    """Deterministic fake PDF payload shared by the service tests."""
    payload = b"This is a test PDF content"
    return payload
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_bytes():
    """Deterministic fake plain-text payload for text-file scenarios."""
    payload = b"This is a test text file content"
    return payload
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_hash():
    """SHA-256 hex digest of the ``sample_file_bytes`` payload.

    Computed over the same literal bytes the payload fixture returns, so
    the two fixtures stay consistent with each other.
    """
    import hashlib

    digest = hashlib.sha256(b"This is a test PDF content")
    return digest.hexdigest()
|
||||
|
||||
|
||||
def validate_file_saved(document_service, file_hash, file_bytes):
    """Assert that *file_bytes* was persisted under the service's objects folder.

    The content-addressed layout is ``<objects_folder>/<hash[:24]>/<hash>``.
    """
    # Build the expected content-addressed path.
    shard = file_hash[:24]
    expected_path = os.path.join(document_service.objects_folder, shard, file_hash)

    # The file must exist on disk...
    assert os.path.exists(expected_path)

    # ...and hold exactly the bytes that were handed to the service.
    with open(expected_path, "rb") as handle:
        assert handle.read() == file_bytes
|
||||
|
||||
|
||||
class TestCreateDocument:
    """Tests for create_document method.

    ``magic.from_buffer`` and ``datetime`` are patched where the service
    module imported them ('app.services.document_service.…') so MIME
    detection and timestamps are deterministic.  Note that ``@patch``
    decorators inject mock arguments bottom-up: the patch nearest the
    function supplies the first mock parameter.
    """

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,  # from the nearest @patch (datetime)
        mock_magic,     # from the outer @patch (magic.from_buffer)
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content doesn't exist yet."""
        # Setup mocks: freeze "now" and force PDF MIME detection.
        fixed_time = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify document created in database
        doc_in_db = await document_service.document_repository.find_document_by_id(result.id)
        assert doc_in_db is not None
        assert doc_in_db.id == result.id
        assert doc_in_db.filename == result.filename
        assert doc_in_db.filepath == result.filepath
        assert doc_in_db.file_type == result.file_type
        assert doc_in_db.detected_at == fixed_time
        assert doc_in_db.file_hash == result.file_hash

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, sample_file_bytes)

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content already exists (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create first document
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Create second document with same content
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify both documents exist but share same hash
        # (content-addressed storage deduplicates by hash, not by path).
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that empty file path raises ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes — an empty file is still a valid document.
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, b"")
|
||||
|
||||
|
||||
class TestGetMethods:
    """Tests for document retrieval methods.

    Each positive case first creates a document through the service (with
    MIME detection mocked) and then retrieves it back by id, hash, or
    filepath; negative cases verify missing documents yield ``None``.
    """

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by ID."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_id(created_doc.id)

        # Verify
        assert result is not None
        assert result.id == created_doc.id
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_hash(created_doc.file_hash)

        # Verify
        assert result is not None
        assert result.file_hash == created_doc.file_hash
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file path."""
        # Setup
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_filepath(test_path)

        # Verify
        assert result is not None
        assert result.filepath == test_path
        assert result.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document with associated content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute: content comes back from the filesystem object store.
        result = await document_service.get_document_content_by_hash(created_doc.file_hash)

        # Verify: the exact bytes round-trip through the store.
        assert result == sample_file_bytes

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Test that nonexistent document returns None."""
        # Execute with random ObjectId
        result = await document_service.get_document_by_id(ObjectId())

        # Verify
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Test that nonexistent document hash returns None."""
        # Execute
        result = await document_service.get_document_by_hash("nonexistent_hash")

        # Verify
        assert result is None
|
||||
|
||||
|
||||
class TestPaginationAndCounting:
    """Tests for document listing and counting.

    Documents are created with per-index byte suffixes so each one has a
    distinct content hash and is stored separately.
    """

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document listing with pagination parameters."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create multiple documents
        for i in range(5):
            await document_service.create_document(
                f"/test/test{i}.pdf",
                sample_file_bytes + bytes(str(i), 'utf-8'),  # Make each file unique
                "utf-8"
            )

        # Execute with pagination: skip the first, take the next two.
        result = await document_service.list_documents(skip=1, limit=2)

        # Verify
        assert len(result) == 2

        # Test counting: pagination does not affect the total count.
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document counting."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Initially should be 0
        initial_count = await document_service.count_documents()
        assert initial_count == 0

        # Create some documents
        for i in range(3):
            await document_service.create_document(
                f"/test/test{i}.txt",
                sample_file_bytes + bytes(str(i), 'utf-8'),
                "utf-8"
            )

        # Execute
        final_count = await document_service.count_documents()

        # Verify
        assert final_count == 3
|
||||
|
||||
|
||||
class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Updating metadata changes only the metadata field of the document."""
        mock_magic.return_value = "application/pdf"

        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        update_data = {"metadata": {"page_count": 5}}
        result = await document_service.update_document(created_doc.id, update_data)

        # Metadata was replaced; every identifying field stayed untouched.
        assert result is not None
        assert result.metadata.get("page_count") == 5
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash == created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == update_data['metadata']

    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
            self,
            document_service,
            sample_file_bytes
    ):
        """Updating file bytes rewrites the stored object and re-hashes it."""
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        update_data = {"file_bytes": b"this is an updated file content"}
        result = await document_service.update_document(created_doc.id, update_data)

        # Same identity, new hash (content changed), everything else preserved.
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash != created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == created_doc.metadata

        # The new bytes must be persisted to disk under the new hash.
        validate_file_saved(document_service, result.file_hash, b"this is an updated file content")

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Deleting the last document referencing a hash removes the stored file too."""
        mock_magic.return_value = "application/pdf"

        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Content must exist on disk before deletion.
        validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)

        result = await document_service.delete_document(created_doc.id)

        assert result is True

        deleted_doc = await document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        # Objects are laid out as <objects_folder>/<hash[:24]>/<full hash>
        # (mirroring what validate_file_saved checks — confirm against
        # DocumentService's save logic). The previous version truncated the
        # hash BEFORE building the path, so the filename component was also
        # the 24-char prefix and the assertion probed a path that never
        # exists, passing vacuously even if cleanup were broken.
        file_hash = created_doc.file_hash
        target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
        assert not os.path.exists(target_file_path)

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Deleting one of two documents sharing a hash keeps the stored file."""
        mock_magic.return_value = "application/pdf"

        # Two documents with identical bytes deduplicate onto one object.
        doc1 = await document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )

        doc2 = await document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        assert doc1.file_hash == doc2.file_hash

        result = await document_service.delete_document(doc1.id)
        assert result is True

        # First document is gone...
        deleted_doc = await document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None

        # ...but the second one and the shared bytes survive.
        remaining_doc = await document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None

        validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)
|
||||
|
||||
|
||||
class TestHashCalculation:
    """Tests for file hash calculation utility."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice yields the same 64-char SHA256 hex digest."""
        payload = b"Test content for hashing"

        first = document_service._calculate_file_hash(payload)
        second = document_service._calculate_file_hash(payload)

        assert first == second
        # SHA256 hex digests are always 64 characters long.
        assert len(first) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Distinct byte payloads must not collide."""
        digests = [
            document_service._calculate_file_hash(b"First content"),
            document_service._calculate_file_hash(b"Second content"),
        ]

        assert digests[0] != digests[1]
|
||||
|
||||
|
||||
class TestFileTypeDetection:
    """Tests for file type detection."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A .pdf path maps to FileType.PDF."""
        assert document_service._detect_file_type("/path/to/document.pdf") == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A .txt path maps to FileType.TXT."""
        assert document_service._detect_file_type("/path/to/document.txt") == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A .docx path maps to FileType.DOCX."""
        assert document_service._detect_file_type("/path/to/document.docx") == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unknown extension raises ValueError mentioning the unsupported type."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type("/path/to/document.xyz")
|
||||
@@ -1,187 +0,0 @@
|
||||
"""
|
||||
Unit tests for MongoDB database connection module.
|
||||
|
||||
Tests the database connection functionality with mocking
|
||||
to avoid requiring actual MongoDB instance during tests.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
||||
|
||||
from app.database.connection import (
|
||||
create_mongodb_client,
|
||||
get_database,
|
||||
close_database_connection,
|
||||
get_mongodb_client,
|
||||
test_database_connection
|
||||
)
|
||||
|
||||
|
||||
def test_i_can_get_database_connection():
    """A fresh get_database() pings the server and returns the configured DB."""
    fake_db = Mock()
    fake_client = Mock()
    # The connection module indexes the client by database name.
    fake_client.__getitem__ = Mock(return_value=fake_db)

    with patch('app.database.connection.MongoClient', return_value=fake_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"), \
         patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
        # Clear the module-level singletons so the call builds a new connection.
        import app.database.connection
        app.database.connection._client = None
        app.database.connection._database = None

        db = get_database()

        assert db == fake_db
        fake_client.admin.command.assert_called_with('ping')
        fake_client.__getitem__.assert_called_with("testdb")
|
||||
|
||||
|
||||
def test_i_cannot_connect_to_invalid_mongodb_url():
    """create_mongodb_client() fails fast (exit code 1) on ConnectionFailure."""
    failing_client = Mock()
    failing_client.admin.command.side_effect = ConnectionFailure("Connection failed")

    with patch('app.database.connection.MongoClient', return_value=failing_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017"):
        with pytest.raises(SystemExit) as exc_info:
            create_mongodb_client()

    assert exc_info.value.code == 1
|
||||
|
||||
|
||||
def test_i_cannot_connect_with_server_selection_timeout():
    """create_mongodb_client() fails fast (exit code 1) on a selection timeout."""
    timing_out_client = Mock()
    timing_out_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout")

    with patch('app.database.connection.MongoClient', return_value=timing_out_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017"):
        with pytest.raises(SystemExit) as exc_info:
            create_mongodb_client()

    assert exc_info.value.code == 1
|
||||
|
||||
|
||||
def test_i_cannot_connect_with_unexpected_error():
    """Even an unanticipated exception during client creation exits with code 1."""
    with patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error")), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017"):
        with pytest.raises(SystemExit) as exc_info:
            create_mongodb_client()

    assert exc_info.value.code == 1
|
||||
|
||||
|
||||
def test_i_can_get_database_singleton():
    """Repeated get_database() calls return one shared instance and ping once."""
    fake_db = Mock()
    fake_client = Mock()
    fake_client.__getitem__ = Mock(return_value=fake_db)

    with patch('app.database.connection.MongoClient', return_value=fake_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"), \
         patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
        # Start from a clean slate so the first call actually connects.
        import app.database.connection
        app.database.connection._client = None
        app.database.connection._database = None

        first = get_database()
        second = get_database()

        assert first is second
        # Only the very first call should have pinged the server.
        assert fake_client.admin.command.call_count == 1
|
||||
|
||||
|
||||
def test_i_can_close_database_connection():
    """close_database_connection() closes the client and clears the singletons."""
    fake_db = Mock()
    fake_client = Mock()
    fake_client.__getitem__ = Mock(return_value=fake_db)

    with patch('app.database.connection.MongoClient', return_value=fake_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"), \
         patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
        import app.database.connection
        app.database.connection._client = None
        app.database.connection._database = None

        # Open, then immediately close, the connection.
        get_database()
        close_database_connection()

        fake_client.close.assert_called_once()
        assert app.database.connection._client is None
        assert app.database.connection._database is None
|
||||
|
||||
|
||||
def test_i_can_get_mongodb_client():
    """After connecting, get_mongodb_client() exposes the raw client object."""
    fake_db = Mock()
    fake_client = Mock()
    fake_client.__getitem__ = Mock(return_value=fake_db)

    with patch('app.database.connection.MongoClient', return_value=fake_client), \
         patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"), \
         patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
        import app.database.connection
        app.database.connection._client = None
        app.database.connection._database = None

        # Establish the connection, then fetch the underlying client.
        get_database()
        raw_client = get_mongodb_client()

        assert raw_client == fake_client
|
||||
|
||||
|
||||
def test_i_can_get_none_mongodb_client_when_not_connected():
    """Without a prior connection, get_mongodb_client() reports None."""
    # Make sure no leftover connection exists from other tests.
    import app.database.connection
    app.database.connection._client = None
    app.database.connection._database = None

    assert get_mongodb_client() is None
|
||||
|
||||
|
||||
def test_i_can_test_database_connection_success():
    """Health check returns True when the ping command succeeds."""
    fake_db = Mock()
    fake_db.command.return_value = True

    with patch('app.database.connection.get_database', return_value=fake_db):
        assert test_database_connection() is True

    fake_db.command.assert_called_with('ping')
|
||||
|
||||
|
||||
def test_i_can_close_connection_when_no_client():
    """Closing with no active client is a harmless no-op."""
    import app.database.connection
    app.database.connection._client = None
    app.database.connection._database = None

    # Must not raise even though nothing is open.
    close_database_connection()

    assert app.database.connection._client is None
    assert app.database.connection._database is None
|
||||
@@ -1,311 +0,0 @@
|
||||
"""
|
||||
Test suite for DocumentContentRepository with async/await support.
|
||||
|
||||
This module contains comprehensive tests for all DocumentContentRepository methods
|
||||
using mongomock-motor for in-memory MongoDB testing.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
|
||||
import pytest_asyncio
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
from mongomock_motor import AsyncMongoMockClient
|
||||
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import DocumentContent
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_repository():
    """DocumentContentRepository backed by an in-memory mongomock database."""
    mock_db = AsyncMongoMockClient().test_database
    repository = DocumentContentRepository(mock_db)
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
@pytest.fixture
def sample_document_content():
    """Deterministic DocumentContent whose hash/size are derived from its text."""
    text = "This is sample document content for testing purposes."
    raw = text.encode()

    return DocumentContent(
        file_hash=hashlib.sha256(raw).hexdigest(),
        content=text,
        encoding="utf-8",
        file_size=len(raw),
        mime_type="text/plain",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def another_document_content():
    """A second DocumentContent with different text (and thus a different hash)."""
    text = "This is another sample document with different content."
    raw = text.encode()

    return DocumentContent(
        file_hash=hashlib.sha256(raw).hexdigest(),
        content=text,
        encoding="utf-8",
        file_size=len(raw),
        mime_type="text/plain",
    )
|
||||
|
||||
|
||||
class TestDocumentContentRepositoryCreation:
    """Tests for document content creation functionality."""

    @pytest.mark.asyncio
    async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content):
        """A created record echoes every input field and gains a database id."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        assert stored is not None
        assert stored.id is not None
        # Every content field must round-trip unchanged.
        for field in ("file_hash", "content", "encoding", "file_size", "mime_type"):
            assert getattr(stored, field) == getattr(sample_document_content, field)

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository,
                                                                             sample_document_content):
        """Re-inserting the same file_hash raises DuplicateKeyError."""
        await in_memory_repository.create_document_content(sample_document_content)

        with pytest.raises(DuplicateKeyError) as exc_info:
            await in_memory_repository.create_document_content(sample_document_content)

        assert "already exists" in str(exc_info.value)
|
||||
|
||||
|
||||
class TestDocumentContentRepositoryFinding:
    """Tests for document content finding functionality."""

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content):
        """Finding by a valid id returns the stored record."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        hit = await in_memory_repository.find_document_content_by_id(str(stored.id))

        assert hit is not None
        assert hit.id == stored.id
        assert hit.file_hash == stored.file_hash
        assert hit.content == stored.content

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository):
        """A string that is not a valid ObjectId resolves to None."""
        hit = await in_memory_repository.find_document_content_by_id("invalid_id")

        assert hit is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository):
        """A well-formed but absent ObjectId resolves to None."""
        hit = await in_memory_repository.find_document_content_by_id(str(ObjectId()))

        assert hit is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content):
        """Finding by file hash returns the matching record."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        hit = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash)

        assert hit is not None
        assert hit.file_hash == stored.file_hash
        assert hit.id == stored.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository):
        """A hash that was never stored resolves to None."""
        hit = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash")

        assert hit is None
|
||||
|
||||
|
||||
class TestDocumentContentRepositoryUpdate:
    """Tests for document content update functionality."""

    @pytest.mark.asyncio
    async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content):
        """A multi-field update changes those fields and nothing else."""
        stored = await in_memory_repository.create_document_content(sample_document_content)
        changes = {
            "content": "Updated content for testing",
            "encoding": "utf-16",
            "mime_type": "text/html",
        }

        refreshed = await in_memory_repository.update_document_content(str(stored.id), changes)

        assert refreshed is not None
        # The three requested fields took the new values...
        assert refreshed.content == changes["content"]
        assert refreshed.encoding == changes["encoding"]
        assert refreshed.mime_type == changes["mime_type"]
        # ...while identity and hash stayed put.
        assert refreshed.id == stored.id
        assert refreshed.file_hash == stored.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository):
        """Updating with a malformed id returns None."""
        outcome = await in_memory_repository.update_document_content("invalid_id", {"content": "test"})

        assert outcome is None

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content):
        """A single-field update leaves all other fields unchanged."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        refreshed = await in_memory_repository.update_document_content(
            str(stored.id), {"encoding": "iso-8859-1"}
        )

        assert refreshed is not None
        assert refreshed.encoding == "iso-8859-1"
        assert refreshed.content == stored.content
        assert refreshed.mime_type == stored.mime_type

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content):
        """An empty update payload hands back the current record untouched."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        outcome = await in_memory_repository.update_document_content(str(stored.id), {})

        assert outcome is not None
        assert outcome.content == stored.content
        assert outcome.encoding == stored.encoding
        assert outcome.mime_type == stored.mime_type
|
||||
|
||||
|
||||
class TestDocumentContentRepositoryDeletion:
    """Tests for document content deletion functionality."""

    @pytest.mark.asyncio
    async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content):
        """Deleting an existing record returns True and makes it unfindable."""
        stored = await in_memory_repository.create_document_content(sample_document_content)

        assert await in_memory_repository.delete_document_content(str(stored.id)) is True

        # The record must really be gone.
        assert await in_memory_repository.find_document_content_by_id(str(stored.id)) is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository):
        """Deleting with a malformed id returns False."""
        outcome = await in_memory_repository.delete_document_content("invalid_id")

        assert outcome is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository):
        """Deleting a well-formed but absent id returns False."""
        outcome = await in_memory_repository.delete_document_content(str(ObjectId()))

        assert outcome is False
|
||||
|
||||
|
||||
class TestDocumentContentRepositoryUtilities:
    """Tests for utility methods."""

    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content):
        """content_exists answers True for a stored hash and False otherwise."""
        await in_memory_repository.create_document_content(sample_document_content)

        assert await in_memory_repository.content_exists(sample_document_content.file_hash) is True
        assert await in_memory_repository.content_exists("nonexistent_hash") is False

    @pytest.mark.asyncio
    async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content,
                                                another_document_content):
        """Listing returns DocumentContent objects and respects limit."""
        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        everything = await in_memory_repository.list_document_contents()
        first_page = await in_memory_repository.list_document_contents(skip=0, limit=1)

        assert len(everything) == 2
        assert len(first_page) == 1
        assert all(isinstance(entry, DocumentContent) for entry in everything)

    @pytest.mark.asyncio
    async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content,
                                                 another_document_content):
        """count_document_contents grows by exactly one per insertion."""
        before = await in_memory_repository.count_document_contents()

        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        after = await in_memory_repository.count_document_contents()

        assert after == before + 2
|
||||
@@ -1,566 +0,0 @@
|
||||
"""
|
||||
Test suite for FileDocumentRepository with async/await support.
|
||||
|
||||
This module contains comprehensive tests for all FileDocumentRepository methods
|
||||
using mongomock-motor for in-memory MongoDB testing.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import pytest_asyncio
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from mongomock_motor import AsyncMongoMockClient
|
||||
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.models.document import FileDocument, FileType
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_repository():
    """FileDocumentRepository wired to an in-memory mongomock database."""
    mock_db = AsyncMongoMockClient().test_database
    repository = FileDocumentRepository(mock_db)
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_document():
    """One valid FileDocument with a fixed 64-character hex hash."""
    return FileDocument(
        filename="test_document.pdf",
        filepath="/path/to/test_document.pdf",
        file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
        file_type=FileType("pdf"),
        detected_at=datetime.now(),
    )
|
||||
|
||||
|
||||
@pytest.fixture
def sample_update_data():
    """Update payload that touches both metadata and file_type."""
    return {
        "metadata": {"tags": ["updated", "document"]},
        "file_type": FileType("txt"),
    }
|
||||
|
||||
|
||||
@pytest.fixture
def multiple_sample_documents():
    """Multiple FileDocument objects for list/search testing."""
    # One shared timestamp keeps ordering deterministic across the three docs.
    base_time = datetime.now()
    return [
        FileDocument(
            filename="document1.pdf",
            filepath="/path/to/document1.pdf",
            # Hashes are padded to the expected 64-character length.
            file_hash="hash1" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            # Name chosen to be textually close to document1.pdf,
            # presumably for similarity/fuzzy-search tests — confirm against callers.
            filename="similar_document.pdf",
            filepath="/path/to/similar_document.pdf",
            file_hash="hash2" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=base_time,
        ),
        FileDocument(
            filename="completely_different.txt",
            filepath="/path/to/completely_different.txt",
            file_hash="hash3" + "0" * 58,
            # NOTE(review): file_type is "pdf" although filename/filepath say .txt —
            # looks like a copy-paste slip; verify no test depends on it before changing.
            file_type=FileType("pdf"),
            detected_at=base_time,
        )
    ]
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryInitialization:
    """Tests for repository initialization."""

    @pytest.mark.asyncio
    async def test_i_can_initialize_repository(self):
        """initialize() completes and leaves db/collection handles populated."""
        repository = FileDocumentRepository(AsyncMongoMockClient().test_database)

        # Must not raise.
        await repository.initialize()

        assert repository.db is not None
        assert repository.collection is not None
        # TODO: also verify that the expected indexes were created
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryCreation:
    """Tests for file document creation functionality."""

    @pytest.mark.asyncio
    async def test_i_can_create_document(self, in_memory_repository, sample_file_document):
        """A created document echoes every field and gains an ObjectId."""
        saved = await in_memory_repository.create_document(sample_file_document)

        assert saved is not None
        assert saved.filename == sample_file_document.filename
        assert saved.filepath == sample_file_document.filepath
        assert saved.file_hash == sample_file_document.file_hash
        assert saved.file_type == sample_file_document.file_type
        assert saved.id is not None
        assert isinstance(saved.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document):
        """A document whose id is explicitly None still gets one assigned."""
        sample_file_document.id = None

        saved = await in_memory_repository.create_document(sample_file_document)

        assert saved is not None
        assert saved.id is not None
        assert isinstance(saved.id, ObjectId)

    @pytest.mark.asyncio
    async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document):
        """A second document at the same filepath raises DuplicateKeyError."""
        await in_memory_repository.create_document(sample_file_document)

        # Same filepath, different name and hash — the path collision triggers.
        clashing = FileDocument(
            filename="different_name.pdf",
            filepath=sample_file_document.filepath,
            file_hash="different_hash" + "0" * 58,
            file_type=FileType("pdf"),
            detected_at=datetime.now()
        )

        with pytest.raises(DuplicateKeyError) as exc_info:
            await in_memory_repository.create_document(clashing)

        assert "already exists" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """A low-level PyMongoError is wrapped into a descriptive ValueError."""
        mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))

        with pytest.raises(ValueError) as exc_info:
            await in_memory_repository.create_document(sample_file_document)

        assert "Failed to create file document" in str(exc_info.value)
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryFinding:
    """Tests for file document finding functionality."""

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
        """Test finding document by valid ObjectId."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act: lookup uses the string form of the ObjectId.
        found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))

        # Assert
        assert found_doc is not None
        assert found_doc.id == created_doc.id
        assert found_doc.filename == created_doc.filename
        assert found_doc.file_hash == created_doc.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
        """Test that an invalid (non-ObjectId) string returns None instead of raising."""
        # Act
        found_doc = await in_memory_repository.find_document_by_id("invalid_id")

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
        """Test that nonexistent but valid ObjectId returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        found_doc = await in_memory_repository.find_document_by_id(nonexistent_id)

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document):
        """Test finding document by file hash."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)

        # Assert
        assert found_doc is not None
        assert found_doc.file_hash == created_doc.file_hash
        assert found_doc.id == created_doc.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository):
        """Test that nonexistent hash returns None."""
        # Act
        found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash")

        # Assert
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
        """Test finding document by exact filepath."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)

        # Assert
        assert found_doc is not None
        assert found_doc.filepath == created_doc.filepath
        assert found_doc.id == created_doc.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
        """Test that nonexistent filepath returns None."""
        # Act
        found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf")

        # Assert
        assert found_doc is None
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryFuzzySearch:
    """Tests covering filename lookup, both exact and fuzzy."""

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents):
        """An exact filename query returns the single matching document."""
        # Arrange: persist every sample document.
        for sample in multiple_sample_documents:
            await in_memory_repository.create_document(sample)

        # Act
        results = await in_memory_repository.find_document_by_name("document1.pdf")

        # Assert: exactly one hit, carrying the queried filename.
        assert len(results) == 1
        assert results[0].filename == "document1.pdf"

    @pytest.mark.asyncio
    async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents):
        """A partial query matches similarly-named documents at the default threshold."""
        # Arrange: persist every sample document.
        for sample in multiple_sample_documents:
            await in_memory_repository.create_document(sample)

        # Act
        results = await in_memory_repository.find_document_by_name("document")

        # Assert: both "document1.pdf" and "similar_document.pdf" should match.
        assert len(results) >= 2
        matched_names = [hit.filename for hit in results]
        assert "document1.pdf" in matched_names
        assert "similar_document.pdf" in matched_names

    @pytest.mark.asyncio
    async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
        """A PyMongo failure during the search degrades to an empty result list."""
        # Arrange: make the underlying query raise.
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        results = await in_memory_repository.find_document_by_name("test")

        # Assert: errors are swallowed into an empty list, not propagated.
        assert results == []
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryListing:
    """Tests for document listing functionality."""

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_documents):
        """Test listing documents with default pagination."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert: every stored document comes back as a FileDocument model.
        assert len(docs) == len(multiple_sample_documents)
        assert all(isinstance(doc, FileDocument) for doc in docs)

    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents):
        """Test listing documents with custom pagination."""
        # Arrange
        for doc in multiple_sample_documents:
            await in_memory_repository.create_document(doc)

        # Act: fetch two consecutive pages of size 2.
        docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2)
        docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2)

        # Assert
        assert len(docs_page1) == 2
        assert len(docs_page2) == 1  # Only 3 total documents

        # Ensure no overlap between pages
        page1_ids = [doc.id for doc in docs_page1]
        page2_ids = [doc.id for doc in docs_page2]
        assert len(set(page1_ids).intersection(set(page2_ids))) == 0

    @pytest.mark.asyncio
    async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document):
        """Test that documents are sorted by detected_at in descending order."""
        # Arrange
        from datetime import timedelta

        # Create documents with different timestamps
        doc1 = sample_file_document.model_copy()
        doc1.filename = "oldest.pdf"
        doc1.filepath = f"/path/to/{doc1.filename}"
        doc1.file_hash = "hash1" + "0" * 58
        doc1.detected_at = datetime.now() - timedelta(hours=2)

        doc2 = sample_file_document.model_copy()
        doc2.filename = "newest.pdf"
        doc2.filepath = f"/path/to/{doc2.filename}"
        doc2.file_hash = "hash2" + "0" * 58
        doc2.detected_at = datetime.now()

        await in_memory_repository.create_document(doc1)
        await in_memory_repository.create_document(doc2)

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert len(docs) == 2
        assert docs[0].filename == "newest.pdf"  # Most recent first
        assert docs[1].filename == "oldest.pdf"

    @pytest.mark.asyncio
    async def test_i_can_list_empty_documents(self, in_memory_repository):
        """Test listing documents from empty collection."""
        # Act
        docs = await in_memory_repository.list_documents()

        # Assert
        assert docs == []

    @pytest.mark.asyncio
    async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
        """Test handling of PyMongo errors during document listing."""
        # Arrange: force the query to fail.
        mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))

        # Act
        docs = await in_memory_repository.list_documents()

        # Assert: errors degrade to an empty list rather than propagating.
        assert docs == []
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryUpdate:
    """Tests for document update functionality."""

    @pytest.mark.asyncio
    async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
                                                      sample_update_data):
        """Test successful document update."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)

        # Assert
        assert updated_doc is not None
        assert updated_doc.file_type == sample_update_data["file_type"]
        assert updated_doc.id == created_doc.id
        assert updated_doc.filename == created_doc.filename  # Unchanged fields remain

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
        """Test updating document with partial data."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        partial_update = {"file_type": FileType("txt")}

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update)

        # Assert
        assert updated_doc is not None
        assert updated_doc.file_type == FileType("txt")
        assert updated_doc.filename == created_doc.filename  # Should remain unchanged
        assert updated_doc.filepath == created_doc.filepath  # Should remain unchanged

    @pytest.mark.asyncio
    async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
        """Test that None values are filtered out from update data."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None}

        # Act
        updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none)

        # Assert
        assert updated_doc is not None
        assert updated_doc.metadata == {"tags": ["updated", "document"]}
        assert updated_doc.file_type == created_doc.file_type  # Should remain unchanged (None filtered out)

    @pytest.mark.asyncio
    async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
        """Test updating document with empty data returns current document."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)
        empty_update = {}

        # Act
        result = await in_memory_repository.update_document(str(created_doc.id), empty_update)

        # Assert: the document is returned unmodified.
        assert result is not None
        assert result.filename == created_doc.filename
        assert result.file_hash == created_doc.file_hash
        assert result.metadata == created_doc.metadata

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
        """Test that updating with invalid ID returns None."""
        # Act
        result = await in_memory_repository.update_document("invalid_id", sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
        """Test that updating nonexistent document returns None."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)

        # Assert
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
                                                               sample_update_data, mocker):
        """Test handling of PyMongo errors during document update."""
        # Arrange: force the atomic update call to fail.
        created_doc = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
                            side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)

        # Assert: errors degrade to None rather than propagating.
        assert result is None
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryDeletion:
    """Tests for document deletion functionality."""

    @pytest.mark.asyncio
    async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
        """Test successful document deletion."""
        # Arrange
        created_doc = await in_memory_repository.create_document(sample_file_document)

        # Act
        deletion_result = await in_memory_repository.delete_document(str(created_doc.id))

        # Assert
        assert deletion_result is True

        # Verify document is actually deleted
        found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))
        assert found_doc is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
        """Test that deleting with invalid ID returns False."""
        # Act
        result = await in_memory_repository.delete_document("invalid_id")

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
        """Test that deleting nonexistent document returns False."""
        # Arrange
        nonexistent_id = str(ObjectId())

        # Act
        result = await in_memory_repository.delete_document(nonexistent_id)

        # Assert
        assert result is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
        """Test handling of PyMongo errors during document deletion."""
        # Arrange: force the delete call to fail.
        created_doc = await in_memory_repository.create_document(sample_file_document)
        mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))

        # Act
        result = await in_memory_repository.delete_document(str(created_doc.id))

        # Assert: errors degrade to False rather than propagating.
        assert result is False
|
||||
|
||||
|
||||
class TestFileDocumentRepositoryUtilities:
    """Tests for utility methods."""

    @pytest.mark.asyncio
    async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
        """Inserting one document increments the collection count by exactly one."""
        # Arrange: capture the baseline count, then insert a single document.
        count_before = await in_memory_repository.count_documents()
        await in_memory_repository.create_document(sample_file_document)

        # Act
        count_after = await in_memory_repository.count_documents()

        # Assert
        assert count_after == count_before + 1

    @pytest.mark.asyncio
    async def test_i_can_count_zero_documents(self, in_memory_repository):
        """An empty collection reports a count of zero."""
        # Act & Assert
        assert await in_memory_repository.count_documents() == 0
|
||||
@@ -1,697 +0,0 @@
|
||||
"""
|
||||
Unit tests for DocumentService using in-memory MongoDB.
|
||||
|
||||
Tests the orchestration logic with real MongoDB operations
|
||||
using mongomock for better integration testing.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from unittest.mock import Mock, patch
|
||||
from datetime import datetime
|
||||
from bson import ObjectId
|
||||
from pathlib import Path
|
||||
|
||||
from mongomock_motor import AsyncMongoMockClient
|
||||
|
||||
from app.services.document_service import DocumentService
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Provide a FileDocumentRepository backed by a mock in-memory MongoDB."""
    mock_client = AsyncMongoMockClient()
    repository = FileDocumentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Provide a DocumentContentRepository backed by a mock in-memory MongoDB."""
    mock_client = AsyncMongoMockClient()
    repository = DocumentContentRepository(mock_client.test_database)
    await repository.initialize()
    return repository
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide a bare mock in-memory MongoDB database."""
    return AsyncMongoMockClient().test_database
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
    """Create DocumentService with in-memory repositories.

    get_database is patched so DocumentService() binds to the mock database
    during construction; the repositories are then replaced with the
    in-memory fixture instances so all tests share the same backing data.
    """
    with patch('app.services.document_service.get_database', return_value=in_memory_database):
        service = DocumentService()
        service.file_repository = in_memory_file_repository
        service.content_repository = in_memory_content_repository
        return service
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_bytes():
    """Sample file content as bytes (stands in for a PDF payload)."""
    return b"This is a test PDF content"
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_bytes():
    """Sample text file content as bytes."""
    return b"This is a test text file content"
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_hash():
    """Expected SHA256 hash for sample file bytes.

    NOTE(review): the byte literal duplicates sample_file_bytes — keep the
    two in sync if the sample content changes.
    """
    import hashlib
    return hashlib.sha256(b"This is a test PDF content").hexdigest()
|
||||
|
||||
|
||||
@pytest.fixture
def sample_file_document():
    """Sample FileDocument for testing."""
    return FileDocument(
        id=ObjectId(),
        filename="test.pdf",
        filepath="/test/test.pdf",
        file_type=FileType.PDF,
        extraction_method=None,
        metadata={},
        detected_at=datetime(2024, 1, 15, 10, 30, 0),
        # NOTE(review): not a real SHA-256 digest — sufficient for these tests
        file_hash="test_hash"
    )
|
||||
|
||||
|
||||
class TestCreateDocument:
    """Tests for create_document method."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content doesn't exist yet."""
        # Setup mocks: pin "now" and the detected MIME type.
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify content was created
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content already exists (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create first document
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Create second document with same content
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify both documents exist but share same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

        # Verify only one content document exists
        all_content = await document_service.content_repository.list_document_content()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Test creating documents with different text encodings.

        NOTE(review): all three iterations share identical bytes (same hash),
        so if content rows are deduplicated by hash this asserts that the
        stored encoding is updated on each call — confirm against
        DocumentService.create_document.
        """
        # Setup
        mock_magic.return_value = "text/plain"

        # Test with different encodings
        encodings = ["utf-8", "latin-1", "ascii"]

        for i, encoding in enumerate(encodings):
            result = await document_service.create_document(
                f"/test/test{i}.txt",
                sample_text_bytes,
                encoding
            )

            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT

            # Verify content has correct encoding
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that empty file path raises ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content.file_size == 0
|
||||
|
||||
|
||||
class TestGetMethods:
    """Tests for document retrieval methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by ID."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_id(created_doc.id)

        # Verify
        assert result is not None
        assert result.id == created_doc.id
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_hash(created_doc.file_hash)

        # Verify
        assert result is not None
        assert result.file_hash == created_doc.file_hash
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file path."""
        # Setup
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_filepath(test_path)

        # Verify
        assert result is not None
        assert result.filepath == test_path
        assert result.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document with associated content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_with_content(created_doc.id)

        # Verify: the service returns a (document, content) pair.
        assert result is not None
        document, content = result
        assert document.id == created_doc.id
        assert content is not None
        assert content.file_hash == created_doc.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Test that nonexistent document returns None."""
        # Execute with random ObjectId
        result = await document_service.get_document_by_id(ObjectId())

        # Verify
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Test that nonexistent document hash returns None."""
        # Execute
        result = await document_service.get_document_by_hash("nonexistent_hash")

        # Verify
        assert result is None
|
||||
|
||||
|
||||
class TestPaginationAndCounting:
    """Tests for document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document listing with pagination parameters."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create multiple documents
        for i in range(5):
            await document_service.create_document(
                f"/test/test{i}.pdf",
                sample_file_bytes + bytes(str(i), 'utf-8'),  # Make each file unique
                "utf-8"
            )

        # Execute with pagination
        result = await document_service.list_documents(skip=1, limit=2)

        # Verify
        assert len(result) == 2

        # Test counting
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document counting."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Initially should be 0
        initial_count = await document_service.count_documents()
        assert initial_count == 0

        # Create some documents
        for i in range(3):
            await document_service.create_document(
                f"/test/test{i}.txt",
                sample_file_bytes + bytes(str(i), 'utf-8'),
                "utf-8"
            )

        # Execute
        final_count = await document_service.count_documents()

        # Verify
        assert final_count == 3
|
||||
|
||||
|
||||
class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """Metadata patches are applied and visible on the returned model."""
        mock_magic.return_value = "application/pdf"

        doc = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        updated = await document_service.update_document(
            doc.id, {"metadata": {"page_count": 5}}
        )

        assert updated is not None
        assert updated.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """Deleting the last document for a hash removes its content record too."""
        mock_magic.return_value = "application/pdf"

        doc = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        # The extracted-content record must exist right after creation.
        repo = document_service.content_repository
        before = await repo.find_document_content_by_file_hash(doc.file_hash)
        assert before is not None

        assert await document_service.delete_document(doc.id) is True

        # Both the document and its now-orphaned content are gone.
        assert await document_service.get_document_by_id(doc.id) is None
        after = await repo.find_document_content_by_file_hash(doc.file_hash)
        assert after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """Content shared by several documents survives deleting one of them."""
        mock_magic.return_value = "application/pdf"

        # Two documents built from identical bytes share one content record.
        doc1 = await document_service.create_document(
            "/test/test1.pdf", sample_file_bytes, "utf-8"
        )
        doc2 = await document_service.create_document(
            "/test/test2.pdf", sample_file_bytes, "utf-8"
        )
        assert doc1.file_hash == doc2.file_hash

        assert await document_service.delete_document(doc1.id) is True

        # doc1 is gone; doc2 and the shared content remain.
        assert await document_service.get_document_by_id(doc1.id) is None
        assert await document_service.get_document_by_id(doc2.id) is not None

        shared = await document_service.content_repository.find_document_content_by_file_hash(
            doc2.file_hash
        )
        assert shared is not None
|
||||
|
||||
|
||||
class TestUtilityMethods:
    """Tests for utility methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """content_exists flips from False to True once content is stored."""
        mock_magic.return_value = "application/pdf"

        # With nothing stored yet, an arbitrary hash is unknown.
        assert await document_service.content_exists("nonexistent_hash") is False

        doc = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        # The freshly stored content is now reported as existing.
        assert await document_service.content_exists(doc.file_hash) is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """Replacing extracted text updates the returned model and persists."""
        mock_magic.return_value = "application/pdf"

        doc = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        replacement = "Updated extracted content"
        updated = await document_service.update_document_content(
            doc.file_hash, replacement
        )

        assert updated is not None
        assert updated.content == replacement

        # Re-read through the repository to confirm the change was persisted.
        stored = await document_service.content_repository.find_document_content_by_file_hash(
            doc.file_hash
        )
        assert stored.content == replacement
|
||||
|
||||
|
||||
class TestHashCalculation:
    """Tests for file hash calculation utility."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice yields one identical 64-char digest."""
        payload = b"Test content for hashing"

        first = document_service._calculate_file_hash(payload)
        second = document_service._calculate_file_hash(payload)

        assert first == second
        # A SHA256 hex digest is always 64 characters long.
        assert len(first) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Distinct byte payloads must produce distinct digests."""
        digests = {
            document_service._calculate_file_hash(b"First content"),
            document_service._calculate_file_hash(b"Second content"),
        }
        # Two different inputs → two different hashes.
        assert len(digests) == 2
|
||||
|
||||
|
||||
class TestFileTypeDetection:
    """Tests for file type detection."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A .pdf path maps to FileType.PDF."""
        assert document_service._detect_file_type("/path/to/document.pdf") == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A .txt path maps to FileType.TXT."""
        assert document_service._detect_file_type("/path/to/document.txt") == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A .docx path maps to FileType.DOCX."""
        assert document_service._detect_file_type("/path/to/document.docx") == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An extension outside the supported set raises ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type("/path/to/document.xyz")
|
||||
0
tests/utils/__init__.py
Normal file
0
tests/utils/__init__.py
Normal file
@@ -14,6 +14,8 @@ def get_doc(filename: str = None):
|
||||
file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
|
||||
file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"),
|
||||
detected_at=datetime.now(),
|
||||
file_size=1024,
|
||||
mime_type="application/pdf"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user