diff --git a/Readme.md b/Readme.md index 036ef5a..fc03512 100644 --- a/Readme.md +++ b/Readme.md @@ -232,19 +232,11 @@ Stores file metadata and extracted content: "filename": "document.pdf", "filepath": "/watched_files/document.pdf", "file_type": "pdf", - "mime_type": "application/pdf", - "file_size": 2048576, - "content": "extracted text content...", - "encoding": "utf-8", - "extraction_method": "direct_text", - // direct_text, ocr, hybrid + "extraction_method": "direct_text", // direct_text, ocr, hybrid "metadata": { - "page_count": 15, - // for PDFs - "word_count": 250, - // for text files - "image_dimensions": { - // for images + "page_count": 15, // for PDFs + "word_count": 250, // for text files + "image_dimensions": { // for images "width": 1920, "height": 1080 } @@ -253,6 +245,19 @@ Stores file metadata and extracted content: "file_hash": "sha256_hash_value" } ``` +#### Document Contents Collection + +Stores actual file content and technical metadata: +```json +{ + "_id": "ObjectId", + "file_hash": "sha256_hash_value", + "content": "extracted text content...", + "encoding": "utf-8", + "file_size": 2048576, + "mime_type": "application/pdf" +} +``` #### Processing Jobs Collection @@ -272,6 +277,25 @@ Tracks processing status and lifecycle: } ``` +### Data Storage Strategy + +- **Choice**: Three separate collections for files, content, and processing status +- **Rationale**: Normalization prevents content duplication when multiple files have identical content +- **Benefits**: + - Content deduplication via SHA256 hash + - Better query performance for metadata vs content searches + - Clear separation of concerns between file metadata, content, and processing lifecycle + - Multiple files can reference the same content (e.g., identical copies in different locations) + +### Content Storage Location + +- **Choice**: Store extracted content in separate `document_contents` collection +- **Rationale**: Content normalization and deduplication +- **Benefits**: + - Single 
content storage per unique file hash + - Multiple file entries can reference same content + - Efficient storage for duplicate files + ### Supported File Types (Initial Implementation) - **Text Files** (`.txt`): Direct content reading @@ -323,6 +347,87 @@ Tracks processing status and lifecycle: - **Extensible Metadata**: Flexible metadata storage per file type - **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches +## Document Service Architecture + +### Service Overview + +The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories. + +### Service Design + +- **Architecture Pattern**: Service orchestration with separate repositories +- **Transaction Support**: MongoDB ACID transactions for data consistency +- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash +- **Error Handling**: MongoDB standard exceptions with transaction rollback + +### Document Service (`document_service.py`) + +Orchestrates operations between file and content repositories while maintaining data consistency. + +#### Core Functionality + +##### `create_document(file_path: str, file_bytes: bytes, encoding: str)` + +Creates a new document with automatic attribute calculation and content deduplication. + +**Automatic Calculations:** +- `file_hash`: SHA256 hash of file bytes +- `file_type`: Detection based on file extension +- `mime_type`: Detection via `python-magic` library +- `file_size`: Length of provided bytes +- `detected_at`: Current timestamp +- `metadata`: Empty dictionary (reserved for future extension) + +**Deduplication Logic:** +1. Calculate SHA256 hash of file content +2. Check if `DocumentContent` with this hash already exists +3. If EXISTS: Create only `FileDocument` referencing existing content +4. 
If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction + +**Transaction Flow:** +``` +BEGIN TRANSACTION + IF content_exists(file_hash): + CREATE FileDocument with content reference + ELSE: + CREATE DocumentContent + CREATE FileDocument with content reference +COMMIT TRANSACTION +``` + +#### Available Methods + +- `create_document(file_path, file_bytes, encoding)`: Create with deduplication +- `get_document_by_id(document_id)`: Retrieve by document ID +- `get_document_by_hash(file_hash)`: Retrieve by file hash +- `get_document_by_filepath(filepath)`: Retrieve by file path +- `list_documents(skip, limit)`: Paginated document listing +- `count_documents()`: Total document count +- `update_document(document_id, update_data)`: Update document metadata +- `delete_document(document_id)`: Remove document and orphaned content + +### Repository Dependencies + +The document service coordinates two existing repositories: + +#### File Repository (`file_repository.py`) +- `create_document()`, `find_document_by_id()`, `find_document_by_hash()` +- `find_document_by_filepath()`, `find_document_by_name()` +- `list_documents()`, `count_documents()` +- `update_document()`, `delete_document()` + +#### Document Content Repository (`document_content_repository.py`) +- `create_document_content()`, `find_document_content_by_id()` +- `find_document_content_by_file_hash()`, `content_exists()` +- `update_document_content()`, `delete_document_content()` +- `list_document_contents()`, `count_document_contents()` + +### Dependencies + +- `python-magic`: MIME type detection +- `hashlib`: SHA256 hashing (standard library) +- `pymongo`: MongoDB transactions support + ## Key Implementation Notes ### Python Standards diff --git a/requirements.txt b/requirements.txt index e800676..7df4d86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,6 +32,7 @@ pytest-asyncio==1.2.0 pytest-mock==3.15.1 python-dateutil==2.9.0.post0 python-dotenv==1.1.1 +python-magic==0.4.27 
pytz==2025.2 PyYAML==6.0.2 sentinels==1.1.1 diff --git a/src/file-processor/app/database/repositories/document_content_repository.py b/src/file-processor/app/database/repositories/document_content_repository.py new file mode 100644 index 0000000..16b2bd7 --- /dev/null +++ b/src/file-processor/app/database/repositories/document_content_repository.py @@ -0,0 +1,214 @@ +from typing import List, Optional +from datetime import datetime +from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection +from pymongo.errors import DuplicateKeyError, PyMongoError +from bson import ObjectId + +from app.models.document import DocumentContent + + +class DocumentContentRepository: + """ + Repository class for document content CRUD operations in MongoDB. + + This class handles all database operations related to document content, + following the repository pattern with dependency injection and async/await. + """ + + def __init__(self, database: AsyncIOMotorDatabase): + """ + Initialize repository with database dependency. + + Args: + database (AsyncIOMotorDatabase): MongoDB database instance + """ + self.db = database + self.collection: AsyncIOMotorCollection = database.document_contents + self._ensure_indexes() + + async def initialize(self): + """ + Initialize repository by ensuring required indexes exist. + + Should be called after repository instantiation to setup database indexes. + """ + await self._ensure_indexes() + + async def _ensure_indexes(self): + """ + Ensure required database indexes exist. + + Creates unique index on file_hash field to prevent duplicates. + """ + try: + await self.collection.create_index("file_hash", unique=True) + except PyMongoError: + # Index might already exist, ignore error + pass + + async def create_document_content(self, document_content: DocumentContent) -> DocumentContent: + """ + Create a new document content in the database. 
+ + Args: + document_content (DocumentContent): Document content data + + Returns: + DocumentContent: Created document content with database ID + + Raises: + DuplicateKeyError: If file_hash already exists + ValueError: If document content creation fails due to validation + """ + document_dict = document_content.model_dump(by_alias=True, exclude_unset=True) + + # Remove _id if it's None to let MongoDB generate it + if document_dict.get("_id") is None: + document_dict.pop("_id", None) + + try: + result = await self.collection.insert_one(document_dict) + document_dict["_id"] = result.inserted_id + return DocumentContent(**document_dict) + except DuplicateKeyError as e: + raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}") + except PyMongoError as e: + raise ValueError(f"Failed to create document content: {e}") + + async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]: + """ + Find document content by ID. + + Args: + document_id (str): Document content ID to search for + + Returns: + DocumentContent or None: Document content if found, None otherwise + """ + try: + if not ObjectId.is_valid(document_id): + return None + + document_doc = await self.collection.find_one({"_id": ObjectId(document_id)}) + if document_doc: + return DocumentContent(**document_doc) + return None + except PyMongoError: + return None + + async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]: + """ + Find document content by file hash. 
+ + Args: + file_hash (str): File hash to search for + + Returns: + DocumentContent or None: Document content if found, None otherwise + """ + try: + document_doc = await self.collection.find_one({"file_hash": file_hash}) + if document_doc: + return DocumentContent(**document_doc) + return None + except PyMongoError: + return None + + async def content_exists(self, file_hash: str) -> bool: + """ + Check if document content exists by file hash. + + Args: + file_hash (str): File hash to check + + Returns: + bool: True if document content exists, False otherwise + """ + try: + count = await self.collection.count_documents({"file_hash": file_hash}) + return count > 0 + except PyMongoError: + return False + + async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]: + """ + Update document content information. + + Args: + document_id (str): Document content ID to update + update_data (dict): Updated document content data + + Returns: + DocumentContent or None: Updated document content if found, None otherwise + """ + try: + if not ObjectId.is_valid(document_id): + return None + + # Remove None values and _id from update data + clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"} + + if not clean_update_data: + return await self.find_document_content_by_id(document_id) + + result = await self.collection.find_one_and_update( + {"_id": ObjectId(document_id)}, + {"$set": clean_update_data}, + return_document=True + ) + + if result: + return DocumentContent(**result) + return None + + except PyMongoError: + return None + + async def delete_document_content(self, document_id: str) -> bool: + """ + Delete document content from database. 
+ + Args: + document_id (str): Document content ID to delete + + Returns: + bool: True if document content was deleted, False otherwise + """ + try: + if not ObjectId.is_valid(document_id): + return False + + result = await self.collection.delete_one({"_id": ObjectId(document_id)}) + return result.deleted_count > 0 + except PyMongoError: + return False + + async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]: + """ + List document contents with pagination. + + Args: + skip (int): Number of document contents to skip (default: 0) + limit (int): Maximum number of document contents to return (default: 100) + + Returns: + List[DocumentContent]: List of document contents + """ + try: + cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1) + document_docs = await cursor.to_list(length=limit) + return [DocumentContent(**document_doc) for document_doc in document_docs] + except PyMongoError: + return [] + + async def count_document_contents(self) -> int: + """ + Count total number of document contents. + + Returns: + int: Total number of document contents in database + """ + try: + return await self.collection.count_documents({}) + except PyMongoError: + return 0 diff --git a/src/file-processor/app/database/repositories/document_repository.py b/src/file-processor/app/database/repositories/document_repository.py index 67754e4..74552db 100644 --- a/src/file-processor/app/database/repositories/document_repository.py +++ b/src/file-processor/app/database/repositories/document_repository.py @@ -8,11 +8,9 @@ in MongoDB with proper error handling and type safety. 
from typing import Optional, List from bson import ObjectId from pymongo.errors import DuplicateKeyError, PyMongoError -from difflib import SequenceMatcher -from motor.motor_asyncio import AsyncIOMotorCollection +from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase from app.models.document import FileDocument -from app.database.connection import get_database -from app.utils.ducment_matching import fuzzy_matching, subsequence_matching +from app.utils.document_matching import fuzzy_matching, subsequence_matching class MatchMethodBase: @@ -36,9 +34,9 @@ class FileDocumentRepository: with proper error handling and data validation. """ - def __init__(self): + def __init__(self, database: AsyncIOMotorDatabase): """Initialize file repository with database connection.""" - self.db = get_database() + self.db = database self.collection: AsyncIOMotorCollection = self.db.files self._ensure_indexes() diff --git a/src/file-processor/app/models/document.py b/src/file-processor/app/models/document.py index 5cc1adc..1c22ef2 100644 --- a/src/file-processor/app/models/document.py +++ b/src/file-processor/app/models/document.py @@ -86,7 +86,7 @@ class DocumentContent(BaseModel): """Model for document content.""" id: Optional[PyObjectId] = Field(default=None, alias="_id") - file_hash: Optional[str] = Field(..., description="SHA256 hash of file content") + file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") content: str = Field(..., description="File content") encoding: str = Field(default="utf-8", description="Character encoding for text files") file_size: int = Field(..., ge=0, description="File size in bytes") diff --git a/src/file-processor/app/models/job.py b/src/file-processor/app/models/job.py new file mode 100644 index 0000000..e69de29 diff --git a/src/file-processor/app/services/document_service.py b/src/file-processor/app/services/document_service.py new file mode 100644 index 0000000..da58712 --- /dev/null +++ 
b/src/file-processor/app/services/document_service.py @@ -0,0 +1,380 @@ +""" +Document service for orchestrated file and content management. + +This service coordinates between FileDocument and DocumentContent repositories +while maintaining data consistency through MongoDB transactions. +""" + +import hashlib +import magic +from datetime import datetime +from pathlib import Path +from typing import List, Optional, Dict, Any, Tuple + +from motor.motor_asyncio import AsyncIOMotorClientSession +from pymongo.errors import PyMongoError + +from app.database.connection import get_database +from app.database.repositories.document_repository import FileDocumentRepository +from app.database.repositories.document_content_repository import DocumentContentRepository +from app.models.document import ( + FileDocument, + DocumentContent, + FileType, + ProcessingStatus +) +from app.models.types import PyObjectId + + +class DocumentService: + """ + Service for orchestrated document and content management. + + Provides high-level operations that coordinate between file documents + and their content while ensuring data consistency through transactions. + """ + + def __init__(self): + """Initialize the document service with repository dependencies.""" + self.db = get_database() + self.file_repository = FileDocumentRepository(self.db) + self.content_repository = DocumentContentRepository(self.db) + + def _calculate_file_hash(self, file_bytes: bytes) -> str: + """ + Calculate SHA256 hash of file content. + + Args: + file_bytes: Raw file content as bytes + + Returns: + Hexadecimal SHA256 hash string + """ + return hashlib.sha256(file_bytes).hexdigest() + + def _detect_file_type(self, file_path: str) -> FileType: + """ + Detect file type from file extension. 
+ + Args: + file_path: Path to the file + + Returns: + FileType enum value + + Raises: + ValueError: If file type is not supported + """ + extension = Path(file_path).suffix.lower().lstrip('.') + + try: + return FileType(extension) + except ValueError: + raise ValueError(f"Unsupported file type: {extension}") + + def _detect_mime_type(self, file_bytes: bytes) -> str: + """ + Detect MIME type from file content. + + Args: + file_bytes: Raw file content as bytes + + Returns: + MIME type string + """ + return magic.from_buffer(file_bytes, mime=True) + + async def create_document( + self, + file_path: str, + file_bytes: bytes, + encoding: str = "utf-8" + ) -> FileDocument: + """ + Create a new document with automatic deduplication. + + This method handles the creation of both FileDocument and DocumentContent + with proper deduplication based on file hash. If content with the same + hash already exists, only a new FileDocument is created. + + Args: + file_path: Full path to the file + file_bytes: Raw file content as bytes + encoding: Character encoding for text content + + Returns: + Created FileDocument instance + + Raises: + ValueError: If file type is not supported + PyMongoError: If database operation fails + """ + # Calculate automatic attributes + file_hash = self._calculate_file_hash(file_bytes) + file_type = self._detect_file_type(file_path) + mime_type = self._detect_mime_type(file_bytes) + file_size = len(file_bytes) + filename = Path(file_path).name + detected_at = datetime.utcnow() + + # Start MongoDB transaction + async with await self.db.client.start_session() as session: + async with session.start_transaction(): + try: + # Check if content already exists + existing_content = await self.content_repository.find_document_content_by_file_hash( + file_hash, session=session + ) + + # Create DocumentContent if it doesn't exist + if not existing_content: + content_data = DocumentContent( + file_hash=file_hash, + content="", # Will be populated by processing 
workers + encoding=encoding, + file_size=file_size, + mime_type=mime_type + ) + await self.content_repository.create_document_content( + content_data, session=session + ) + + # Create FileDocument + file_data = FileDocument( + filename=filename, + filepath=file_path, + file_type=file_type, + extraction_method=None, # Will be set by processing workers + metadata={}, # Empty for now + detected_at=detected_at, + file_hash=file_hash + ) + + created_file = await self.file_repository.create_document( + file_data, session=session + ) + + return created_file + + except Exception as e: + # Transaction will automatically rollback + raise PyMongoError(f"Failed to create document: {str(e)}") + + async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]: + """ + Retrieve a document by its ID. + + Args: + document_id: Document ObjectId + + Returns: + FileDocument if found, None otherwise + """ + return await self.file_repository.find_document_by_id(document_id) + + async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]: + """ + Retrieve a document by its file hash. + + Args: + file_hash: SHA256 hash of file content + + Returns: + FileDocument if found, None otherwise + """ + return await self.file_repository.find_document_by_hash(file_hash) + + async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: + """ + Retrieve a document by its file path. + + Args: + filepath: Full path to the file + + Returns: + FileDocument if found, None otherwise + """ + return await self.file_repository.find_document_by_filepath(filepath) + + async def get_document_with_content( + self, + document_id: PyObjectId + ) -> Optional[Tuple[FileDocument, DocumentContent]]: + """ + Retrieve a document with its associated content. 
+ + Args: + document_id: Document ObjectId + + Returns: + Tuple of (FileDocument, DocumentContent) if found, None otherwise + """ + document = await self.get_document_by_id(document_id) + if not document: + return None + + content = await self.content_repository.find_document_content_by_file_hash( + document.file_hash + ) + if not content: + return None + + return (document, content) + + async def list_documents( + self, + skip: int = 0, + limit: int = 100 + ) -> List[FileDocument]: + """ + List documents with pagination. + + Args: + skip: Number of documents to skip + limit: Maximum number of documents to return + + Returns: + List of FileDocument instances + """ + return await self.file_repository.list_documents(skip=skip, limit=limit) + + async def count_documents(self) -> int: + """ + Get total number of documents. + + Returns: + Total document count + """ + return await self.file_repository.count_documents() + + async def update_document( + self, + document_id: PyObjectId, + update_data: Dict[str, Any] + ) -> Optional[FileDocument]: + """ + Update document metadata. + + Args: + document_id: Document ObjectId + update_data: Dictionary with fields to update + + Returns: + Updated FileDocument if found, None otherwise + """ + return await self.file_repository.update_document(document_id, update_data) + + async def delete_document(self, document_id: PyObjectId) -> bool: + """ + Delete a document and its orphaned content. + + This method removes the FileDocument and checks if the associated + DocumentContent is orphaned (no other files reference it). If orphaned, + the content is also deleted. 
+ + Args: + document_id: Document ObjectId + + Returns: + True if document was deleted, False otherwise + + Raises: + PyMongoError: If database operation fails + """ + # Start MongoDB transaction + async with await self.db.client.start_session() as session: + async with session.start_transaction(): + try: + # Get document to find its hash + document = await self.file_repository.find_document_by_id( + document_id, session=session + ) + if not document: + return False + + # Delete the document + deleted = await self.file_repository.delete_document( + document_id, session=session + ) + if not deleted: + return False + + # Check if content is orphaned + remaining_files = await self.file_repository.find_document_by_hash( + document.file_hash, session=session + ) + + # If no other files reference this content, delete it + if not remaining_files: + content = await self.content_repository.find_document_content_by_file_hash( + document.file_hash, session=session + ) + if content: + await self.content_repository.delete_document_content( + content.id, session=session + ) + + return True + + except Exception as e: + # Transaction will automatically rollback + raise PyMongoError(f"Failed to delete document: {str(e)}") + + async def content_exists(self, file_hash: str) -> bool: + """ + Check if content with given hash exists. + + Args: + file_hash: SHA256 hash of file content + + Returns: + True if content exists, False otherwise + """ + return await self.content_repository.content_exists(file_hash) + + async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]: + """ + Retrieve content by file hash. 
+ + Args: + file_hash: SHA256 hash of file content + + Returns: + DocumentContent if found, None otherwise + """ + return await self.content_repository.find_document_content_by_file_hash(file_hash) + + async def update_document_content( + self, + file_hash: str, + content: str, + encoding: str = "utf-8" + ) -> Optional[DocumentContent]: + """ + Update the extracted content for a document. + + This method is typically called by processing workers to store + the extracted text content. + + Args: + file_hash: SHA256 hash of file content + content: Extracted text content + encoding: Character encoding + + Returns: + Updated DocumentContent if found, None otherwise + """ + existing_content = await self.content_repository.find_document_content_by_file_hash( + file_hash + ) + if not existing_content: + return None + + update_data = { + "content": content, + "encoding": encoding + } + + return await self.content_repository.update_document_content( + existing_content.id, update_data + ) \ No newline at end of file diff --git a/src/file-processor/app/utils/ducment_matching.py b/src/file-processor/app/utils/document_matching.py similarity index 100% rename from src/file-processor/app/utils/ducment_matching.py rename to src/file-processor/app/utils/document_matching.py diff --git a/src/file-processor/requirements.txt b/src/file-processor/requirements.txt index 8a4b4be..8b4b465 100644 --- a/src/file-processor/requirements.txt +++ b/src/file-processor/requirements.txt @@ -8,3 +8,4 @@ pymongo==4.15.0 pydantic==2.11.9 redis==6.4.0 uvicorn==0.35.0 +python-magic==0.4.27 \ No newline at end of file diff --git a/tests/test_document_content_repository.py b/tests/test_document_content_repository.py new file mode 100644 index 0000000..1033e59 --- /dev/null +++ b/tests/test_document_content_repository.py @@ -0,0 +1,311 @@ +""" +Test suite for DocumentContentRepository with async/await support. 
+ +This module contains comprehensive tests for all DocumentContentRepository methods +using mongomock-motor for in-memory MongoDB testing. +""" + +import pytest +import hashlib +from datetime import datetime + +import pytest_asyncio +from bson import ObjectId +from pymongo.errors import DuplicateKeyError +from mongomock_motor import AsyncMongoMockClient + +from app.database.repositories.document_content_repository import DocumentContentRepository +from app.models.document import DocumentContent + + +@pytest_asyncio.fixture +async def in_memory_repository(): + """Create an in-memory DocumentContentRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = DocumentContentRepository(db) + await repo.initialize() + return repo + + +@pytest.fixture +def sample_document_content(): + """Sample DocumentContent data for testing.""" + content = "This is sample document content for testing purposes." + file_hash = hashlib.sha256(content.encode()).hexdigest() + + return DocumentContent( + file_hash=file_hash, + content=content, + encoding="utf-8", + file_size=len(content.encode()), + mime_type="text/plain" + ) + + +@pytest.fixture +def another_document_content(): + """Another sample DocumentContent data for testing.""" + content = "This is another sample document with different content." 
+ file_hash = hashlib.sha256(content.encode()).hexdigest() + + return DocumentContent( + file_hash=file_hash, + content=content, + encoding="utf-8", + file_size=len(content.encode()), + mime_type="text/plain" + ) + + +class TestDocumentContentRepositoryCreation: + """Tests for document content creation functionality.""" + + @pytest.mark.asyncio + async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content): + """Test successful document content creation.""" + # Act + created_content = await in_memory_repository.create_document_content(sample_document_content) + + # Assert + assert created_content is not None + assert created_content.file_hash == sample_document_content.file_hash + assert created_content.content == sample_document_content.content + assert created_content.encoding == sample_document_content.encoding + assert created_content.file_size == sample_document_content.file_size + assert created_content.mime_type == sample_document_content.mime_type + assert created_content.id is not None + + @pytest.mark.asyncio + async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository, + sample_document_content): + """Test that creating document content with duplicate file_hash raises DuplicateKeyError.""" + # Arrange + await in_memory_repository.create_document_content(sample_document_content) + + # Act & Assert + with pytest.raises(DuplicateKeyError) as exc_info: + await in_memory_repository.create_document_content(sample_document_content) + + assert "already exists" in str(exc_info.value) + + +class TestDocumentContentRepositoryFinding: + """Tests for document content finding functionality.""" + + @pytest.mark.asyncio + async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content): + """Test finding document content by valid ID.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + + # Act + 
found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) + + # Assert + assert found_content is not None + assert found_content.id == created_content.id + assert found_content.file_hash == created_content.file_hash + assert found_content.content == created_content.content + + @pytest.mark.asyncio + async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository): + """Test that invalid ObjectId returns None.""" + # Act + found_content = await in_memory_repository.find_document_content_by_id("invalid_id") + + # Assert + assert found_content is None + + @pytest.mark.asyncio + async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository): + """Test that nonexistent but valid ObjectId returns None.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + found_content = await in_memory_repository.find_document_content_by_id(nonexistent_id) + + # Assert + assert found_content is None + + @pytest.mark.asyncio + async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content): + """Test finding document content by file hash.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + + # Act + found_content = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash) + + # Assert + assert found_content is not None + assert found_content.file_hash == created_content.file_hash + assert found_content.id == created_content.id + + @pytest.mark.asyncio + async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository): + """Test that nonexistent file hash returns None.""" + # Act + found_content = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash") + + # Assert + assert found_content is None + + +class TestDocumentContentRepositoryUpdate: + """Tests for document content update 
functionality.""" + + @pytest.mark.asyncio + async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content): + """Test successful document content update.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + update_data = { + "content": "Updated content for testing", + "encoding": "utf-16", + "mime_type": "text/html" + } + + # Act + updated_content = await in_memory_repository.update_document_content(str(created_content.id), update_data) + + # Assert + assert updated_content is not None + assert updated_content.content == update_data["content"] + assert updated_content.encoding == update_data["encoding"] + assert updated_content.mime_type == update_data["mime_type"] + assert updated_content.id == created_content.id + assert updated_content.file_hash == created_content.file_hash # Should remain unchanged + + @pytest.mark.asyncio + async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository): + """Test that updating with invalid ID returns None.""" + # Act + result = await in_memory_repository.update_document_content("invalid_id", {"content": "test"}) + + # Assert + assert result is None + + @pytest.mark.asyncio + async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content): + """Test updating document content with partial data.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + partial_update = {"encoding": "iso-8859-1"} + + # Act + updated_content = await in_memory_repository.update_document_content(str(created_content.id), partial_update) + + # Assert + assert updated_content is not None + assert updated_content.encoding == "iso-8859-1" + assert updated_content.content == created_content.content # Should remain unchanged + assert updated_content.mime_type == created_content.mime_type # Should remain unchanged + + 
@pytest.mark.asyncio + async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content): + """Test updating document content with empty data returns current content.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + empty_update = {} + + # Act + result = await in_memory_repository.update_document_content(str(created_content.id), empty_update) + + # Assert + assert result is not None + assert result.content == created_content.content + assert result.encoding == created_content.encoding + assert result.mime_type == created_content.mime_type + + +class TestDocumentContentRepositoryDeletion: + """Tests for document content deletion functionality.""" + + @pytest.mark.asyncio + async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content): + """Test successful document content deletion.""" + # Arrange + created_content = await in_memory_repository.create_document_content(sample_document_content) + + # Act + deletion_result = await in_memory_repository.delete_document_content(str(created_content.id)) + + # Assert + assert deletion_result is True + + # Verify content is actually deleted + found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id)) + assert found_content is None + + @pytest.mark.asyncio + async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository): + """Test that deleting with invalid ID returns False.""" + # Act + result = await in_memory_repository.delete_document_content("invalid_id") + + # Assert + assert result is False + + @pytest.mark.asyncio + async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository): + """Test that deleting nonexistent document content returns False.""" + # Arrange + nonexistent_id = str(ObjectId()) + + # Act + result = await in_memory_repository.delete_document_content(nonexistent_id) + + 
# Assert + assert result is False + + +class TestDocumentContentRepositoryUtilities: + """Tests for utility methods.""" + + @pytest.mark.asyncio + async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content): + """Test checking if document content exists by file hash.""" + # Arrange + await in_memory_repository.create_document_content(sample_document_content) + + # Act + exists = await in_memory_repository.content_exists(sample_document_content.file_hash) + not_exists = await in_memory_repository.content_exists("nonexistent_hash") + + # Assert + assert exists is True + assert not_exists is False + + @pytest.mark.asyncio + async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content, + another_document_content): + """Test listing document contents with pagination.""" + # Arrange + await in_memory_repository.create_document_content(sample_document_content) + await in_memory_repository.create_document_content(another_document_content) + + # Act + all_contents = await in_memory_repository.list_document_contents() + limited_contents = await in_memory_repository.list_document_contents(skip=0, limit=1) + + # Assert + assert len(all_contents) == 2 + assert len(limited_contents) == 1 + assert all(isinstance(content, DocumentContent) for content in all_contents) + + @pytest.mark.asyncio + async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content, + another_document_content): + """Test counting document contents.""" + # Arrange + initial_count = await in_memory_repository.count_document_contents() + await in_memory_repository.create_document_content(sample_document_content) + await in_memory_repository.create_document_content(another_document_content) + + # Act + final_count = await in_memory_repository.count_document_contents() + + # Assert + assert final_count == initial_count + 2 \ No newline at end of file diff --git a/tests/test_document_repository.py 
b/tests/test_document_repository.py index e61180d..a5cc5c1 100644 --- a/tests/test_document_repository.py +++ b/tests/test_document_repository.py @@ -23,9 +23,9 @@ async def in_memory_repository(): """Create an in-memory FileDocumentRepository for testing.""" client = AsyncMongoMockClient() db = client.test_database - repo = FileDocumentRepository() - repo.db = db - repo.collection = db.files + repo = FileDocumentRepository(db) + # repo.db = db + # repo.collection = db.files await repo.initialize() return repo @@ -87,12 +87,15 @@ class TestFileDocumentRepositoryInitialization: async def test_i_can_initialize_repository(self): """Test repository initialization.""" # Arrange - repo = FileDocumentRepository() + client = AsyncMongoMockClient() + db = client.test_database + repo = FileDocumentRepository(db) await repo.initialize() # Act & Assert (should not raise any exception) assert repo.db is not None assert repo.collection is not None + # TODO: check that the indexes are created class TestFileDocumentRepositoryCreation: diff --git a/tests/test_document_service.py b/tests/test_document_service.py new file mode 100644 index 0000000..532c2c4 --- /dev/null +++ b/tests/test_document_service.py @@ -0,0 +1,697 @@ +""" +Unit tests for DocumentService using in-memory MongoDB. + +Tests the orchestration logic with real MongoDB operations +using mongomock for better integration testing.
+""" + +import pytest +import pytest_asyncio +from unittest.mock import Mock, patch +from datetime import datetime +from bson import ObjectId +from pathlib import Path + +from mongomock_motor import AsyncMongoMockClient + +from app.services.document_service import DocumentService +from app.database.repositories.document_repository import FileDocumentRepository +from app.database.repositories.document_content_repository import DocumentContentRepository +from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod +from app.models.types import PyObjectId + + +@pytest_asyncio.fixture +async def in_memory_file_repository(): + """Create an in-memory FileDocumentRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = FileDocumentRepository(db) + await repo.initialize() + return repo + + +@pytest_asyncio.fixture +async def in_memory_content_repository(): + """Create an in-memory DocumentContentRepository for testing.""" + client = AsyncMongoMockClient() + db = client.test_database + repo = DocumentContentRepository(db) + await repo.initialize() + return repo + + +@pytest_asyncio.fixture +async def in_memory_database(): + """Create an in-memory database for testing.""" + client = AsyncMongoMockClient() + return client.test_database + + +@pytest_asyncio.fixture +async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database): + """Create DocumentService with in-memory repositories.""" + with patch('app.services.document_service.get_database', return_value=in_memory_database): + service = DocumentService() + service.file_repository = in_memory_file_repository + service.content_repository = in_memory_content_repository + return service + + +@pytest.fixture +def sample_file_bytes(): + """Sample file content as bytes.""" + return b"This is a test PDF content" + + +@pytest.fixture +def sample_text_bytes(): + """Sample text file content as bytes.""" + return b"This is a 
test text file content" + + +@pytest.fixture +def sample_file_hash(): + """Expected SHA256 hash for sample file bytes.""" + import hashlib + return hashlib.sha256(b"This is a test PDF content").hexdigest() + + +@pytest.fixture +def sample_file_document(): + """Sample FileDocument for testing.""" + return FileDocument( + id=ObjectId(), + filename="test.pdf", + filepath="/test/test.pdf", + file_type=FileType.PDF, + extraction_method=None, + metadata={}, + detected_at=datetime(2024, 1, 15, 10, 30, 0), + file_hash="test_hash" + ) + + +class TestCreateDocument: + """Tests for create_document method.""" + + @patch('app.services.document_service.magic.from_buffer') + @patch('app.services.document_service.datetime') + @pytest.mark.asyncio + async def test_i_can_create_document_with_new_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + """Test creating document when content doesn't exist yet.""" + # Setup mocks + fixed_time = datetime(2024, 1, 15, 10, 30, 0) + mock_datetime.utcnow.return_value = fixed_time + mock_magic.return_value = "application/pdf" + + # Execute + result = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify document creation + assert result is not None + assert result.filename == "test.pdf" + assert result.filepath == "/test/test.pdf" + assert result.file_type == FileType.PDF + assert result.detected_at == fixed_time + assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) + + # Verify content was created + content = await document_service.content_repository.find_document_content_by_file_hash( + result.file_hash + ) + assert content is not None + assert content.file_hash == result.file_hash + assert content.file_size == len(sample_file_bytes) + assert content.mime_type == "application/pdf" + assert content.encoding == "utf-8" + + @patch('app.services.document_service.magic.from_buffer') + 
@patch('app.services.document_service.datetime') + @pytest.mark.asyncio + async def test_i_can_create_document_with_existing_content( + self, + mock_datetime, + mock_magic, + document_service, + sample_file_bytes + ): + """Test creating document when content already exists (deduplication).""" + # Setup mocks + fixed_time = datetime(2024, 1, 15, 10, 30, 0) + mock_datetime.utcnow.return_value = fixed_time + mock_magic.return_value = "application/pdf" + + # Create first document + first_doc = await document_service.create_document( + "/test/first.pdf", + sample_file_bytes, + "utf-8" + ) + + # Create second document with same content + second_doc = await document_service.create_document( + "/test/second.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify both documents exist but share same hash + assert first_doc.file_hash == second_doc.file_hash + assert first_doc.filename != second_doc.filename + assert first_doc.filepath != second_doc.filepath + + # Verify only one content document exists + all_content = await document_service.content_repository.list_document_contents() + content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash] + assert len(content_for_hash) == 1 + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_create_document_with_different_encodings( + self, + mock_magic, + document_service, + sample_text_bytes + ): + """Test creating documents with different text encodings.""" + # Setup + mock_magic.return_value = "text/plain" + + # Test with different encodings + encodings = ["utf-8", "latin-1", "ascii"] + + for i, encoding in enumerate(encodings): + result = await document_service.create_document( + f"/test/test{i}.txt", + sample_text_bytes, + encoding + ) + + # Verify document was created + assert result is not None + assert result.file_type == FileType.TXT + + # Verify content has correct encoding + content = await
document_service.content_repository.find_document_content_by_file_hash( + result.file_hash + ) + assert content.encoding == encoding + + @pytest.mark.asyncio + async def test_i_cannot_create_document_with_unsupported_file_type( + self, + document_service, + sample_file_bytes + ): + """Test that unsupported file types raise ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + await document_service.create_document( + "/test/test.xyz", # Unsupported extension + sample_file_bytes, + "utf-8" + ) + + @pytest.mark.asyncio + async def test_i_cannot_create_document_with_empty_file_path( + self, + document_service, + sample_file_bytes + ): + """Test that empty file path raises ValueError.""" + with pytest.raises(ValueError): + await document_service.create_document( + "", # Empty path + sample_file_bytes, + "utf-8" + ) + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_create_document_with_empty_bytes( + self, + mock_magic, + document_service + ): + """Test behavior with empty file bytes.""" + # Setup + mock_magic.return_value = "text/plain" + + # Execute with empty bytes + result = await document_service.create_document( + "/test/empty.txt", + b"", # Empty bytes + "utf-8" + ) + + # Should still work but with zero file size + assert result is not None + content = await document_service.content_repository.find_document_content_by_file_hash( + result.file_hash + ) + assert content.file_size == 0 + + +class TestGetMethods: + """Tests for document retrieval methods.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_id( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by ID.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) 
+ + # Execute + result = await document_service.get_document_by_id(created_doc.id) + + # Verify + assert result is not None + assert result.id == created_doc.id + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_hash( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file hash.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute + result = await document_service.get_document_by_hash(created_doc.file_hash) + + # Verify + assert result is not None + assert result.file_hash == created_doc.file_hash + assert result.filename == created_doc.filename + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_by_filepath( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document by file path.""" + # Setup + mock_magic.return_value = "application/pdf" + test_path = "/test/unique_test.pdf" + + # Create a document first + created_doc = await document_service.create_document( + test_path, + sample_file_bytes, + "utf-8" + ) + + # Execute + result = await document_service.get_document_by_filepath(test_path) + + # Verify + assert result is not None + assert result.filepath == test_path + assert result.id == created_doc.id + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_get_document_with_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test retrieving document with associated content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + 
sample_file_bytes, + "utf-8" + ) + + # Execute + result = await document_service.get_document_with_content(created_doc.id) + + # Verify + assert result is not None + document, content = result + assert document.id == created_doc.id + assert content is not None + assert content.file_hash == created_doc.file_hash + + @pytest.mark.asyncio + async def test_i_cannot_get_nonexistent_document_by_id( + self, + document_service + ): + """Test that nonexistent document returns None.""" + # Execute with random ObjectId + result = await document_service.get_document_by_id(ObjectId()) + + # Verify + assert result is None + + @pytest.mark.asyncio + async def test_i_cannot_get_nonexistent_document_by_hash( + self, + document_service + ): + """Test that nonexistent document hash returns None.""" + # Execute + result = await document_service.get_document_by_hash("nonexistent_hash") + + # Verify + assert result is None + + +class TestPaginationAndCounting: + """Tests for document listing and counting.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_list_documents_with_pagination( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document listing with pagination parameters.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create multiple documents + for i in range(5): + await document_service.create_document( + f"/test/test{i}.pdf", + sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique + "utf-8" + ) + + # Execute with pagination + result = await document_service.list_documents(skip=1, limit=2) + + # Verify + assert len(result) == 2 + + # Test counting + total_count = await document_service.count_documents() + assert total_count == 5 + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_count_documents( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test document counting.""" + # Setup + 
mock_magic.return_value = "text/plain" + + # Initially should be 0 + initial_count = await document_service.count_documents() + assert initial_count == 0 + + # Create some documents + for i in range(3): + await document_service.create_document( + f"/test/test{i}.txt", + sample_file_bytes + bytes(str(i), 'utf-8'), + "utf-8" + ) + + # Execute + final_count = await document_service.count_documents() + + # Verify + assert final_count == 3 + + +class TestUpdateAndDelete: + """Tests for document update and deletion operations.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_update_document_metadata( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test updating document metadata.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Execute update + update_data = {"metadata": {"page_count": 5}} + result = await document_service.update_document(created_doc.id, update_data) + + # Verify + assert result is not None + assert result.metadata.get("page_count") == 5 + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_delete_document_and_orphaned_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document with orphaned content cleanup.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Verify content exists + content_before = await document_service.content_repository.find_document_content_by_file_hash( + created_doc.file_hash + ) + assert content_before is not None + + # Execute deletion + result = await document_service.delete_document(created_doc.id) + + # Verify document and content are deleted + 
assert result is True + + deleted_doc = await document_service.get_document_by_id(created_doc.id) + assert deleted_doc is None + + content_after = await document_service.content_repository.find_document_content_by_file_hash( + created_doc.file_hash + ) + assert content_after is None + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_delete_document_without_affecting_shared_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test deleting document without removing shared content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create two documents with same content + doc1 = await document_service.create_document( + "/test/test1.pdf", + sample_file_bytes, + "utf-8" + ) + + doc2 = await document_service.create_document( + "/test/test2.pdf", + sample_file_bytes, + "utf-8" + ) + + # They should share the same hash + assert doc1.file_hash == doc2.file_hash + + # Delete first document + result = await document_service.delete_document(doc1.id) + assert result is True + + # Verify first document is deleted but content still exists + deleted_doc = await document_service.get_document_by_id(doc1.id) + assert deleted_doc is None + + remaining_doc = await document_service.get_document_by_id(doc2.id) + assert remaining_doc is not None + + content = await document_service.content_repository.find_document_content_by_file_hash( + doc2.file_hash + ) + assert content is not None + + +class TestUtilityMethods: + """Tests for utility methods.""" + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_check_content_exists( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test checking if content exists by hash.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Initially content doesn't exist + test_hash = "nonexistent_hash" + exists_before = await document_service.content_exists(test_hash) + assert 
exists_before is False + + # Create a document + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Now content should exist + exists_after = await document_service.content_exists(created_doc.file_hash) + assert exists_after is True + + @patch('app.services.document_service.magic.from_buffer') + @pytest.mark.asyncio + async def test_i_can_update_document_content( + self, + mock_magic, + document_service, + sample_file_bytes + ): + """Test updating extracted document content.""" + # Setup + mock_magic.return_value = "application/pdf" + + # Create a document first + created_doc = await document_service.create_document( + "/test/test.pdf", + sample_file_bytes, + "utf-8" + ) + + # Update content + new_content = "Updated extracted content" + result = await document_service.update_document_content( + created_doc.file_hash, + new_content + ) + + # Verify update + assert result is not None + assert result.content == new_content + + # Verify persistence + updated_content = await document_service.content_repository.find_document_content_by_file_hash( + created_doc.file_hash + ) + assert updated_content.content == new_content + + +class TestHashCalculation: + """Tests for file hash calculation utility.""" + + def test_i_can_calculate_consistent_file_hash(self, document_service): + """Test that file hash calculation is consistent.""" + test_bytes = b"Test content for hashing" + + # Calculate hash multiple times + hash1 = document_service._calculate_file_hash(test_bytes) + hash2 = document_service._calculate_file_hash(test_bytes) + + # Should be identical + assert hash1 == hash2 + assert len(hash1) == 64 # SHA256 produces 64-character hex string + + def test_i_get_different_hashes_for_different_content(self, document_service): + """Test that different content produces different hashes.""" + content1 = b"First content" + content2 = b"Second content" + + hash1 = document_service._calculate_file_hash(content1) + hash2 
= document_service._calculate_file_hash(content2) + + assert hash1 != hash2 + + +class TestFileTypeDetection: + """Tests for file type detection.""" + + def test_i_can_detect_pdf_file_type(self, document_service): + """Test PDF file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.pdf") + assert file_type == FileType.PDF + + def test_i_can_detect_txt_file_type(self, document_service): + """Test text file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.txt") + assert file_type == FileType.TXT + + def test_i_can_detect_docx_file_type(self, document_service): + """Test DOCX file type detection.""" + file_type = document_service._detect_file_type("/path/to/document.docx") + assert file_type == FileType.DOCX + + def test_i_cannot_detect_unsupported_file_type(self, document_service): + """Test unsupported file type raises ValueError.""" + with pytest.raises(ValueError, match="Unsupported file type"): + document_service._detect_file_type("/path/to/document.xyz") \ No newline at end of file diff --git a/tests/test_utils_document_matching.py b/tests/test_utils_document_matching.py index 5cdc941..ea83895 100644 --- a/tests/test_utils_document_matching.py +++ b/tests/test_utils_document_matching.py @@ -3,7 +3,7 @@ from datetime import datetime import pytest from app.models.document import FileDocument, FileType -from app.utils.ducment_matching import fuzzy_matching, subsequence_matching +from app.utils.document_matching import fuzzy_matching, subsequence_matching def get_doc(filename: str = None):