"""
Unit tests for DocumentService using in-memory MongoDB.

Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""

import hashlib
from datetime import datetime
from pathlib import Path
from unittest.mock import Mock, patch

import pytest
import pytest_asyncio
from bson import ObjectId
from mongomock_motor import AsyncMongoMockClient

from app.services.document_service import DocumentService
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
from app.models.types import PyObjectId


@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Create an in-memory FileDocumentRepository for testing."""
    client = AsyncMongoMockClient()
    db = client.test_database
    repo = FileDocumentRepository(db)
    await repo.initialize()
    return repo


@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Create an in-memory DocumentContentRepository for testing."""
    client = AsyncMongoMockClient()
    db = client.test_database
    repo = DocumentContentRepository(db)
    await repo.initialize()
    return repo


@pytest_asyncio.fixture
async def in_memory_database():
    """Create an in-memory database for testing."""
    client = AsyncMongoMockClient()
    return client.test_database


@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
    """Create DocumentService with in-memory repositories.

    ``get_database`` is patched only while ``DocumentService()`` is being
    constructed; the repositories are then replaced with the in-memory ones.
    """
    # NOTE(review): the two repository fixtures and `in_memory_database` each
    # build their own AsyncMongoMockClient() - confirm the service is not
    # expected to reach repository data through the patched database.
    with patch('app.services.document_service.get_database', return_value=in_memory_database):
        service = DocumentService()
        service.file_repository = in_memory_file_repository
        service.content_repository = in_memory_content_repository
        return service


@pytest.fixture
def sample_file_bytes():
    """Sample file content as bytes."""
    return b"This is a test PDF content"


@pytest.fixture
def sample_text_bytes():
    """Sample text file content as bytes."""
    return b"This is a test text file content"


@pytest.fixture
def sample_file_hash(sample_file_bytes):
    """Expected SHA256 hash for sample file bytes."""
    # Derived from the fixture so the hash can never drift from the bytes.
    return hashlib.sha256(sample_file_bytes).hexdigest()


@pytest.fixture
def sample_file_document():
    """Sample FileDocument for testing."""
    return FileDocument(
        id=ObjectId(),
        filename="test.pdf",
        filepath="/test/test.pdf",
        file_type=FileType.PDF,
        extraction_method=None,
        metadata={},
        detected_at=datetime(2024, 1, 15, 10, 30, 0),
        file_hash="test_hash"
    )


class TestCreateDocument:
    """Tests for create_document method."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content doesn't exist yet."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify content was created
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating document when content already exists (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create first document
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Create second document with same content
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify both documents exist but share same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

        # Verify only one content document exists
        all_content = await document_service.content_repository.list_document_content()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Test creating documents with different text encodings."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Test with different encodings
        encodings = ["utf-8", "latin-1", "ascii"]

        for i, encoding in enumerate(encodings):
            # Content is expected to be stored once per hash (see
            # test_i_can_create_document_with_existing_content), so each
            # iteration needs its own bytes; otherwise the lookup below could
            # return the record written for an earlier encoding.
            unique_bytes = sample_text_bytes + bytes(str(i), 'utf-8')

            result = await document_service.create_document(
                f"/test/test{i}.txt",
                unique_bytes,
                encoding
            )

            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT

            # Verify content has correct encoding
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content is not None
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that empty file path raises ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content.file_size == 0


class TestGetMethods:
    """Tests for document retrieval methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by ID."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_id(created_doc.id)

        # Verify
        assert result is not None
        assert result.id == created_doc.id
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_hash(created_doc.file_hash)

        # Verify
        assert result is not None
        assert result.file_hash == created_doc.file_hash
        assert result.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document by file path."""
        # Setup
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            test_path,
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_by_filepath(test_path)

        # Verify
        assert result is not None
        assert result.filepath == test_path
        assert result.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test retrieving document with associated content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute
        result = await document_service.get_document_with_content(created_doc.id)

        # Verify
        assert result is not None
        document, content = result
        assert document.id == created_doc.id
        assert content is not None
        assert content.file_hash == created_doc.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Test that nonexistent document returns None."""
        # Execute with random ObjectId
        result = await document_service.get_document_by_id(ObjectId())

        # Verify
        assert result is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Test that nonexistent document hash returns None."""
        # Execute
        result = await document_service.get_document_by_hash("nonexistent_hash")

        # Verify
        assert result is None


class TestPaginationAndCounting:
    """Tests for document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document listing with pagination parameters."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create multiple documents
        for i in range(5):
            await document_service.create_document(
                f"/test/test{i}.pdf",
                sample_file_bytes + bytes(str(i), 'utf-8'),  # Make each file unique
                "utf-8"
            )

        # Execute with pagination
        result = await document_service.list_documents(skip=1, limit=2)

        # Verify
        assert len(result) == 2

        # Test counting
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test document counting."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Initially should be 0
        initial_count = await document_service.count_documents()
        assert initial_count == 0

        # Create some documents
        for i in range(3):
            await document_service.create_document(
                f"/test/test{i}.txt",
                sample_file_bytes + bytes(str(i), 'utf-8'),
                "utf-8"
            )

        # Execute
        final_count = await document_service.count_documents()

        # Verify
        assert final_count == 3


class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating document metadata."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"metadata": {"page_count": 5}}
        result = await document_service.update_document(created_doc.id, update_data)

        # Verify
        assert result is not None
        assert result.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document with orphaned content cleanup."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify content exists
        content_before = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert content_before is not None

        # Execute deletion
        result = await document_service.delete_document(created_doc.id)

        # Verify document and content are deleted
        assert result is True

        deleted_doc = await document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        content_after = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert content_after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document without removing shared content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create two documents with same content
        doc1 = await document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )

        doc2 = await document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # They should share the same hash
        assert doc1.file_hash == doc2.file_hash

        # Delete first document
        result = await document_service.delete_document(doc1.id)
        assert result is True

        # Verify first document is deleted but content still exists
        deleted_doc = await document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None

        remaining_doc = await document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None

        content = await document_service.content_repository.find_document_content_by_file_hash(
            doc2.file_hash
        )
        assert content is not None


class TestUtilityMethods:
    """Tests for utility methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test checking if content exists by hash."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Initially content doesn't exist
        test_hash = "nonexistent_hash"
        exists_before = await document_service.content_exists(test_hash)
        assert exists_before is False

        # Create a document
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Now content should exist
        exists_after = await document_service.content_exists(created_doc.file_hash)
        assert exists_after is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating extracted document content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Update content
        new_content = "Updated extracted content"
        result = await document_service.update_document_content(
            created_doc.file_hash,
            new_content
        )

        # Verify update
        assert result is not None
        assert result.content == new_content

        # Verify persistence
        updated_content = await document_service.content_repository.find_document_content_by_file_hash(
            created_doc.file_hash
        )
        assert updated_content.content == new_content


class TestHashCalculation:
    """Tests for file hash calculation utility."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Test that file hash calculation is consistent."""
        test_bytes = b"Test content for hashing"

        # Calculate hash multiple times
        hash1 = document_service._calculate_file_hash(test_bytes)
        hash2 = document_service._calculate_file_hash(test_bytes)

        # Should be identical
        assert hash1 == hash2
        assert len(hash1) == 64  # SHA256 produces 64-character hex string

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Test that different content produces different hashes."""
        content1 = b"First content"
        content2 = b"Second content"

        hash1 = document_service._calculate_file_hash(content1)
        hash2 = document_service._calculate_file_hash(content2)

        assert hash1 != hash2


class TestFileTypeDetection:
    """Tests for file type detection."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """Test PDF file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.pdf")
        assert file_type == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """Test text file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.txt")
        assert file_type == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """Test DOCX file type detection."""
        file_type = document_service._detect_file_type("/path/to/document.docx")
        assert file_type == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """Test unsupported file type raises ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type("/path/to/document.xyz")