""" Unit tests for DocumentService using in-memory MongoDB. Tests the orchestration logic with real MongoDB operations using mongomock for better integration testing. """ import os from datetime import datetime from unittest.mock import patch import pytest import pytest_asyncio from bson import ObjectId from mongomock_motor import AsyncMongoMockClient from app.models.document import FileType from app.services.document_service import DocumentService @pytest.fixture(autouse=True) def cleanup_test_folder(): """Clean up test folder.""" import shutil shutil.rmtree("test_folder", ignore_errors=True) @pytest_asyncio.fixture async def in_memory_database(): """Create an in-memory database for testing.""" client = AsyncMongoMockClient() return client.test_database @pytest_asyncio.fixture async def document_service(in_memory_database): """Create DocumentService with in-memory repositories.""" service = DocumentService(in_memory_database, objects_folder="test_folder") return service @pytest.fixture def sample_file_bytes(): """Sample file content as bytes.""" return b"This is a test PDF content" @pytest.fixture def sample_text_bytes(): """Sample text file content as bytes.""" return b"This is a test text file content" @pytest.fixture def sample_file_hash(): """Expected SHA256 hash for sample file bytes.""" import hashlib return hashlib.sha256(b"This is a test PDF content").hexdigest() def validate_file_saved(document_service, file_hash, file_bytes): # Verify file is saved to disk target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) assert os.path.exists(target_file_path) with open(target_file_path, "rb") as f: content = f.read() assert content == file_bytes class TestCreateDocument: """Tests for create_document method.""" @patch('app.services.document_service.magic.from_buffer') @patch('app.services.document_service.datetime') @pytest.mark.asyncio async def test_i_can_create_document_with_new_content( self, mock_datetime, mock_magic, document_service, sample_file_bytes ): """Test creating document when content doesn't exist yet.""" # Setup mocks fixed_time = datetime(2025, 1, 1, 10, 30, 0) mock_datetime.now.return_value = fixed_time mock_magic.return_value = "application/pdf" # Execute result = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Verify document creation assert result is not None assert result.filename == "test.pdf" assert result.filepath == "/test/test.pdf" assert result.file_type == FileType.PDF assert result.detected_at == fixed_time assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) # Verify document created in database doc_in_db = await document_service.document_repository.find_document_by_id(result.id) assert doc_in_db is not None assert doc_in_db.id == result.id assert doc_in_db.filename == result.filename assert doc_in_db.filepath == result.filepath assert doc_in_db.file_type == result.file_type assert doc_in_db.detected_at == fixed_time assert doc_in_db.file_hash == result.file_hash # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, sample_file_bytes) @patch('app.services.document_service.magic.from_buffer') @patch('app.services.document_service.datetime') @pytest.mark.asyncio async def test_i_can_create_document_with_existing_content( self, mock_datetime, mock_magic, document_service, sample_file_bytes ): """Test creating document when content already exists (deduplication).""" # Setup mocks fixed_time = datetime(2025, 1, 1, 10, 30, 0) mock_datetime.now.return_value = fixed_time mock_magic.return_value = "application/pdf" # Create first document first_doc = await document_service.create_document( "/test/first.pdf", sample_file_bytes, "utf-8" ) # Create second document with same content second_doc = await document_service.create_document( "/test/second.pdf", sample_file_bytes, "utf-8" ) # Verify both documents exist but share same hash assert first_doc.file_hash == second_doc.file_hash assert first_doc.filename != second_doc.filename assert first_doc.filepath != second_doc.filepath @pytest.mark.asyncio async def test_i_cannot_create_document_with_unsupported_file_type( self, document_service, sample_file_bytes ): """Test that unsupported file types raise ValueError.""" with pytest.raises(ValueError, match="Unsupported file type"): await document_service.create_document( "/test/test.xyz", # Unsupported extension sample_file_bytes, "utf-8" ) @pytest.mark.asyncio async def test_i_cannot_create_document_with_empty_file_path( self, document_service, sample_file_bytes ): """Test that empty file path raises ValueError.""" with pytest.raises(ValueError): await document_service.create_document( "", # Empty path sample_file_bytes, "utf-8" ) @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_create_document_with_empty_bytes( self, mock_magic, document_service ): """Test behavior with empty file bytes.""" # Setup mock_magic.return_value = "text/plain" # Execute with empty bytes result = await document_service.create_document( "/test/empty.txt", b"", # Empty bytes "utf-8" ) # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, b"") class TestGetMethods: """Tests for document retrieval methods.""" @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_get_document_by_id( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by ID.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = await document_service.get_document_by_id(created_doc.id) # Verify assert result is not None assert result.id == created_doc.id assert result.filename == created_doc.filename @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_get_document_by_hash( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by file hash.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = await document_service.get_document_by_hash(created_doc.file_hash) # Verify assert result is not None assert result.file_hash == created_doc.file_hash assert result.filename == created_doc.filename @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_get_document_by_filepath( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by file path.""" # Setup mock_magic.return_value = "application/pdf" test_path = "/test/unique_test.pdf" # Create a document first created_doc = await document_service.create_document( test_path, sample_file_bytes, "utf-8" ) # Execute result = await document_service.get_document_by_filepath(test_path) # Verify assert result is not None assert result.filepath == test_path assert result.id == created_doc.id @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_get_document_content( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document with associated content.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = await document_service.get_document_content_by_hash(created_doc.file_hash) # Verify assert result == sample_file_bytes @pytest.mark.asyncio async def test_i_cannot_get_nonexistent_document_by_id( self, document_service ): """Test that nonexistent document returns None.""" # Execute with random ObjectId result = await document_service.get_document_by_id(ObjectId()) # Verify assert result is None @pytest.mark.asyncio async def test_i_cannot_get_nonexistent_document_by_hash( self, document_service ): """Test that nonexistent document hash returns None.""" # Execute result = await document_service.get_document_by_hash("nonexistent_hash") # Verify assert result is None class TestPaginationAndCounting: """Tests for document listing and counting.""" @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_list_documents_with_pagination( self, mock_magic, document_service, sample_file_bytes ): """Test document listing with pagination parameters.""" # Setup mock_magic.return_value = "application/pdf" # Create multiple documents for i in range(5): await document_service.create_document( f"/test/test{i}.pdf", sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique "utf-8" ) # Execute with pagination result = await document_service.list_documents(skip=1, limit=2) # Verify assert len(result) == 2 # Test counting total_count = await document_service.count_documents() assert total_count == 5 @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_count_documents( self, mock_magic, document_service, sample_file_bytes ): """Test document counting.""" # Setup mock_magic.return_value = "text/plain" # Initially should be 0 initial_count = await document_service.count_documents() assert initial_count == 0 # Create some documents for i in range(3): await document_service.create_document( f"/test/test{i}.txt", sample_file_bytes + bytes(str(i), 'utf-8'), "utf-8" ) # Execute final_count = await document_service.count_documents() # Verify assert final_count == 3 class TestUpdateAndDelete: """Tests for document update and deletion operations.""" @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_update_document_metadata( self, mock_magic, document_service, sample_file_bytes ): """Test updating document metadata.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute update update_data = {"metadata": {"page_count": 5}} result = await document_service.update_document(created_doc.id, update_data) # Verify assert result is not None assert result.metadata.get("page_count") == 5 assert result.filename == created_doc.filename assert result.filepath == created_doc.filepath assert result.file_hash == created_doc.file_hash assert result.file_type == created_doc.file_type assert result.metadata == update_data['metadata'] @pytest.mark.asyncio async def test_i_can_update_document_content( self, document_service, sample_file_bytes ): # Create a document first created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute update update_data = {"file_bytes": b"this is an updated file content"} result = await document_service.update_document(created_doc.id, update_data) assert result.filename == created_doc.filename assert result.filepath == created_doc.filepath assert result.file_hash != created_doc.file_hash assert result.file_type == created_doc.file_type assert result.metadata == created_doc.metadata # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, b"this is an updated file content") @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_delete_document_and_orphaned_content( self, mock_magic, document_service, sample_file_bytes ): """Test deleting document with orphaned content cleanup.""" # Setup mock_magic.return_value = "application/pdf" # Create a document created_doc = await document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Verify content exists validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes) # Execute deletion result = await document_service.delete_document(created_doc.id) # Verify document and content are deleted assert result is True deleted_doc = await document_service.get_document_by_id(created_doc.id) assert deleted_doc is None # validate content is deleted file_hash = created_doc.file_hash[:24] target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) assert not os.path.exists(target_file_path) @patch('app.services.document_service.magic.from_buffer') @pytest.mark.asyncio async def test_i_can_delete_document_without_affecting_shared_content( self, mock_magic, document_service, sample_file_bytes ): """Test deleting document without removing shared content.""" # Setup mock_magic.return_value = "application/pdf" # Create two documents with same content doc1 = await document_service.create_document( "/test/test1.pdf", sample_file_bytes, "utf-8" ) doc2 = await document_service.create_document( "/test/test2.pdf", sample_file_bytes, "utf-8" ) # They should share the same hash assert doc1.file_hash == doc2.file_hash # Delete first document result = await document_service.delete_document(doc1.id) assert result is True # Verify first document is deleted but content still exists deleted_doc = await document_service.get_document_by_id(doc1.id) assert deleted_doc is None remaining_doc = await document_service.get_document_by_id(doc2.id) assert remaining_doc is not None validate_file_saved(document_service, doc2.file_hash, sample_file_bytes) class TestHashCalculation: """Tests for file hash calculation utility.""" def test_i_can_calculate_consistent_file_hash(self, document_service): """Test that file hash calculation is consistent.""" test_bytes = b"Test content for hashing" # Calculate hash multiple times hash1 = document_service._calculate_file_hash(test_bytes) hash2 = document_service._calculate_file_hash(test_bytes) # Should be identical assert hash1 == hash2 assert len(hash1) == 64 # SHA256 produces 64-character hex string def test_i_get_different_hashes_for_different_content(self, document_service): """Test that different content produces different hashes.""" content1 = b"First content" content2 = b"Second content" hash1 = document_service._calculate_file_hash(content1) hash2 = document_service._calculate_file_hash(content2) assert hash1 != hash2 class TestFileTypeDetection: """Tests for file type detection.""" def test_i_can_detect_pdf_file_type(self, document_service): """Test PDF file type detection.""" file_type = document_service._detect_file_type("/path/to/document.pdf") assert file_type == FileType.PDF def test_i_can_detect_txt_file_type(self, document_service): """Test text file type detection.""" file_type = document_service._detect_file_type("/path/to/document.txt") assert file_type == FileType.TXT def test_i_can_detect_docx_file_type(self, document_service): """Test DOCX file type detection.""" file_type = document_service._detect_file_type("/path/to/document.docx") assert file_type == FileType.DOCX def test_i_cannot_detect_unsupported_file_type(self, document_service): """Test unsupported file type raises ValueError.""" with pytest.raises(ValueError, match="Unsupported file type"): document_service._detect_file_type("/path/to/document.xyz")