""" Unit tests for DocumentService using in-memory MongoDB. Tests the orchestration logic with real MongoDB operations using mongomock for better integration testing. """ import os from datetime import datetime from unittest.mock import patch import pytest import pytest_asyncio from bson import ObjectId from mongomock.mongo_client import MongoClient from app.models.document import FileType from app.services.document_service import DocumentService @pytest.fixture(autouse=True) def cleanup_test_folder(): """Clean up test folder.""" import shutil shutil.rmtree("test_folder", ignore_errors=True) @pytest.fixture def in_memory_database(): """Create an in-memory database for testing.""" client = MongoClient() return client.test_database @pytest_asyncio.fixture def document_service(in_memory_database): """Create DocumentService with in-memory repositories.""" service = DocumentService(in_memory_database, objects_folder="test_folder") return service @pytest.fixture def sample_file_bytes(): """Sample file content as bytes.""" return b"This is a test PDF content" @pytest.fixture def sample_text_bytes(): """Sample text file content as bytes.""" return b"This is a test text file content" @pytest.fixture def sample_file_hash(): """Expected SHA256 hash for sample file bytes.""" import hashlib return hashlib.sha256(b"This is a test PDF content").hexdigest() def validate_file_saved(document_service, file_hash, file_bytes): # Verify file is saved to disk target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) assert os.path.exists(target_file_path) with open(target_file_path, "rb") as f: content = f.read() assert content == file_bytes class TestCreateDocument: """Tests for create_document method.""" @patch('app.services.document_service.magic.from_buffer') @patch('app.services.document_service.datetime') def test_i_can_create_document_with_new_content( self, mock_datetime, mock_magic, document_service, sample_file_bytes ): """Test creating document when content doesn't exist yet.""" # Setup mocks fixed_time = datetime(2025, 1, 1, 10, 30, 0) mock_datetime.now.return_value = fixed_time mock_magic.return_value = "application/pdf" # Execute result = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Verify document creation assert result is not None assert result.filename == "test.pdf" assert result.filepath == "/test/test.pdf" assert result.file_type == FileType.PDF assert result.detected_at == fixed_time assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes) # Verify document created in database doc_in_db = document_service.document_repository.find_document_by_id(result.id) assert doc_in_db is not None assert doc_in_db.id == result.id assert doc_in_db.filename == result.filename assert doc_in_db.filepath == result.filepath assert doc_in_db.file_type == result.file_type assert doc_in_db.detected_at == fixed_time assert doc_in_db.file_hash == result.file_hash # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, sample_file_bytes) @patch('app.services.document_service.magic.from_buffer') @patch('app.services.document_service.datetime') def test_i_can_create_document_with_existing_content( self, mock_datetime, mock_magic, document_service, sample_file_bytes ): """Test creating document when content already exists (deduplication).""" # Setup mocks fixed_time = datetime(2025, 1, 1, 10, 30, 0) mock_datetime.now.return_value = fixed_time mock_magic.return_value = "application/pdf" # Create first document first_doc = document_service.create_document( "/test/first.pdf", sample_file_bytes, "utf-8" ) # Create second document with same content second_doc = document_service.create_document( "/test/second.pdf", sample_file_bytes, "utf-8" ) # Verify both documents exist but share same hash assert first_doc.file_hash == second_doc.file_hash assert first_doc.filename != second_doc.filename assert first_doc.filepath != second_doc.filepath def test_i_cannot_create_document_with_unsupported_file_type( self, document_service, sample_file_bytes ): """Test that unsupported file types raise ValueError.""" with pytest.raises(ValueError, match="Unsupported file type"): document_service.create_document( "/test/test.xyz", # Unsupported extension sample_file_bytes, "utf-8" ) def test_i_cannot_create_document_with_empty_file_path( self, document_service, sample_file_bytes ): """Test that empty file path raises ValueError.""" with pytest.raises(ValueError): document_service.create_document( "", # Empty path sample_file_bytes, "utf-8" ) @patch('app.services.document_service.magic.from_buffer') def test_i_can_create_document_with_empty_bytes( self, mock_magic, document_service ): """Test behavior with empty file bytes.""" # Setup mock_magic.return_value = "text/plain" # Execute with empty bytes result = document_service.create_document( "/test/empty.txt", b"", # Empty bytes "utf-8" ) # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, b"") class TestGetMethods: """Tests for document retrieval methods.""" @patch('app.services.document_service.magic.from_buffer') def test_i_can_get_document_by_id( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by ID.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = document_service.get_document_by_id(created_doc.id) # Verify assert result is not None assert result.id == created_doc.id assert result.filename == created_doc.filename @patch('app.services.document_service.magic.from_buffer') def test_i_can_get_document_by_hash( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by file hash.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = document_service.get_document_by_hash(created_doc.file_hash) # Verify assert result is not None assert result.file_hash == created_doc.file_hash assert result.filename == created_doc.filename @patch('app.services.document_service.magic.from_buffer') def test_i_can_get_document_by_filepath( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document by file path.""" # Setup mock_magic.return_value = "application/pdf" test_path = "/test/unique_test.pdf" # Create a document first created_doc = document_service.create_document( test_path, sample_file_bytes, "utf-8" ) # Execute result = document_service.get_document_by_filepath(test_path) # Verify assert result is not None assert result.filepath == test_path assert result.id == created_doc.id @patch('app.services.document_service.magic.from_buffer') def test_i_can_get_document_content( self, mock_magic, document_service, sample_file_bytes ): """Test retrieving document with associated content.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute result = document_service.get_document_content_by_hash(created_doc.file_hash) # Verify assert result == sample_file_bytes def test_i_cannot_get_nonexistent_document_by_id( self, document_service ): """Test that nonexistent document returns None.""" # Execute with random ObjectId result = document_service.get_document_by_id(ObjectId()) # Verify assert result is None def test_i_cannot_get_nonexistent_document_by_hash( self, document_service ): """Test that nonexistent document hash returns None.""" # Execute result = document_service.get_document_by_hash("nonexistent_hash") # Verify assert result is None class TestPaginationAndCounting: """Tests for document listing and counting.""" @patch('app.services.document_service.magic.from_buffer') def test_i_can_list_documents_with_pagination( self, mock_magic, document_service, sample_file_bytes ): """Test document listing with pagination parameters.""" # Setup mock_magic.return_value = "application/pdf" # Create multiple documents for i in range(5): document_service.create_document( f"/test/test{i}.pdf", sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique "utf-8" ) # Execute with pagination result = document_service.list_documents(skip=1, limit=2) # Verify assert len(result) == 2 # Test counting total_count = document_service.count_documents() assert total_count == 5 @patch('app.services.document_service.magic.from_buffer') def test_i_can_count_documents( self, mock_magic, document_service, sample_file_bytes ): """Test document counting.""" # Setup mock_magic.return_value = "text/plain" # Initially should be 0 initial_count = document_service.count_documents() assert initial_count == 0 # Create some documents for i in range(3): document_service.create_document( f"/test/test{i}.txt", sample_file_bytes + bytes(str(i), 'utf-8'), "utf-8" ) # Execute final_count = document_service.count_documents() # Verify assert final_count == 3 class TestUpdateAndDelete: """Tests for document update and deletion operations.""" @patch('app.services.document_service.magic.from_buffer') def test_i_can_update_document_metadata( self, mock_magic, document_service, sample_file_bytes ): """Test updating document metadata.""" # Setup mock_magic.return_value = "application/pdf" # Create a document first created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute update update_data = {"metadata": {"page_count": 5}} result = document_service.update_document(created_doc.id, update_data) # Verify assert result is not None assert result.metadata.get("page_count") == 5 assert result.filename == created_doc.filename assert result.filepath == created_doc.filepath assert result.file_hash == created_doc.file_hash assert result.file_type == created_doc.file_type assert result.metadata == update_data['metadata'] def test_i_can_update_document_content( self, document_service, sample_file_bytes ): # Create a document first created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Execute update update_data = {"file_bytes": b"this is an updated file content"} result = document_service.update_document(created_doc.id, update_data) assert result.filename == created_doc.filename assert result.filepath == created_doc.filepath assert result.file_hash != created_doc.file_hash assert result.file_type == created_doc.file_type assert result.metadata == created_doc.metadata # Verify file is saved to disk validate_file_saved(document_service, result.file_hash, b"this is an updated file content") @patch('app.services.document_service.magic.from_buffer') def test_i_can_delete_document_and_orphaned_content( self, mock_magic, document_service, sample_file_bytes ): """Test deleting document with orphaned content cleanup.""" # Setup mock_magic.return_value = "application/pdf" # Create a document created_doc = document_service.create_document( "/test/test.pdf", sample_file_bytes, "utf-8" ) # Verify content exists validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes) # Execute deletion result = document_service.delete_document(created_doc.id) # Verify document and content are deleted assert result is True deleted_doc = document_service.get_document_by_id(created_doc.id) assert deleted_doc is None # validate content is deleted file_hash = created_doc.file_hash[:24] target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash) assert not os.path.exists(target_file_path) @patch('app.services.document_service.magic.from_buffer') def test_i_can_delete_document_without_affecting_shared_content( self, mock_magic, document_service, sample_file_bytes ): """Test deleting document without removing shared content.""" # Setup mock_magic.return_value = "application/pdf" # Create two documents with same content doc1 = document_service.create_document( "/test/test1.pdf", sample_file_bytes, "utf-8" ) doc2 = document_service.create_document( "/test/test2.pdf", sample_file_bytes, "utf-8" ) # They should share the same hash assert doc1.file_hash == doc2.file_hash # Delete first document result = document_service.delete_document(doc1.id) assert result is True # Verify first document is deleted but content still exists deleted_doc = document_service.get_document_by_id(doc1.id) assert deleted_doc is None remaining_doc = document_service.get_document_by_id(doc2.id) assert remaining_doc is not None validate_file_saved(document_service, doc2.file_hash, sample_file_bytes) class TestHashCalculation: """Tests for file hash calculation utility.""" def test_i_can_calculate_consistent_file_hash(self, document_service): """Test that file hash calculation is consistent.""" test_bytes = b"Test content for hashing" # Calculate hash multiple times hash1 = document_service._calculate_file_hash(test_bytes) hash2 = document_service._calculate_file_hash(test_bytes) # Should be identical assert hash1 == hash2 assert len(hash1) == 64 # SHA256 produces 64-character hex string def test_i_get_different_hashes_for_different_content(self, document_service): """Test that different content produces different hashes.""" content1 = b"First content" content2 = b"Second content" hash1 = document_service._calculate_file_hash(content1) hash2 = document_service._calculate_file_hash(content2) assert hash1 != hash2 class TestFileTypeDetection: """Tests for file type detection.""" def test_i_can_detect_pdf_file_type(self, document_service): """Test PDF file type detection.""" file_type = document_service._detect_file_type("/path/to/document.pdf") assert file_type == FileType.PDF def test_i_can_detect_txt_file_type(self, document_service): """Test text file type detection.""" file_type = document_service._detect_file_type("/path/to/document.txt") assert file_type == FileType.TXT def test_i_can_detect_docx_file_type(self, document_service): """Test DOCX file type detection.""" file_type = document_service._detect_file_type("/path/to/document.docx") assert file_type == FileType.DOCX def test_i_cannot_detect_unsupported_file_type(self, document_service): """Test unsupported file type raises ValueError.""" with pytest.raises(ValueError, match="Unsupported file type"): document_service._detect_file_type("/path/to/document.xyz") class TestCreatePdf: """Tests for create_pdf method.""" @patch('app.services.document_service.convert_to_pdf') @patch('app.services.document_service.magic.from_buffer') def test_i_can_create_pdf_successfully( self, mock_magic, mock_convert_to_pdf, document_service, sample_file_bytes ): """Test creating PDF from an existing document.""" # Setup mock_magic.return_value = "text/plain" # Create a document first created_doc = document_service.create_document( "/test/test.txt", sample_file_bytes, "utf-8" ) # Mock the PDF conversion pdf_path = os.path.join(document_service.temp_folder, "converted.pdf") mock_convert_to_pdf.return_value = pdf_path # Write a sample PDF file that the conversion would create pdf_content = b"This is PDF content" os.makedirs(os.path.dirname(pdf_path), exist_ok=True) with open(pdf_path, "wb") as f: f.write(pdf_content) # Execute result = document_service.create_pdf(created_doc.id) # Verify assert result is True # Get the updated document updated_doc = document_service.get_document_by_id(created_doc.id) assert updated_doc.pdf_file_hash is not None # Verify the PDF content was saved pdf_hash = document_service._calculate_file_hash(pdf_content) assert updated_doc.pdf_file_hash == pdf_hash # Verify convert_to_pdf was called with correct arguments doc_path = document_service.get_document_path(created_doc.file_hash) mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder) # Verify content exists on disk validate_file_saved(document_service, pdf_hash, pdf_content) # Verify PDF hash was added to document updated_doc = document_service.get_document_by_id(created_doc.id) pdf_hash = document_service._calculate_file_hash(pdf_content) assert updated_doc.pdf_file_hash == pdf_hash @patch('app.services.document_service.convert_to_pdf') @patch('app.services.document_service.magic.from_buffer') def test_i_can_reuse_existing_pdf( self, mock_magic, mock_convert_to_pdf, document_service, sample_file_bytes ): """Test that if PDF already exists, it doesn't recreate it.""" # Setup mock_magic.return_value = "text/plain" # Create a document first created_doc = document_service.create_document( "/test/test.txt", sample_file_bytes, "utf-8" ) # Create a fake PDF file and update the document pdf_content = b"This is PDF content" pdf_hash = document_service._calculate_file_hash(pdf_content) document_service.save_content_if_needed(pdf_hash, pdf_content) document_service.update_document(created_doc.id, {"pdf_file_hash": pdf_hash}) # Execute result = document_service.create_pdf(created_doc.id) # Verify assert result is True # Verify convert_to_pdf was NOT called mock_convert_to_pdf.assert_not_called() def test_i_cannot_create_pdf_for_nonexistent_document( self, document_service ): """Test behavior when document ID doesn't exist.""" # Execute with random ObjectId result = document_service.create_pdf(ObjectId()) # Verify assert result is False @patch('app.services.document_service.magic.from_buffer') def test_i_cannot_create_pdf_when_file_content_missing( self, mock_magic, document_service, sample_file_bytes ): """Test behavior when file content doesn't exist.""" # Setup mock_magic.return_value = "text/plain" # Create a document created_doc = document_service.create_document( "/test/test.txt", sample_file_bytes, "utf-8" ) # Simulate missing content by removing file file_path = document_service.get_document_path(created_doc.file_hash) os.remove(file_path) # Execute result = document_service.create_pdf(created_doc.id) # Verify assert result is False