# Source file: MyDocManager/tests/test_document_service.py (697 lines, 23 KiB, Python)

"""
Unit tests for DocumentService using in-memory MongoDB.
Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import pytest
import pytest_asyncio
from unittest.mock import Mock, patch
from datetime import datetime
from bson import ObjectId
from pathlib import Path
from mongomock_motor import AsyncMongoMockClient
from app.services.document_service import DocumentService
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
from app.models.types import PyObjectId
@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Provide an initialized FileDocumentRepository on a mongomock database."""
    database = AsyncMongoMockClient().test_database
    repository = FileDocumentRepository(database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Provide an initialized DocumentContentRepository on a mongomock database."""
    database = AsyncMongoMockClient().test_database
    repository = DocumentContentRepository(database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide the ``test_database`` handle of a fresh mongomock client."""
    mock_client = AsyncMongoMockClient()
    database = mock_client.test_database
    return database
@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
    """Build a DocumentService wired to the in-memory repositories.

    ``get_database`` is patched only while the service is being constructed;
    afterwards both repositories are replaced with the in-memory fixtures.
    """
    database_patch = patch(
        'app.services.document_service.get_database',
        return_value=in_memory_database,
    )
    with database_patch:
        svc = DocumentService()
        # Swap in the fixtures so every test talks to mongomock-backed repositories.
        svc.file_repository = in_memory_file_repository
        svc.content_repository = in_memory_content_repository
        return svc
@pytest.fixture
def sample_file_bytes():
    """Raw bytes used as stand-in file content by the tests."""
    payload = b"This is a test PDF content"
    return payload
@pytest.fixture
def sample_text_bytes():
    """Raw bytes used as stand-in text-file content by the tests."""
    payload = b"This is a test text file content"
    return payload
@pytest.fixture
def sample_file_hash(sample_file_bytes):
    """Expected SHA256 hex digest of the bytes provided by ``sample_file_bytes``.

    The digest is derived from the fixture rather than from a second copy of
    the literal, so it cannot drift if the sample bytes are ever changed.
    """
    import hashlib

    return hashlib.sha256(sample_file_bytes).hexdigest()
@pytest.fixture
def sample_file_document():
    """Provide a FileDocument for a PDF with a fixed detection time and a fresh id."""
    detected = datetime(2024, 1, 15, 10, 30, 0)
    document = FileDocument(
        id=ObjectId(),
        filename="test.pdf",
        filepath="/test/test.pdf",
        file_type=FileType.PDF,
        extraction_method=None,
        metadata={},
        detected_at=detected,
        file_hash="test_hash",
    )
    return document
class TestCreateDocument:
    """Tests for DocumentService.create_document."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Creating a document for new bytes stores the document and its content."""
        # Pin the patched clock and MIME detection so the assertions are deterministic.
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # The returned document reflects the path, type, timestamp and hash.
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # A content record keyed by the same hash was written alongside it.
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Two documents with identical bytes share one content record (deduplication)."""
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Same bytes, two different paths.
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Both documents exist as distinct files but carry the same hash.
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

        # Only one content record may exist for that hash.
        all_content = await document_service.content_repository.list_document_content()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Each document's content record keeps the encoding it was created with."""
        mock_magic.return_value = "text/plain"

        encodings = ["utf-8", "latin-1", "ascii"]
        for i, encoding in enumerate(encodings):
            # Content is stored once per file hash (see the deduplication test
            # in this class), so identical bytes would presumably make later
            # iterations read back the record written for the first encoding.
            # Give every encoding its own bytes, and therefore its own hash.
            file_bytes = sample_text_bytes + encoding.encode("ascii")

            result = await document_service.create_document(
                f"/test/test{i}.txt",
                file_bytes,
                encoding
            )

            assert result is not None
            assert result.file_type == FileType.TXT

            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            # Fail with a clear assertion instead of an AttributeError on None.
            assert content is not None
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """An unsupported file extension makes create_document raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """An empty file path makes create_document raise ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Empty file bytes still produce a document, with a zero-size content record."""
        mock_magic.return_value = "text/plain"

        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        # Fail with a clear assertion instead of an AttributeError on None.
        assert content is not None
        assert content.file_size == 0
class TestGetMethods:
    """Tests for the DocumentService lookup methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its id."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_id(stored.id)

        assert fetched is not None
        assert fetched.id == stored.id
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its file hash."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_hash(stored.file_hash)

        assert fetched is not None
        assert fetched.file_hash == stored.file_hash
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its file path."""
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"
        stored = await document_service.create_document(
            test_path, sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_filepath(test_path)

        assert fetched is not None
        assert fetched.filepath == test_path
        assert fetched.id == stored.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """get_document_with_content yields the document together with its content."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        pair = await document_service.get_document_with_content(stored.id)

        assert pair is not None
        # The result unpacks into (document, content).
        document, content = pair
        assert document.id == stored.id
        assert content is not None
        assert content.file_hash == stored.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Looking up an id that was never stored yields None."""
        unknown_id = ObjectId()
        fetched = await document_service.get_document_by_id(unknown_id)
        assert fetched is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Looking up a hash that was never stored yields None."""
        fetched = await document_service.get_document_by_hash("nonexistent_hash")
        assert fetched is None
class TestPaginationAndCounting:
    """Tests for listing documents page by page and counting them."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """skip/limit restrict the listed page while the count covers everything."""
        mock_magic.return_value = "application/pdf"

        # Five documents, each with a distinct suffix so the bytes differ.
        for index in range(5):
            unique_bytes = sample_file_bytes + str(index).encode('utf-8')
            await document_service.create_document(
                f"/test/test{index}.pdf", unique_bytes, "utf-8"
            )

        page = await document_service.list_documents(skip=1, limit=2)
        assert len(page) == 2

        # The total is unaffected by the pagination arguments used above.
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """The count starts at zero and grows by one per created document."""
        mock_magic.return_value = "text/plain"

        count_before = await document_service.count_documents()
        assert count_before == 0

        for index in range(3):
            unique_bytes = sample_file_bytes + str(index).encode('utf-8')
            await document_service.create_document(
                f"/test/test{index}.txt", unique_bytes, "utf-8"
            )

        count_after = await document_service.count_documents()
        assert count_after == 3
class TestUpdateAndDelete:
    """Tests for updating and deleting documents."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """update_document returns the document carrying the new metadata."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        changes = {"metadata": {"page_count": 5}}
        updated = await document_service.update_document(stored.id, changes)

        assert updated is not None
        assert updated.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting the only document for a hash also removes its content record."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )
        content_repo = document_service.content_repository

        # Precondition: the content record is present before the delete.
        content_before = await content_repo.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_before is not None

        deleted = await document_service.delete_document(stored.id)
        assert deleted is True

        # Neither the document nor its content can be found afterwards.
        lookup = await document_service.get_document_by_id(stored.id)
        assert lookup is None
        content_after = await content_repo.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting one of two documents sharing a hash keeps the content record."""
        mock_magic.return_value = "application/pdf"

        # Identical bytes under two paths.
        doc1 = await document_service.create_document(
            "/test/test1.pdf", sample_file_bytes, "utf-8"
        )
        doc2 = await document_service.create_document(
            "/test/test2.pdf", sample_file_bytes, "utf-8"
        )
        assert doc1.file_hash == doc2.file_hash

        deleted = await document_service.delete_document(doc1.id)
        assert deleted is True

        # The first document is gone, the second one is untouched.
        gone = await document_service.get_document_by_id(doc1.id)
        assert gone is None
        survivor = await document_service.get_document_by_id(doc2.id)
        assert survivor is not None

        # The content record is still there for the surviving document's hash.
        content = await document_service.content_repository.find_document_content_by_file_hash(
            doc2.file_hash
        )
        assert content is not None
class TestUtilityMethods:
    """Tests for content_exists and update_document_content."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """content_exists is False for an unknown hash and True once a document is created."""
        mock_magic.return_value = "application/pdf"

        # Nothing has been stored yet, so an arbitrary hash is not found.
        unknown_hash = "nonexistent_hash"
        found_before = await document_service.content_exists(unknown_hash)
        assert found_before is False

        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        found_after = await document_service.content_exists(stored.file_hash)
        assert found_after is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """update_document_content returns the new content and persists it."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        replacement = "Updated extracted content"
        updated = await document_service.update_document_content(
            stored.file_hash, replacement
        )
        assert updated is not None
        assert updated.content == replacement

        # Read the record back through the repository to confirm it was saved.
        reloaded = await document_service.content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert reloaded.content == replacement
class TestHashCalculation:
    """Tests for DocumentService._calculate_file_hash."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        payload = b"Test content for hashing"

        first_digest = document_service._calculate_file_hash(payload)
        second_digest = document_service._calculate_file_hash(payload)

        assert first_digest == second_digest
        # 64 characters is the length of a SHA256 hex digest.
        assert len(first_digest) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Different byte strings hash to different digests."""
        digest_a = document_service._calculate_file_hash(b"First content")
        digest_b = document_service._calculate_file_hash(b"Second content")

        assert digest_a != digest_b
class TestFileTypeDetection:
    """Tests for DocumentService._detect_file_type."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as FileType.PDF."""
        detected = document_service._detect_file_type("/path/to/document.pdf")
        assert detected == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as FileType.TXT."""
        detected = document_service._detect_file_type("/path/to/document.txt")
        assert detected == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as FileType.DOCX."""
        detected = document_service._detect_file_type("/path/to/document.docx")
        assert detected == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unsupported extension raises ValueError mentioning the unsupported type."""
        unsupported_path = "/path/to/document.xyz"
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unsupported_path)