697 lines
23 KiB
Python
697 lines
23 KiB
Python
"""
|
|
Unit tests for DocumentService using in-memory MongoDB.
|
|
|
|
Tests the orchestration logic with real MongoDB operations
|
|
using mongomock for better integration testing.
|
|
"""
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from unittest.mock import Mock, patch
|
|
from datetime import datetime
|
|
from bson import ObjectId
|
|
from pathlib import Path
|
|
|
|
from mongomock_motor import AsyncMongoMockClient
|
|
|
|
from app.services.document_service import DocumentService
|
|
from app.database.repositories.document_repository import FileDocumentRepository
|
|
from app.database.repositories.document_content_repository import DocumentContentRepository
|
|
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
|
|
from app.models.types import PyObjectId
|
|
|
|
|
|
@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Provide an initialized FileDocumentRepository on a fresh mock client."""
    # Each invocation gets its own AsyncMongoMockClient, hence its own database.
    repository = FileDocumentRepository(AsyncMongoMockClient().test_database)
    await repository.initialize()
    return repository
|
|
|
|
|
|
@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Provide an initialized DocumentContentRepository on a fresh mock client."""
    # Each invocation gets its own AsyncMongoMockClient, hence its own database.
    repository = DocumentContentRepository(AsyncMongoMockClient().test_database)
    await repository.initialize()
    return repository
|
|
|
|
|
|
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide the ``test_database`` handle of a fresh mock Mongo client."""
    mock_client = AsyncMongoMockClient()
    database = mock_client.test_database
    return database
|
|
|
|
|
|
@pytest_asyncio.fixture
async def document_service(
    in_memory_file_repository,
    in_memory_content_repository,
    in_memory_database,
):
    """Build a DocumentService wired to the in-memory repository fixtures.

    ``get_database`` in the service module is patched while the service is
    constructed, so any lookup made during construction receives the
    in-memory database. The repositories are then replaced explicitly with
    the fixture-provided instances.
    """
    database_patch = patch(
        'app.services.document_service.get_database',
        return_value=in_memory_database,
    )
    with database_patch:
        svc = DocumentService()
        svc.file_repository = in_memory_file_repository
        svc.content_repository = in_memory_content_repository
        return svc
|
|
|
|
|
|
@pytest.fixture
def sample_file_bytes():
    """Return the raw bytes used as a stand-in file payload in the tests."""
    payload = b"This is a test PDF content"
    return payload
|
|
|
|
|
|
@pytest.fixture
def sample_text_bytes():
    """Return the raw bytes used as a stand-in text-file payload in the tests."""
    payload = b"This is a test text file content"
    return payload
|
|
|
|
|
|
@pytest.fixture
def sample_file_hash(sample_file_bytes):
    """Return the expected SHA256 hex digest of ``sample_file_bytes``.

    The digest is derived from the ``sample_file_bytes`` fixture rather than
    from a second copy of the literal, so the two can never drift apart.
    """
    # Local import: hashlib is not among the module-level imports of this file.
    import hashlib

    return hashlib.sha256(sample_file_bytes).hexdigest()
|
|
|
|
|
|
@pytest.fixture
def sample_file_document():
    """Return a FileDocument describing a PDF with fixed, known field values."""
    fields = {
        "id": ObjectId(),  # fresh identifier on every fixture invocation
        "filename": "test.pdf",
        "filepath": "/test/test.pdf",
        "file_type": FileType.PDF,
        "extraction_method": None,
        "metadata": {},
        "detected_at": datetime(2024, 1, 15, 10, 30, 0),
        "file_hash": "test_hash",
    }
    return FileDocument(**fields)
|
|
|
|
|
|
class TestCreateDocument:
    """Tests for the create_document method."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating a document when its content doesn't exist yet."""
        # Stub the patched datetime.utcnow and magic.from_buffer so the
        # timestamp and MIME type asserted below are deterministic.
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify a content record keyed by the same hash was created
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test creating a document when its content already exists (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create two documents at different paths from identical bytes
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Both documents exist but share the same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

        # Only one content record may exist for that hash
        all_content = await document_service.content_repository.list_document_content()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Test creating documents with different text encodings."""
        # Setup
        mock_magic.return_value = "text/plain"

        encodings = ["utf-8", "latin-1", "ascii"]

        for i, encoding in enumerate(encodings):
            # Content is stored once per file hash (asserted by the
            # existing-content test in this class). Reusing the same bytes for
            # every encoding would make each lookup below hit one shared
            # content record, so give every iteration its own payload.
            unique_bytes = sample_text_bytes + bytes(str(i), 'utf-8')

            result = await document_service.create_document(
                f"/test/test{i}.txt",
                unique_bytes,
                encoding
            )

            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT

            # Verify content has the encoding passed for this payload
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content is not None
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that unsupported file types raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that an empty file path raises ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Test behavior with empty file bytes."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_size == 0
|
|
|
|
|
|
class TestGetMethods:
    """Tests covering the document lookup methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched back through its ID."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_id(stored.id)

        assert fetched is not None
        assert fetched.id == stored.id
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched back through its file hash."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_hash(stored.file_hash)

        assert fetched is not None
        assert fetched.file_hash == stored.file_hash
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched back through its file path."""
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"
        stored = await document_service.create_document(
            test_path, sample_file_bytes, "utf-8"
        )

        fetched = await document_service.get_document_by_filepath(test_path)

        assert fetched is not None
        assert fetched.filepath == test_path
        assert fetched.id == stored.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched together with its content record."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        pair = await document_service.get_document_with_content(stored.id)

        # The service hands back a (document, content) pair.
        assert pair is not None
        document, content = pair
        assert document.id == stored.id
        assert content is not None
        assert content.file_hash == stored.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Looking up a freshly generated, never-stored ObjectId yields None."""
        missing = await document_service.get_document_by_id(ObjectId())

        assert missing is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Looking up a hash that was never stored yields None."""
        missing = await document_service.get_document_by_hash("nonexistent_hash")

        assert missing is None
|
|
|
|
|
|
class TestPaginationAndCounting:
    """Tests covering document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Listing honours skip/limit, and the count reflects every document."""
        mock_magic.return_value = "application/pdf"

        # Five documents; the index suffix makes each payload unique.
        for index in range(5):
            payload = sample_file_bytes + str(index).encode('utf-8')
            await document_service.create_document(
                f"/test/test{index}.pdf", payload, "utf-8"
            )

        page = await document_service.list_documents(skip=1, limit=2)
        assert len(page) == 2

        # The total is unaffected by the pagination arguments used above.
        total_count = await document_service.count_documents()
        assert total_count == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """The count starts at zero and grows with each created document."""
        mock_magic.return_value = "text/plain"

        # Nothing has been stored yet.
        assert await document_service.count_documents() == 0

        # Three documents; the index suffix makes each payload unique.
        for index in range(3):
            payload = sample_file_bytes + str(index).encode('utf-8')
            await document_service.create_document(
                f"/test/test{index}.txt", payload, "utf-8"
            )

        final_count = await document_service.count_documents()
        assert final_count == 3
|
|
|
|
|
|
class TestUpdateAndDelete:
    """Tests covering document update and deletion."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Updating a document's metadata is reflected in the returned document."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        changes = {"metadata": {"page_count": 5}}
        updated = await document_service.update_document(stored.id, changes)

        assert updated is not None
        assert updated.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting the only document for a hash also removes its content."""
        mock_magic.return_value = "application/pdf"
        content_repo = document_service.content_repository
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        # Precondition: the content record exists before deletion.
        content_before = await content_repo.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_before is not None

        was_deleted = await document_service.delete_document(stored.id)
        assert was_deleted is True

        # The document is gone ...
        assert await document_service.get_document_by_id(stored.id) is None

        # ... and so is the content that no other document referenced.
        content_after = await content_repo.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert content_after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Deleting one of two documents sharing a hash keeps the content."""
        mock_magic.return_value = "application/pdf"

        # Two documents at different paths, built from identical bytes.
        doc1 = await document_service.create_document(
            "/test/test1.pdf", sample_file_bytes, "utf-8"
        )
        doc2 = await document_service.create_document(
            "/test/test2.pdf", sample_file_bytes, "utf-8"
        )
        assert doc1.file_hash == doc2.file_hash

        was_deleted = await document_service.delete_document(doc1.id)
        assert was_deleted is True

        # Only the first document disappears.
        assert await document_service.get_document_by_id(doc1.id) is None
        assert await document_service.get_document_by_id(doc2.id) is not None

        # The content is still referenced by doc2, so it must survive.
        shared_content = await document_service.content_repository.find_document_content_by_file_hash(
            doc2.file_hash
        )
        assert shared_content is not None
|
|
|
|
|
|
class TestUtilityMethods:
    """Tests covering the service's utility methods."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """content_exists is False for an unknown hash and True once stored."""
        mock_magic.return_value = "application/pdf"

        # A hash that was never stored is reported as absent.
        unknown_hash = "nonexistent_hash"
        assert await document_service.content_exists(unknown_hash) is False

        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        # The hash of the document just created is reported as present.
        assert await document_service.content_exists(stored.file_hash) is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Updated extracted content is returned and persisted."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document(
            "/test/test.pdf", sample_file_bytes, "utf-8"
        )

        new_content = "Updated extracted content"
        updated = await document_service.update_document_content(
            stored.file_hash, new_content
        )

        # The call itself returns the updated record ...
        assert updated is not None
        assert updated.content == new_content

        # ... and a fresh repository read sees the same content.
        persisted = await document_service.content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert persisted.content == new_content
|
|
|
|
|
|
class TestHashCalculation:
    """Tests covering the file hash calculation helper."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        test_bytes = b"Test content for hashing"

        first_digest = document_service._calculate_file_hash(test_bytes)
        second_digest = document_service._calculate_file_hash(test_bytes)

        assert first_digest == second_digest
        # A SHA256 hex digest is 64 characters long.
        assert len(first_digest) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Distinct payloads hash to distinct digests."""
        digest_a = document_service._calculate_file_hash(b"First content")
        digest_b = document_service._calculate_file_hash(b"Second content")

        assert digest_a != digest_b
|
|
|
|
|
|
class TestFileTypeDetection:
    """Tests covering file type detection from a path."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as FileType.PDF."""
        detected = document_service._detect_file_type("/path/to/document.pdf")
        assert detected == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as FileType.TXT."""
        detected = document_service._detect_file_type("/path/to/document.txt")
        assert detected == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as FileType.DOCX."""
        detected = document_service._detect_file_type("/path/to/document.docx")
        assert detected == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unsupported extension raises ValueError."""
        unsupported_path = "/path/to/document.xyz"
        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unsupported_path)