Files
MyDocManager/tests/services/test_document_service.py
2025-09-24 21:53:48 +02:00

571 lines
17 KiB
Python

"""
Unit tests for DocumentService using in-memory MongoDB.
Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import os
from datetime import datetime
from unittest.mock import patch
import pytest
import pytest_asyncio
from bson import ObjectId
from mongomock.mongo_client import MongoClient
from app.models.document import FileType
from app.services.document_service import DocumentService
@pytest.fixture(autouse=True)
def cleanup_test_folder():
    """Remove the on-disk ``test_folder`` before and after every test.

    The folder is wiped before the test so leftovers from an interrupted run
    cannot influence it, and wiped again afterwards so the test run does not
    leave artifacts behind in the working directory.
    """
    import shutil

    # Start from a clean slate; a missing folder is not an error.
    shutil.rmtree("test_folder", ignore_errors=True)
    yield
    # Teardown: drop whatever the test wrote to disk.
    shutil.rmtree("test_folder", ignore_errors=True)
@pytest.fixture
def in_memory_database():
    """Provide the ``test_database`` database of a fresh mongomock client."""
    # A new client per test keeps the data of each test isolated.
    return MongoClient().test_database
@pytest.fixture
def document_service(in_memory_database):
    """Build a ``DocumentService`` on top of the in-memory database.

    Objects are stored under the ``test_folder`` directory.

    The fixture is synchronous (as are all tests in this module), so it is
    declared with the regular ``pytest.fixture`` decorator.
    """
    return DocumentService(in_memory_database, objects_folder="test_folder")
@pytest.fixture
def sample_file_bytes():
    """Return a small byte payload used as the content of a test file."""
    payload = b"This is a test PDF content"
    return payload
@pytest.fixture
def sample_text_bytes():
    """Return a small byte payload used as the content of a text file."""
    payload = b"This is a test text file content"
    return payload
@pytest.fixture
def sample_file_hash():
    """Return the SHA256 hex digest of the sample file payload."""
    import hashlib

    # The literal below must stay identical to the bytes returned by the
    # ``sample_file_bytes`` fixture.
    digest = hashlib.sha256()
    digest.update(b"This is a test PDF content")
    return digest.hexdigest()
def validate_file_saved(document_service, file_hash, file_bytes):
    """Assert that a stored object exists on disk with the expected content.

    The object is looked up at
    ``<objects_folder>/<first 24 chars of file_hash>/<file_hash>``.

    Args:
        document_service: Object exposing an ``objects_folder`` attribute.
        file_hash: Hash identifying the stored object.
        file_bytes: Bytes the stored object is expected to contain.

    Raises:
        AssertionError: If the file is missing or its content differs.
    """
    bucket = file_hash[:24]
    stored_path = os.path.join(document_service.objects_folder, bucket, file_hash)

    assert os.path.exists(stored_path)

    with open(stored_path, "rb") as stored:
        assert stored.read() == file_bytes
class TestCreateDocument:
    """Exercise ``DocumentService.create_document``."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_new_content(
            self, mock_datetime, mock_magic, document_service, sample_file_bytes
    ):
        """A document created from new content is returned, persisted and stored on disk."""
        # Pin the clock and the reported MIME type so the outcome is deterministic.
        frozen_now = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = frozen_now
        mock_magic.return_value = "application/pdf"

        result = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        # The returned document describes the file that was submitted.
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == frozen_now
        expected_hash = document_service._calculate_file_hash(sample_file_bytes)
        assert result.file_hash == expected_hash

        # The same document can be read back from the repository.
        persisted = document_service.document_repository.find_document_by_id(result.id)
        assert persisted is not None
        for attribute in ("id", "filename", "filepath", "file_type", "file_hash"):
            assert getattr(persisted, attribute) == getattr(result, attribute)
        assert persisted.detected_at == frozen_now

        # The raw bytes were written to the objects folder.
        validate_file_saved(document_service, result.file_hash, sample_file_bytes)

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_existing_content(
            self, mock_datetime, mock_magic, document_service, sample_file_bytes
    ):
        """Two documents with identical content share one hash (deduplication)."""
        frozen_now = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = frozen_now
        mock_magic.return_value = "application/pdf"

        # Same bytes submitted under two different paths.
        first_doc = document_service.create_document("/test/first.pdf", sample_file_bytes, "utf-8")
        second_doc = document_service.create_document("/test/second.pdf", sample_file_bytes, "utf-8")

        # Distinct documents, identical content hash.
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

    def test_i_cannot_create_document_with_unsupported_file_type(
            self, document_service, sample_file_bytes
    ):
        """An unsupported extension is rejected with a ``ValueError``."""
        unsupported_path = "/test/test.xyz"

        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service.create_document(unsupported_path, sample_file_bytes, "utf-8")

    def test_i_cannot_create_document_with_empty_file_path(
            self, document_service, sample_file_bytes
    ):
        """An empty file path is rejected with a ``ValueError``."""
        empty_path = ""

        with pytest.raises(ValueError):
            document_service.create_document(empty_path, sample_file_bytes, "utf-8")

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_create_document_with_empty_bytes(self, mock_magic, document_service):
        """A zero-length file can be created and is stored on disk."""
        mock_magic.return_value = "text/plain"
        no_content = b""

        result = document_service.create_document("/test/empty.txt", no_content, "utf-8")

        # Even an empty payload must end up in the objects folder.
        validate_file_saved(document_service, result.file_hash, b"")
class TestGetMethods:
    """Exercise the document lookup methods of ``DocumentService``."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_id(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched again through its id."""
        mock_magic.return_value = "application/pdf"
        created_doc = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_id(created_doc.id)

        assert fetched is not None
        assert fetched.id == created_doc.id
        assert fetched.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_hash(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched again through its file hash."""
        mock_magic.return_value = "application/pdf"
        created_doc = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_hash(created_doc.file_hash)

        assert fetched is not None
        assert fetched.file_hash == created_doc.file_hash
        assert fetched.filename == created_doc.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_filepath(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched again through its file path."""
        mock_magic.return_value = "application/pdf"
        test_path = "/test/unique_test.pdf"
        created_doc = document_service.create_document(test_path, sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_filepath(test_path)

        assert fetched is not None
        assert fetched.filepath == test_path
        assert fetched.id == created_doc.id

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_content(self, mock_magic, document_service, sample_file_bytes):
        """The bytes of a created document can be read back through its hash."""
        mock_magic.return_value = "application/pdf"
        created_doc = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        content = document_service.get_document_content_by_hash(created_doc.file_hash)

        assert content == sample_file_bytes

    def test_i_cannot_get_nonexistent_document_by_id(self, document_service):
        """Looking up an id that was never stored yields ``None``."""
        # A freshly generated ObjectId does not match any stored document.
        unknown_id = ObjectId()

        assert document_service.get_document_by_id(unknown_id) is None

    def test_i_cannot_get_nonexistent_document_by_hash(self, document_service):
        """Looking up a hash that was never stored yields ``None``."""
        missing = document_service.get_document_by_hash("nonexistent_hash")

        assert missing is None
class TestPaginationAndCounting:
    """Exercise ``list_documents`` and ``count_documents``."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_list_documents_with_pagination(
            self, mock_magic, document_service, sample_file_bytes
    ):
        """``skip``/``limit`` bound the returned page while the total count is unaffected."""
        mock_magic.return_value = "application/pdf"

        # Five documents; the index suffix makes every payload distinct.
        for i in range(5):
            unique_bytes = sample_file_bytes + str(i).encode('utf-8')
            document_service.create_document(f"/test/test{i}.pdf", unique_bytes, "utf-8")

        page = document_service.list_documents(skip=1, limit=2)
        assert len(page) == 2

        # Pagination must not change the overall number of documents.
        assert document_service.count_documents() == 5

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_count_documents(self, mock_magic, document_service, sample_file_bytes):
        """The count starts at zero and follows the number of created documents."""
        mock_magic.return_value = "text/plain"

        # Nothing has been created yet.
        assert document_service.count_documents() == 0

        for i in range(3):
            unique_bytes = sample_file_bytes + str(i).encode('utf-8')
            document_service.create_document(f"/test/test{i}.txt", unique_bytes, "utf-8")

        assert document_service.count_documents() == 3
class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_update_document_metadata(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Test that updating metadata leaves every other field untouched."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"metadata": {"page_count": 5}}
        result = document_service.update_document(created_doc.id, update_data)

        # Verify
        assert result is not None
        assert result.metadata.get("page_count") == 5
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash == created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == update_data['metadata']

    def test_i_can_update_document_content(
            self,
            document_service,
            sample_file_bytes
    ):
        """Test that updating the bytes changes the hash and stores the new content."""
        # NOTE(review): unlike the sibling tests, ``magic.from_buffer`` is not
        # patched here - confirm this is intentional.

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"file_bytes": b"this is an updated file content"}
        result = document_service.update_document(created_doc.id, update_data)

        # Only the hash is expected to change
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash != created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == created_doc.metadata

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, b"this is an updated file content")

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_and_orphaned_content(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Test deleting document with orphaned content cleanup."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify content exists
        validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)

        # Execute deletion
        result = document_service.delete_document(created_doc.id)

        # Verify document and content are deleted
        assert result is True
        deleted_doc = document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        # Validate content is deleted. The path must be built from the FULL
        # hash (<objects_folder>/<hash[:24]>/<hash>); truncating the hash first
        # would check a path that never exists and make this assertion vacuous.
        file_hash = created_doc.file_hash
        target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
        assert not os.path.exists(target_file_path)

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_without_affecting_shared_content(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Test deleting document without removing shared content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create two documents with same content
        doc1 = document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )
        doc2 = document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # They should share the same hash
        assert doc1.file_hash == doc2.file_hash

        # Delete first document
        result = document_service.delete_document(doc1.id)
        assert result is True

        # Verify first document is deleted but content still exists
        deleted_doc = document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None
        remaining_doc = document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None
        validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)
class TestHashCalculation:
    """Exercise the ``_calculate_file_hash`` helper."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        payload = b"Test content for hashing"

        first_digest = document_service._calculate_file_hash(payload)
        second_digest = document_service._calculate_file_hash(payload)

        assert first_digest == second_digest
        # A SHA256 hex digest is 64 characters long.
        assert len(first_digest) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Two different payloads do not produce the same digest."""
        digest_of_first = document_service._calculate_file_hash(b"First content")
        digest_of_second = document_service._calculate_file_hash(b"Second content")

        assert digest_of_first != digest_of_second
class TestFileTypeDetection:
    """Exercise the ``_detect_file_type`` helper."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as ``FileType.PDF``."""
        assert document_service._detect_file_type("/path/to/document.pdf") == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as ``FileType.TXT``."""
        assert document_service._detect_file_type("/path/to/document.txt") == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as ``FileType.DOCX``."""
        assert document_service._detect_file_type("/path/to/document.docx") == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unknown extension raises ``ValueError``."""
        unsupported_path = "/path/to/document.xyz"

        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unsupported_path)