571 lines
17 KiB
Python
571 lines
17 KiB
Python
"""
|
|
Unit tests for DocumentService using in-memory MongoDB.
|
|
|
|
Tests the orchestration logic with real MongoDB operations
|
|
using mongomock for better integration testing.
|
|
"""
|
|
import os
|
|
from datetime import datetime
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from bson import ObjectId
|
|
from mongomock.mongo_client import MongoClient
|
|
|
|
from app.models.document import FileType
|
|
from app.services.document_service import DocumentService
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def cleanup_test_folder():
    """Remove the ``test_folder`` directory before and after every test.

    The directory is wiped first so files left by an earlier (possibly
    aborted) run cannot influence the test, and wiped again once the test
    has finished so the suite leaves no artifacts on disk.
    """
    import shutil

    # ignore_errors=True: the folder legitimately may not exist yet.
    shutil.rmtree("test_folder", ignore_errors=True)
    yield
    shutil.rmtree("test_folder", ignore_errors=True)
|
|
|
|
|
|
@pytest.fixture
def in_memory_database():
    """Return the ``test_database`` handle of a new mongomock client."""
    mock_client = MongoClient()
    database = mock_client.test_database
    return database
|
|
|
|
|
|
@pytest.fixture
def document_service(in_memory_database):
    """Build a DocumentService on top of the in-memory database.

    File contents are written below the ``test_folder`` directory.

    This is a plain synchronous fixture, so it is registered with
    ``pytest.fixture`` rather than the asyncio-specific decorator.
    """
    return DocumentService(in_memory_database, objects_folder="test_folder")
|
|
|
|
|
|
@pytest.fixture
def sample_file_bytes():
    """Provide the byte payload used as document content in the tests."""
    payload = b"This is a test PDF content"
    return payload
|
|
|
|
|
|
@pytest.fixture
def sample_text_bytes():
    """Provide a byte payload representing the content of a text file."""
    payload = b"This is a test text file content"
    return payload
|
|
|
|
|
|
@pytest.fixture
def sample_file_hash(sample_file_bytes):
    """Return the SHA256 hex digest of the ``sample_file_bytes`` payload.

    The digest is derived from the fixture itself instead of a second copy
    of the literal, so the two can never drift apart.
    """
    import hashlib

    return hashlib.sha256(sample_file_bytes).hexdigest()
|
|
|
|
|
|
def validate_file_saved(document_service, file_hash, file_bytes):
    """Assert that the content for ``file_hash`` is on disk and matches.

    The file is looked up at
    ``<objects_folder>/<first 24 characters of the hash>/<full hash>``.

    Raises:
        AssertionError: if the file is missing or its bytes differ from
            ``file_bytes``.
    """
    shard_name = file_hash[:24]
    shard_dir = os.path.join(document_service.objects_folder, shard_name)
    stored_path = os.path.join(shard_dir, file_hash)

    assert os.path.exists(stored_path)

    with open(stored_path, "rb") as stored_file:
        stored_bytes = stored_file.read()

    assert stored_bytes == file_bytes
|
|
|
|
|
|
class TestCreateDocument:
    """Exercise DocumentService.create_document."""

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A file with unseen content produces a record and a stored file."""
        # Pin the patched clock and the patched magic.from_buffer result.
        mock_magic.return_value = "application/pdf"
        frozen_now = datetime(2025, 1, 1, 10, 30, 0)
        mock_datetime.now.return_value = frozen_now

        created = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        # The returned document carries the expected attributes.
        assert created is not None
        assert created.filename == "test.pdf"
        assert created.filepath == "/test/test.pdf"
        assert created.file_type == FileType.PDF
        assert created.detected_at == frozen_now
        expected_hash = document_service._calculate_file_hash(sample_file_bytes)
        assert created.file_hash == expected_hash

        # The same document can be read back through the repository.
        stored = document_service.document_repository.find_document_by_id(created.id)
        assert stored is not None
        assert stored.id == created.id
        assert stored.filename == created.filename
        assert stored.filepath == created.filepath
        assert stored.file_type == created.file_type
        assert stored.detected_at == frozen_now
        assert stored.file_hash == created.file_hash

        # The raw bytes were written under the objects folder.
        validate_file_saved(document_service, created.file_hash, sample_file_bytes)

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Two paths with identical bytes give two documents with one hash."""
        mock_magic.return_value = "application/pdf"
        mock_datetime.now.return_value = datetime(2025, 1, 1, 10, 30, 0)

        original = document_service.create_document("/test/first.pdf", sample_file_bytes, "utf-8")
        duplicate = document_service.create_document("/test/second.pdf", sample_file_bytes, "utf-8")

        # Same content hash, but distinct names and paths.
        assert original.file_hash == duplicate.file_hash
        assert original.filename != duplicate.filename
        assert original.filepath != duplicate.filepath

    def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """An unsupported extension is rejected with a ValueError."""
        unsupported_path = "/test/test.xyz"

        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service.create_document(unsupported_path, sample_file_bytes, "utf-8")

    def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """An empty file path is rejected with a ValueError."""
        empty_path = ""

        with pytest.raises(ValueError):
            document_service.create_document(empty_path, sample_file_bytes, "utf-8")

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """A zero-length payload is accepted and stored as an empty file."""
        mock_magic.return_value = "text/plain"
        empty_payload = b""

        created = document_service.create_document("/test/empty.txt", empty_payload, "utf-8")

        validate_file_saved(document_service, created.file_hash, empty_payload)
|
|
|
|
|
|
class TestGetMethods:
    """Exercise the DocumentService lookup methods."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_id(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its id."""
        mock_magic.return_value = "application/pdf"
        seeded = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_id(seeded.id)

        assert fetched is not None
        assert fetched.id == seeded.id
        assert fetched.filename == seeded.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_hash(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its file hash."""
        mock_magic.return_value = "application/pdf"
        seeded = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_hash(seeded.file_hash)

        assert fetched is not None
        assert fetched.file_hash == seeded.file_hash
        assert fetched.filename == seeded.filename

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_by_filepath(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """A created document can be fetched again through its file path."""
        mock_magic.return_value = "application/pdf"
        lookup_path = "/test/unique_test.pdf"
        seeded = document_service.create_document(lookup_path, sample_file_bytes, "utf-8")

        fetched = document_service.get_document_by_filepath(lookup_path)

        assert fetched is not None
        assert fetched.filepath == lookup_path
        assert fetched.id == seeded.id

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_get_document_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """The bytes stored for a document can be read back by hash."""
        mock_magic.return_value = "application/pdf"
        seeded = document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        content = document_service.get_document_content_by_hash(seeded.file_hash)

        assert content == sample_file_bytes

    def test_i_cannot_get_nonexistent_document_by_id(
        self,
        document_service
    ):
        """Looking up a freshly generated ObjectId yields None."""
        unknown_id = ObjectId()

        assert document_service.get_document_by_id(unknown_id) is None

    def test_i_cannot_get_nonexistent_document_by_hash(
        self,
        document_service
    ):
        """Looking up a hash that was never stored yields None."""
        unknown_hash = "nonexistent_hash"

        assert document_service.get_document_by_hash(unknown_hash) is None
|
|
|
|
|
|
class TestPaginationAndCounting:
    """Exercise document listing and counting."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_list_documents_with_pagination(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """skip/limit bound the returned page while the total is unaffected."""
        mock_magic.return_value = "application/pdf"

        # Seed five documents; the numeric suffix keeps every payload unique.
        for i in range(5):
            unique_payload = sample_file_bytes + str(i).encode('utf-8')
            document_service.create_document(f"/test/test{i}.pdf", unique_payload, "utf-8")

        page = document_service.list_documents(skip=1, limit=2)
        assert len(page) == 2

        # Counting ignores the pagination window.
        assert document_service.count_documents() == 5

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_count_documents(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """The count starts at zero and follows the number of documents."""
        mock_magic.return_value = "text/plain"

        # Nothing has been stored yet.
        assert document_service.count_documents() == 0

        for i in range(3):
            unique_payload = sample_file_bytes + str(i).encode('utf-8')
            document_service.create_document(f"/test/test{i}.txt", unique_payload, "utf-8")

        assert document_service.count_documents() == 3
|
|
|
|
|
|
class TestUpdateAndDelete:
    """Tests for document update and deletion operations."""

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_update_document_metadata(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test updating document metadata leaves the other fields untouched."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"metadata": {"page_count": 5}}
        result = document_service.update_document(created_doc.id, update_data)

        # Verify
        assert result is not None
        assert result.metadata.get("page_count") == 5
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash == created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == update_data['metadata']

    def test_i_can_update_document_content(
        self,
        document_service,
        sample_file_bytes
    ):
        """Test that new file bytes change the hash and are written to disk."""
        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Execute update
        update_data = {"file_bytes": b"this is an updated file content"}
        result = document_service.update_document(created_doc.id, update_data)

        # Only the content hash is expected to change
        assert result.filename == created_doc.filename
        assert result.filepath == created_doc.filepath
        assert result.file_hash != created_doc.file_hash
        assert result.file_type == created_doc.file_type
        assert result.metadata == created_doc.metadata

        # Verify file is saved to disk
        validate_file_saved(document_service, result.file_hash, b"this is an updated file content")

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_and_orphaned_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document with orphaned content cleanup."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create a document
        created_doc = document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify content exists
        validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)

        # Execute deletion
        result = document_service.delete_document(created_doc.id)

        # Verify document and content are deleted
        assert result is True

        deleted_doc = document_service.get_document_by_id(created_doc.id)
        assert deleted_doc is None

        # Validate content is deleted. The path must be built from the FULL
        # hash (same layout validate_file_saved checks); truncating the hash
        # first would point at a file that never exists and make this
        # assertion pass unconditionally.
        file_hash = created_doc.file_hash
        target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
        assert not os.path.exists(target_file_path)

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_delete_document_without_affecting_shared_content(
        self,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Test deleting document without removing shared content."""
        # Setup
        mock_magic.return_value = "application/pdf"

        # Create two documents with same content
        doc1 = document_service.create_document(
            "/test/test1.pdf",
            sample_file_bytes,
            "utf-8"
        )

        doc2 = document_service.create_document(
            "/test/test2.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # They should share the same hash
        assert doc1.file_hash == doc2.file_hash

        # Delete first document
        result = document_service.delete_document(doc1.id)
        assert result is True

        # Verify first document is deleted but content still exists
        deleted_doc = document_service.get_document_by_id(doc1.id)
        assert deleted_doc is None

        remaining_doc = document_service.get_document_by_id(doc2.id)
        assert remaining_doc is not None

        validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)
|
|
|
|
|
|
class TestHashCalculation:
    """Exercise the _calculate_file_hash helper of the service."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        payload = b"Test content for hashing"

        first_digest = document_service._calculate_file_hash(payload)
        second_digest = document_service._calculate_file_hash(payload)

        assert first_digest == second_digest
        # A SHA256 hex digest is 64 characters long.
        assert len(first_digest) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Distinct payloads do not share a digest."""
        first_payload = b"First content"
        second_payload = b"Second content"

        first_digest = document_service._calculate_file_hash(first_payload)
        second_digest = document_service._calculate_file_hash(second_payload)

        assert first_digest != second_digest
|
|
|
|
|
|
class TestFileTypeDetection:
    """Exercise the _detect_file_type helper of the service."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as FileType.PDF."""
        pdf_path = "/path/to/document.pdf"
        assert document_service._detect_file_type(pdf_path) == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as FileType.TXT."""
        txt_path = "/path/to/document.txt"
        assert document_service._detect_file_type(txt_path) == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as FileType.DOCX."""
        docx_path = "/path/to/document.docx"
        assert document_service._detect_file_type(docx_path) == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unknown extension raises ValueError."""
        unknown_path = "/path/to/document.xyz"

        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unknown_path)
|