Refactored DocumentService to save document in the filesystem. Fixed docker application

This commit is contained in:
2025-09-20 21:06:27 +02:00
parent f1b551d243
commit 9564cfadd5
28 changed files with 1577 additions and 2442 deletions

View File

0
tests/models/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,672 @@
"""
Test suite for FileDocumentRepository with async/await support.
This module contains comprehensive tests for all FileDocumentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""
import pytest
from datetime import datetime
from typing import Dict, Any
import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from mongomock_motor import AsyncMongoMockClient
from app.database.repositories.document_repository import (
FileDocumentRepository,
MatchMethodBase,
SubsequenceMatching,
FuzzyMatching
)
from app.models.document import FileDocument, FileType, ExtractionMethod
@pytest_asyncio.fixture
async def in_memory_repository():
"""Create an in-memory FileDocumentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
await repo.initialize()
return repo
@pytest.fixture
def sample_file_document():
"""Sample FileDocument data for testing."""
return FileDocument(
filename="sample_document.pdf",
filepath="/home/user/documents/sample_document.pdf",
file_type=FileType.PDF,
extraction_method=ExtractionMethod.OCR,
metadata={"pages": 5, "language": "en", "author": "John Doe"},
detected_at=datetime.now(),
file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
encoding="utf-8",
file_size=1024000,
mime_type="application/pdf"
)
@pytest.fixture
def sample_update_data():
"""Sample update data for testing."""
return {
"extraction_method": ExtractionMethod.HYBRID,
"metadata": {"pages": 10, "language": "fr", "updated": True},
"file_size": 2048000
}
@pytest.fixture
def multiple_sample_files():
"""Multiple FileDocument objects for list/search testing."""
base_time = datetime.now()
return [
FileDocument(
filename="first_doc.txt",
filepath="/docs/first_doc.txt",
file_type=FileType.TXT,
extraction_method=ExtractionMethod.DIRECT_TEXT,
metadata={"words": 500},
detected_at=base_time,
file_hash="hash1" + "0" * 58,
encoding="utf-8",
file_size=5000,
mime_type="text/plain"
),
FileDocument(
filename="second_document.pdf",
filepath="/docs/second_document.pdf",
file_type=FileType.PDF,
extraction_method=ExtractionMethod.OCR,
metadata={"pages": 8},
detected_at=base_time,
file_hash="hash2" + "0" * 58,
encoding="utf-8",
file_size=10000,
mime_type="application/pdf"
),
FileDocument(
filename="third_file.docx",
filepath="/docs/third_file.docx",
file_type=FileType.DOCX,
extraction_method=ExtractionMethod.HYBRID,
metadata={"paragraphs": 15},
detected_at=base_time,
file_hash="hash3" + "0" * 58,
encoding="utf-8",
file_size=15000,
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
]
class TestFileDocumentRepositoryInitialization:
"""Tests for repository initialization."""
@pytest.mark.asyncio
async def test_i_can_initialize_repository(self):
"""Test repository initialization."""
# Arrange
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
await repo.initialize()
# Act & Assert (should not raise any exception)
assert repo.db is not None
assert repo.collection is not None
# TODO : check that the indexes are created
class TestFileDocumentRepositoryCreation:
"""Tests for file document creation functionality."""
@pytest.mark.asyncio
async def test_i_can_create_file_document(self, in_memory_repository, sample_file_document):
"""Test successful file document creation."""
# Act
created_file = await in_memory_repository.create_document(sample_file_document)
# Assert
assert created_file is not None
assert created_file.filename == sample_file_document.filename
assert created_file.filepath == sample_file_document.filepath
assert created_file.file_type == sample_file_document.file_type
assert created_file.extraction_method == sample_file_document.extraction_method
assert created_file.metadata == sample_file_document.metadata
assert created_file.file_hash == sample_file_document.file_hash
assert created_file.file_size == sample_file_document.file_size
assert created_file.mime_type == sample_file_document.mime_type
assert created_file.id is not None
assert isinstance(created_file.id, ObjectId)
@pytest.mark.asyncio
async def test_i_can_create_file_document_without_id(self, in_memory_repository, sample_file_document):
"""Test creating file document with _id set to None (should be removed)."""
# Arrange
sample_file_document.id = None
# Act
created_file = await in_memory_repository.create_document(sample_file_document)
# Assert
assert created_file is not None
assert created_file.id is not None
assert isinstance(created_file.id, ObjectId)
@pytest.mark.asyncio
async def test_i_cannot_create_duplicate_file_document(self, in_memory_repository, sample_file_document):
"""Test that creating file document with duplicate filepath raises DuplicateKeyError."""
# Arrange
await in_memory_repository.create_document(sample_file_document)
duplicate_file = FileDocument(
filename="different_name.pdf",
filepath=sample_file_document.filepath, # Same filepath
file_type=FileType.PDF,
extraction_method=ExtractionMethod.OCR,
metadata={"different": "metadata"},
detected_at=datetime.now(),
file_hash="different_hash_123456789012345678901234567890123456789012345678",
encoding="utf-8",
file_size=2000,
mime_type="application/pdf"
)
# Act & Assert
with pytest.raises(DuplicateKeyError) as exc_info:
await in_memory_repository.create_document(duplicate_file)
assert "already exists" in str(exc_info.value)
@pytest.mark.asyncio
async def test_i_cannot_create_file_document_with_pymongo_error(self, in_memory_repository,
sample_file_document, mocker):
"""Test handling of PyMongo errors during file document creation."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))
# Act & Assert
with pytest.raises(ValueError) as exc_info:
await in_memory_repository.create_document(sample_file_document)
assert "Failed to create file document" in str(exc_info.value)
class TestFileDocumentRepositoryFinding:
"""Tests for file document finding functionality."""
@pytest.mark.asyncio
async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
"""Test finding file document by valid ObjectId."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
# Act
found_file = await in_memory_repository.find_document_by_id(str(created_file.id))
# Assert
assert found_file is not None
assert found_file.id == created_file.id
assert found_file.filename == created_file.filename
assert found_file.filepath == created_file.filepath
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
"""Test that invalid ObjectId returns None."""
# Act
found_file = await in_memory_repository.find_document_by_id("invalid_id")
# Assert
assert found_file is None
@pytest.mark.asyncio
async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
"""Test that nonexistent but valid ObjectId returns None."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
found_file = await in_memory_repository.find_document_by_id(nonexistent_id)
# Assert
assert found_file is None
@pytest.mark.asyncio
async def test_i_can_find_document_by_file_hash(self, in_memory_repository, sample_file_document):
"""Test finding file document by file hash."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
# Act
found_file = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)
# Assert
assert found_file is not None
assert found_file.file_hash == created_file.file_hash
assert found_file.id == created_file.id
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_nonexistent_file_hash(self, in_memory_repository):
"""Test that nonexistent file hash returns None."""
# Act
found_file = await in_memory_repository.find_document_by_hash("nonexistent_hash")
# Assert
assert found_file is None
@pytest.mark.asyncio
async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
"""Test finding file document by filepath."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
# Act
found_file = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)
# Assert
assert found_file is not None
assert found_file.filepath == created_file.filepath
assert found_file.id == created_file.id
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
"""Test that nonexistent filepath returns None."""
# Act
found_file = await in_memory_repository.find_document_by_filepath("/nonexistent/path/file.pdf")
# Assert
assert found_file is None
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during file document finding."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'find_one', side_effect=PyMongoError("Database error"))
# Act
found_file = await in_memory_repository.find_document_by_hash("test_hash")
# Assert
assert found_file is None
class TestFileDocumentRepositoryNameMatching:
"""Tests for file document name matching functionality."""
@pytest.mark.asyncio
async def test_i_can_find_documents_by_name_with_fuzzy_matching(self, in_memory_repository, multiple_sample_files):
"""Test finding file documents by filename using fuzzy matching."""
# Arrange
for file_doc in multiple_sample_files:
await in_memory_repository.create_document(file_doc)
# Act
fuzzy_method = FuzzyMatching(threshold=0.5)
found_files = await in_memory_repository.find_document_by_name("document", fuzzy_method)
# Assert
assert len(found_files) >= 1
assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
# Should find files with "document" in the name
found_filenames = [f.filename for f in found_files]
assert any("document" in fname.lower() for fname in found_filenames)
@pytest.mark.asyncio
async def test_i_can_find_documents_by_name_with_subsequence_matching(self, in_memory_repository,
multiple_sample_files):
"""Test finding file documents by filename using subsequence matching."""
# Arrange
for file_doc in multiple_sample_files:
await in_memory_repository.create_document(file_doc)
# Act
subsequence_method = SubsequenceMatching()
found_files = await in_memory_repository.find_document_by_name("doc", subsequence_method)
# Assert
assert len(found_files) >= 1
assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
@pytest.mark.asyncio
async def test_i_can_find_documents_by_name_with_default_method(self, in_memory_repository, multiple_sample_files):
"""Test finding file documents by filename with default matching method."""
# Arrange
for file_doc in multiple_sample_files:
await in_memory_repository.create_document(file_doc)
# Act
found_files = await in_memory_repository.find_document_by_name("first")
# Assert
assert len(found_files) >= 0
assert all(isinstance(file_doc, FileDocument) for file_doc in found_files)
@pytest.mark.asyncio
async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during document name matching."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
# Act
found_files = await in_memory_repository.find_document_by_name("test")
# Assert
assert found_files == []
class TestFileDocumentRepositoryListing:
"""Tests for file document listing functionality."""
@pytest.mark.asyncio
async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_files):
"""Test listing file documents with default pagination."""
# Arrange
for file_doc in multiple_sample_files:
await in_memory_repository.create_document(file_doc)
# Act
files = await in_memory_repository.list_documents()
# Assert
assert len(files) == len(multiple_sample_files)
assert all(isinstance(file_doc, FileDocument) for file_doc in files)
@pytest.mark.asyncio
async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_files):
"""Test listing file documents with custom pagination."""
# Arrange
for file_doc in multiple_sample_files:
await in_memory_repository.create_document(file_doc)
# Act
files_page1 = await in_memory_repository.list_documents(skip=0, limit=2)
files_page2 = await in_memory_repository.list_documents(skip=2, limit=2)
# Assert
assert len(files_page1) == 2
assert len(files_page2) == 1 # Only 3 total files
# Ensure no overlap between pages
page1_ids = [file_doc.id for file_doc in files_page1]
page2_ids = [file_doc.id for file_doc in files_page2]
assert len(set(page1_ids).intersection(set(page2_ids))) == 0
@pytest.mark.asyncio
async def test_i_can_list_documents_sorted_by_detected_at(self, in_memory_repository, sample_file_document):
"""Test that file documents are sorted by detected_at in descending order."""
# Arrange
file1 = sample_file_document.model_copy()
file1.filepath = "/docs/file1.pdf"
file1.filename = "file1.pdf"
file1.file_hash = "hash1" + "0" * 58
file1.detected_at = datetime(2024, 1, 1, 10, 0, 0)
file2 = sample_file_document.model_copy()
file2.filepath = "/docs/file2.pdf"
file2.filename = "file2.pdf"
file2.file_hash = "hash2" + "0" * 58
file2.detected_at = datetime(2024, 1, 2, 10, 0, 0) # Later date
created_file1 = await in_memory_repository.create_document(file1)
created_file2 = await in_memory_repository.create_document(file2)
# Act
files = await in_memory_repository.list_documents()
# Assert
assert len(files) == 2
# Most recent (latest detected_at) should be first
assert files[0].id == created_file2.id
assert files[1].id == created_file1.id
@pytest.mark.asyncio
async def test_i_can_list_empty_documents(self, in_memory_repository):
"""Test listing file documents from empty collection."""
# Act
files = await in_memory_repository.list_documents()
# Assert
assert files == []
@pytest.mark.asyncio
async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during file document listing."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
# Act
files = await in_memory_repository.list_documents()
# Assert
assert files == []
class TestFileDocumentRepositoryUpdate:
"""Tests for file document update functionality."""
@pytest.mark.asyncio
async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
sample_update_data):
"""Test successful file document update."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
# Act
updated_file = await in_memory_repository.update_document(str(created_file.id), sample_update_data)
# Assert
assert updated_file is not None
assert updated_file.extraction_method == sample_update_data["extraction_method"]
assert updated_file.metadata == sample_update_data["metadata"]
assert updated_file.file_size == sample_update_data["file_size"]
assert updated_file.id == created_file.id
assert updated_file.filename == created_file.filename # Unchanged fields remain
assert updated_file.filepath == created_file.filepath
@pytest.mark.asyncio
async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
"""Test updating file document with partial data."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
partial_update = {"file_size": 999999}
# Act
updated_file = await in_memory_repository.update_document(str(created_file.id), partial_update)
# Assert
assert updated_file is not None
assert updated_file.file_size == 999999
assert updated_file.filename == created_file.filename # Should remain unchanged
assert updated_file.metadata == created_file.metadata # Should remain unchanged
@pytest.mark.asyncio
async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
"""Test that None values are filtered out from update data."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
update_with_none = {"file_size": 777777, "metadata": None}
# Act
updated_file = await in_memory_repository.update_document(str(created_file.id), update_with_none)
# Assert
assert updated_file is not None
assert updated_file.file_size == 777777
assert updated_file.metadata == created_file.metadata # Should remain unchanged (None filtered out)
@pytest.mark.asyncio
async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
"""Test updating file document with empty data returns current document."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
empty_update = {}
# Act
result = await in_memory_repository.update_document(str(created_file.id), empty_update)
# Assert
assert result is not None
assert result.filename == created_file.filename
assert result.filepath == created_file.filepath
assert result.metadata == created_file.metadata
@pytest.mark.asyncio
async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
"""Test that updating with invalid ID returns None."""
# Act
result = await in_memory_repository.update_document("invalid_id", sample_update_data)
# Assert
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
"""Test that updating nonexistent file document returns None."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)
# Assert
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
sample_update_data, mocker):
"""Test handling of PyMongo errors during file document update."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
side_effect=PyMongoError("Database error"))
# Act
result = await in_memory_repository.update_document(str(created_file.id), sample_update_data)
# Assert
assert result is None
class TestFileDocumentRepositoryDeletion:
"""Tests for file document deletion functionality."""
@pytest.mark.asyncio
async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
"""Test successful file document deletion."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
# Act
deletion_result = await in_memory_repository.delete_document(str(created_file.id))
# Assert
assert deletion_result is True
# Verify document is actually deleted
found_file = await in_memory_repository.find_document_by_id(str(created_file.id))
assert found_file is None
@pytest.mark.asyncio
async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
"""Test that deleting with invalid ID returns False."""
# Act
result = await in_memory_repository.delete_document("invalid_id")
# Assert
assert result is False
@pytest.mark.asyncio
async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
"""Test that deleting nonexistent file document returns False."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
result = await in_memory_repository.delete_document(nonexistent_id)
# Assert
assert result is False
@pytest.mark.asyncio
async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
"""Test handling of PyMongo errors during file document deletion."""
# Arrange
created_file = await in_memory_repository.create_document(sample_file_document)
mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))
# Act
result = await in_memory_repository.delete_document(str(created_file.id))
# Assert
assert result is False
class TestFileDocumentRepositoryUtilities:
"""Tests for utility methods."""
@pytest.mark.asyncio
async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
"""Test counting file documents."""
# Arrange
initial_count = await in_memory_repository.count_documents()
await in_memory_repository.create_document(sample_file_document)
# Act
final_count = await in_memory_repository.count_documents()
# Assert
assert final_count == initial_count + 1
@pytest.mark.asyncio
async def test_i_can_count_zero_documents(self, in_memory_repository):
"""Test counting file documents in empty collection."""
# Act
count = await in_memory_repository.count_documents()
# Assert
assert count == 0
@pytest.mark.asyncio
async def test_i_cannot_count_documents_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during file document counting."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'count_documents', side_effect=PyMongoError("Database error"))
# Act
count = await in_memory_repository.count_documents()
# Assert
assert count == 0
class TestMatchingMethods:
"""Tests for matching method classes."""
def test_i_can_create_fuzzy_matching_with_default_threshold(self):
"""Test creating FuzzyMatching with default threshold."""
# Act
fuzzy = FuzzyMatching()
# Assert
assert fuzzy.threshold == 0.6
def test_i_can_create_fuzzy_matching_with_custom_threshold(self):
"""Test creating FuzzyMatching with custom threshold."""
# Act
fuzzy = FuzzyMatching(threshold=0.8)
# Assert
assert fuzzy.threshold == 0.8
def test_i_can_create_subsequence_matching(self):
"""Test creating SubsequenceMatching."""
# Act
subsequence = SubsequenceMatching()
# Assert
assert isinstance(subsequence, MatchMethodBase)
assert isinstance(subsequence, SubsequenceMatching)

View File

View File

@@ -0,0 +1,587 @@
"""
Unit tests for DocumentService using in-memory MongoDB.
Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import os
from datetime import datetime
from unittest.mock import patch
import pytest
import pytest_asyncio
from bson import ObjectId
from mongomock_motor import AsyncMongoMockClient
from app.models.document import FileType
from app.services.document_service import DocumentService
@pytest.fixture(autouse=True)
def cleanup_test_folder():
"""Clean up test folder."""
import shutil
shutil.rmtree("test_folder", ignore_errors=True)
@pytest_asyncio.fixture
async def in_memory_database():
"""Create an in-memory database for testing."""
client = AsyncMongoMockClient()
return client.test_database
@pytest_asyncio.fixture
async def document_service(in_memory_database):
"""Create DocumentService with in-memory repositories."""
service = DocumentService(in_memory_database, objects_folder="test_folder")
return service
@pytest.fixture
def sample_file_bytes():
"""Sample file content as bytes."""
return b"This is a test PDF content"
@pytest.fixture
def sample_text_bytes():
"""Sample text file content as bytes."""
return b"This is a test text file content"
@pytest.fixture
def sample_file_hash():
"""Expected SHA256 hash for sample file bytes."""
import hashlib
return hashlib.sha256(b"This is a test PDF content").hexdigest()
def validate_file_saved(document_service, file_hash, file_bytes):
# Verify file is saved to disk
target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
assert os.path.exists(target_file_path)
with open(target_file_path, "rb") as f:
content = f.read()
assert content == file_bytes
class TestCreateDocument:
"""Tests for create_document method."""
@patch('app.services.document_service.magic.from_buffer')
@patch('app.services.document_service.datetime')
@pytest.mark.asyncio
async def test_i_can_create_document_with_new_content(
self,
mock_datetime,
mock_magic,
document_service,
sample_file_bytes
):
"""Test creating document when content doesn't exist yet."""
# Setup mocks
fixed_time = datetime(2025, 1, 1, 10, 30, 0)
mock_datetime.now.return_value = fixed_time
mock_magic.return_value = "application/pdf"
# Execute
result = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Verify document creation
assert result is not None
assert result.filename == "test.pdf"
assert result.filepath == "/test/test.pdf"
assert result.file_type == FileType.PDF
assert result.detected_at == fixed_time
assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)
# Verify document created in database
doc_in_db = await document_service.document_repository.find_document_by_id(result.id)
assert doc_in_db is not None
assert doc_in_db.id == result.id
assert doc_in_db.filename == result.filename
assert doc_in_db.filepath == result.filepath
assert doc_in_db.file_type == result.file_type
assert doc_in_db.detected_at == fixed_time
assert doc_in_db.file_hash == result.file_hash
# Verify file is saved to disk
validate_file_saved(document_service, result.file_hash, sample_file_bytes)
@patch('app.services.document_service.magic.from_buffer')
@patch('app.services.document_service.datetime')
@pytest.mark.asyncio
async def test_i_can_create_document_with_existing_content(
self,
mock_datetime,
mock_magic,
document_service,
sample_file_bytes
):
"""Test creating document when content already exists (deduplication)."""
# Setup mocks
fixed_time = datetime(2025, 1, 1, 10, 30, 0)
mock_datetime.now.return_value = fixed_time
mock_magic.return_value = "application/pdf"
# Create first document
first_doc = await document_service.create_document(
"/test/first.pdf",
sample_file_bytes,
"utf-8"
)
# Create second document with same content
second_doc = await document_service.create_document(
"/test/second.pdf",
sample_file_bytes,
"utf-8"
)
# Verify both documents exist but share same hash
assert first_doc.file_hash == second_doc.file_hash
assert first_doc.filename != second_doc.filename
assert first_doc.filepath != second_doc.filepath
@pytest.mark.asyncio
async def test_i_cannot_create_document_with_unsupported_file_type(
self,
document_service,
sample_file_bytes
):
"""Test that unsupported file types raise ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"):
await document_service.create_document(
"/test/test.xyz", # Unsupported extension
sample_file_bytes,
"utf-8"
)
@pytest.mark.asyncio
async def test_i_cannot_create_document_with_empty_file_path(
self,
document_service,
sample_file_bytes
):
"""Test that empty file path raises ValueError."""
with pytest.raises(ValueError):
await document_service.create_document(
"", # Empty path
sample_file_bytes,
"utf-8"
)
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_create_document_with_empty_bytes(
self,
mock_magic,
document_service
):
"""Test behavior with empty file bytes."""
# Setup
mock_magic.return_value = "text/plain"
# Execute with empty bytes
result = await document_service.create_document(
"/test/empty.txt",
b"", # Empty bytes
"utf-8"
)
# Verify file is saved to disk
validate_file_saved(document_service, result.file_hash, b"")
class TestGetMethods:
"""Tests for document retrieval methods."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_id(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by ID."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_id(created_doc.id)
# Verify
assert result is not None
assert result.id == created_doc.id
assert result.filename == created_doc.filename
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_hash(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by file hash."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_hash(created_doc.file_hash)
# Verify
assert result is not None
assert result.file_hash == created_doc.file_hash
assert result.filename == created_doc.filename
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_filepath(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by file path."""
# Setup
mock_magic.return_value = "application/pdf"
test_path = "/test/unique_test.pdf"
# Create a document first
created_doc = await document_service.create_document(
test_path,
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_filepath(test_path)
# Verify
assert result is not None
assert result.filepath == test_path
assert result.id == created_doc.id
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document with associated content."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_content_by_hash(created_doc.file_hash)
# Verify
assert result == sample_file_bytes
@pytest.mark.asyncio
async def test_i_cannot_get_nonexistent_document_by_id(
self,
document_service
):
"""Test that nonexistent document returns None."""
# Execute with random ObjectId
result = await document_service.get_document_by_id(ObjectId())
# Verify
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_get_nonexistent_document_by_hash(
self,
document_service
):
"""Test that nonexistent document hash returns None."""
# Execute
result = await document_service.get_document_by_hash("nonexistent_hash")
# Verify
assert result is None
class TestPaginationAndCounting:
"""Tests for document listing and counting."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_list_documents_with_pagination(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test document listing with pagination parameters."""
# Setup
mock_magic.return_value = "application/pdf"
# Create multiple documents
for i in range(5):
await document_service.create_document(
f"/test/test{i}.pdf",
sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique
"utf-8"
)
# Execute with pagination
result = await document_service.list_documents(skip=1, limit=2)
# Verify
assert len(result) == 2
# Test counting
total_count = await document_service.count_documents()
assert total_count == 5
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_count_documents(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test document counting."""
# Setup
mock_magic.return_value = "text/plain"
# Initially should be 0
initial_count = await document_service.count_documents()
assert initial_count == 0
# Create some documents
for i in range(3):
await document_service.create_document(
f"/test/test{i}.txt",
sample_file_bytes + bytes(str(i), 'utf-8'),
"utf-8"
)
# Execute
final_count = await document_service.count_documents()
# Verify
assert final_count == 3
class TestUpdateAndDelete:
"""Tests for document update and deletion operations."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_update_document_metadata(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test updating document metadata."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute update
update_data = {"metadata": {"page_count": 5}}
result = await document_service.update_document(created_doc.id, update_data)
# Verify
assert result is not None
assert result.metadata.get("page_count") == 5
assert result.filename == created_doc.filename
assert result.filepath == created_doc.filepath
assert result.file_hash == created_doc.file_hash
assert result.file_type == created_doc.file_type
assert result.metadata == update_data['metadata']
@pytest.mark.asyncio
async def test_i_can_update_document_content(
self,
document_service,
sample_file_bytes
):
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute update
update_data = {"file_bytes": b"this is an updated file content"}
result = await document_service.update_document(created_doc.id, update_data)
assert result.filename == created_doc.filename
assert result.filepath == created_doc.filepath
assert result.file_hash != created_doc.file_hash
assert result.file_type == created_doc.file_type
assert result.metadata == created_doc.metadata
# Verify file is saved to disk
validate_file_saved(document_service, result.file_hash, b"this is an updated file content")
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_delete_document_and_orphaned_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test deleting document with orphaned content cleanup."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Verify content exists
validate_file_saved(document_service, created_doc.file_hash, sample_file_bytes)
# Execute deletion
result = await document_service.delete_document(created_doc.id)
# Verify document and content are deleted
assert result is True
deleted_doc = await document_service.get_document_by_id(created_doc.id)
assert deleted_doc is None
# validate content is deleted
file_hash = created_doc.file_hash[:24]
target_file_path = os.path.join(document_service.objects_folder, file_hash[:24], file_hash)
assert not os.path.exists(target_file_path)
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_delete_document_without_affecting_shared_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test deleting document without removing shared content."""
# Setup
mock_magic.return_value = "application/pdf"
# Create two documents with same content
doc1 = await document_service.create_document(
"/test/test1.pdf",
sample_file_bytes,
"utf-8"
)
doc2 = await document_service.create_document(
"/test/test2.pdf",
sample_file_bytes,
"utf-8"
)
# They should share the same hash
assert doc1.file_hash == doc2.file_hash
# Delete first document
result = await document_service.delete_document(doc1.id)
assert result is True
# Verify first document is deleted but content still exists
deleted_doc = await document_service.get_document_by_id(doc1.id)
assert deleted_doc is None
remaining_doc = await document_service.get_document_by_id(doc2.id)
assert remaining_doc is not None
validate_file_saved(document_service, doc2.file_hash, sample_file_bytes)
class TestHashCalculation:
"""Tests for file hash calculation utility."""
def test_i_can_calculate_consistent_file_hash(self, document_service):
"""Test that file hash calculation is consistent."""
test_bytes = b"Test content for hashing"
# Calculate hash multiple times
hash1 = document_service._calculate_file_hash(test_bytes)
hash2 = document_service._calculate_file_hash(test_bytes)
# Should be identical
assert hash1 == hash2
assert len(hash1) == 64 # SHA256 produces 64-character hex string
def test_i_get_different_hashes_for_different_content(self, document_service):
"""Test that different content produces different hashes."""
content1 = b"First content"
content2 = b"Second content"
hash1 = document_service._calculate_file_hash(content1)
hash2 = document_service._calculate_file_hash(content2)
assert hash1 != hash2
class TestFileTypeDetection:
"""Tests for file type detection."""
def test_i_can_detect_pdf_file_type(self, document_service):
"""Test PDF file type detection."""
file_type = document_service._detect_file_type("/path/to/document.pdf")
assert file_type == FileType.PDF
def test_i_can_detect_txt_file_type(self, document_service):
"""Test text file type detection."""
file_type = document_service._detect_file_type("/path/to/document.txt")
assert file_type == FileType.TXT
def test_i_can_detect_docx_file_type(self, document_service):
"""Test DOCX file type detection."""
file_type = document_service._detect_file_type("/path/to/document.docx")
assert file_type == FileType.DOCX
def test_i_cannot_detect_unsupported_file_type(self, document_service):
"""Test unsupported file type raises ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"):
document_service._detect_file_type("/path/to/document.xyz")

View File

@@ -1,187 +0,0 @@
"""
Unit tests for MongoDB database connection module.
Tests the database connection functionality with mocking
to avoid requiring actual MongoDB instance during tests.
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from app.database.connection import (
create_mongodb_client,
get_database,
close_database_connection,
get_mongodb_client,
test_database_connection
)
def test_i_can_get_database_connection():
"""Test successful database connection creation."""
mock_client = Mock()
mock_database = Mock()
# Configure the mock to support dictionary-like access
mock_client.__getitem__ = Mock(return_value=mock_database)
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
result = get_database()
assert result == mock_database
mock_client.admin.command.assert_called_with('ping')
# Verify that __getitem__ was called with the database name
mock_client.__getitem__.assert_called_with("testdb")
def test_i_cannot_connect_to_invalid_mongodb_url():
"""Test fail-fast behavior with invalid MongoDB URL."""
mock_client = Mock()
mock_client.admin.command.side_effect = ConnectionFailure("Connection failed")
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://invalid:27017"):
with pytest.raises(SystemExit) as exc_info:
create_mongodb_client()
assert exc_info.value.code == 1
def test_i_cannot_connect_with_server_selection_timeout():
"""Test fail-fast behavior with server selection timeout."""
mock_client = Mock()
mock_client.admin.command.side_effect = ServerSelectionTimeoutError("Timeout")
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://timeout:27017"):
with pytest.raises(SystemExit) as exc_info:
create_mongodb_client()
assert exc_info.value.code == 1
def test_i_cannot_connect_with_unexpected_error():
"""Test fail-fast behavior with unexpected connection error."""
with patch('app.database.connection.MongoClient', side_effect=Exception("Unexpected error")):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://error:27017"):
with pytest.raises(SystemExit) as exc_info:
create_mongodb_client()
assert exc_info.value.code == 1
def test_i_can_get_database_singleton():
"""Test that get_database returns the same instance (singleton pattern)."""
mock_client = Mock()
mock_database = Mock()
mock_client.__getitem__ = Mock(return_value=mock_database)
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
# First call
db1 = get_database()
# Second call
db2 = get_database()
assert db1 is db2
# MongoClient should be called only once
assert mock_client.admin.command.call_count == 1
def test_i_can_close_database_connection():
"""Test closing database connection."""
mock_client = Mock()
mock_database = Mock()
mock_client.__getitem__ = Mock(return_value=mock_database)
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
# Create connection
get_database()
# Close connection
close_database_connection()
mock_client.close.assert_called_once()
assert app.database.connection._client is None
assert app.database.connection._database is None
def test_i_can_get_mongodb_client():
"""Test getting raw MongoDB client instance."""
mock_client = Mock()
mock_database = Mock()
mock_client.__getitem__ = Mock(return_value=mock_database)
with patch('app.database.connection.MongoClient', return_value=mock_client):
with patch('app.database.connection.get_mongodb_url', return_value="mongodb://localhost:27017"):
with patch('app.database.connection.get_mongodb_database_name', return_value="testdb"):
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
# Create connection first
get_database()
# Get client
result = get_mongodb_client()
assert result == mock_client
def test_i_can_get_none_mongodb_client_when_not_connected():
"""Test getting MongoDB client returns None when not connected."""
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
result = get_mongodb_client()
assert result is None
def test_i_can_test_database_connection_success():
"""Test database connection health check - success case."""
mock_database = Mock()
mock_database.command.return_value = True
with patch('app.database.connection.get_database', return_value=mock_database):
result = test_database_connection()
assert result is True
mock_database.command.assert_called_with('ping')
def test_i_can_close_connection_when_no_client():
"""Test closing connection when no client exists (should not raise error)."""
# Reset global variables
import app.database.connection
app.database.connection._client = None
app.database.connection._database = None
# Should not raise any exception
close_database_connection()
assert app.database.connection._client is None
assert app.database.connection._database is None

View File

@@ -1,311 +0,0 @@
"""
Test suite for DocumentContentRepository with async/await support.
This module contains comprehensive tests for all DocumentContentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""
import pytest
import hashlib
from datetime import datetime
import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError
from mongomock_motor import AsyncMongoMockClient
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import DocumentContent
@pytest_asyncio.fixture
async def in_memory_repository():
"""Create an in-memory DocumentContentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = DocumentContentRepository(db)
await repo.initialize()
return repo
@pytest.fixture
def sample_document_content():
"""Sample DocumentContent data for testing."""
content = "This is sample document content for testing purposes."
file_hash = hashlib.sha256(content.encode()).hexdigest()
return DocumentContent(
file_hash=file_hash,
content=content,
encoding="utf-8",
file_size=len(content.encode()),
mime_type="text/plain"
)
@pytest.fixture
def another_document_content():
"""Another sample DocumentContent data for testing."""
content = "This is another sample document with different content."
file_hash = hashlib.sha256(content.encode()).hexdigest()
return DocumentContent(
file_hash=file_hash,
content=content,
encoding="utf-8",
file_size=len(content.encode()),
mime_type="text/plain"
)
class TestDocumentContentRepositoryCreation:
"""Tests for document content creation functionality."""
@pytest.mark.asyncio
async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content):
"""Test successful document content creation."""
# Act
created_content = await in_memory_repository.create_document_content(sample_document_content)
# Assert
assert created_content is not None
assert created_content.file_hash == sample_document_content.file_hash
assert created_content.content == sample_document_content.content
assert created_content.encoding == sample_document_content.encoding
assert created_content.file_size == sample_document_content.file_size
assert created_content.mime_type == sample_document_content.mime_type
assert created_content.id is not None
@pytest.mark.asyncio
async def test_i_cannot_create_document_content_with_duplicate_file_hash(self, in_memory_repository,
sample_document_content):
"""Test that creating document content with duplicate file_hash raises DuplicateKeyError."""
# Arrange
await in_memory_repository.create_document_content(sample_document_content)
# Act & Assert
with pytest.raises(DuplicateKeyError) as exc_info:
await in_memory_repository.create_document_content(sample_document_content)
assert "already exists" in str(exc_info.value)
class TestDocumentContentRepositoryFinding:
"""Tests for document content finding functionality."""
@pytest.mark.asyncio
async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content):
"""Test finding document content by valid ID."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
# Act
found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id))
# Assert
assert found_content is not None
assert found_content.id == created_content.id
assert found_content.file_hash == created_content.file_hash
assert found_content.content == created_content.content
@pytest.mark.asyncio
async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository):
"""Test that invalid ObjectId returns None."""
# Act
found_content = await in_memory_repository.find_document_content_by_id("invalid_id")
# Assert
assert found_content is None
@pytest.mark.asyncio
async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository):
"""Test that nonexistent but valid ObjectId returns None."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
found_content = await in_memory_repository.find_document_content_by_id(nonexistent_id)
# Assert
assert found_content is None
@pytest.mark.asyncio
async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content):
"""Test finding document content by file hash."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
# Act
found_content = await in_memory_repository.find_document_content_by_file_hash(sample_document_content.file_hash)
# Assert
assert found_content is not None
assert found_content.file_hash == created_content.file_hash
assert found_content.id == created_content.id
@pytest.mark.asyncio
async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository):
"""Test that nonexistent file hash returns None."""
# Act
found_content = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash")
# Assert
assert found_content is None
class TestDocumentContentRepositoryUpdate:
"""Tests for document content update functionality."""
@pytest.mark.asyncio
async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content):
"""Test successful document content update."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
update_data = {
"content": "Updated content for testing",
"encoding": "utf-16",
"mime_type": "text/html"
}
# Act
updated_content = await in_memory_repository.update_document_content(str(created_content.id), update_data)
# Assert
assert updated_content is not None
assert updated_content.content == update_data["content"]
assert updated_content.encoding == update_data["encoding"]
assert updated_content.mime_type == update_data["mime_type"]
assert updated_content.id == created_content.id
assert updated_content.file_hash == created_content.file_hash # Should remain unchanged
@pytest.mark.asyncio
async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository):
"""Test that updating with invalid ID returns None."""
# Act
result = await in_memory_repository.update_document_content("invalid_id", {"content": "test"})
# Assert
assert result is None
@pytest.mark.asyncio
async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content):
"""Test updating document content with partial data."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
partial_update = {"encoding": "iso-8859-1"}
# Act
updated_content = await in_memory_repository.update_document_content(str(created_content.id), partial_update)
# Assert
assert updated_content is not None
assert updated_content.encoding == "iso-8859-1"
assert updated_content.content == created_content.content # Should remain unchanged
assert updated_content.mime_type == created_content.mime_type # Should remain unchanged
@pytest.mark.asyncio
async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content):
"""Test updating document content with empty data returns current content."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
empty_update = {}
# Act
result = await in_memory_repository.update_document_content(str(created_content.id), empty_update)
# Assert
assert result is not None
assert result.content == created_content.content
assert result.encoding == created_content.encoding
assert result.mime_type == created_content.mime_type
class TestDocumentContentRepositoryDeletion:
"""Tests for document content deletion functionality."""
@pytest.mark.asyncio
async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content):
"""Test successful document content deletion."""
# Arrange
created_content = await in_memory_repository.create_document_content(sample_document_content)
# Act
deletion_result = await in_memory_repository.delete_document_content(str(created_content.id))
# Assert
assert deletion_result is True
# Verify content is actually deleted
found_content = await in_memory_repository.find_document_content_by_id(str(created_content.id))
assert found_content is None
@pytest.mark.asyncio
async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository):
"""Test that deleting with invalid ID returns False."""
# Act
result = await in_memory_repository.delete_document_content("invalid_id")
# Assert
assert result is False
@pytest.mark.asyncio
async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository):
"""Test that deleting nonexistent document content returns False."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
result = await in_memory_repository.delete_document_content(nonexistent_id)
# Assert
assert result is False
class TestDocumentContentRepositoryUtilities:
"""Tests for utility methods."""
@pytest.mark.asyncio
async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content):
"""Test checking if document content exists by file hash."""
# Arrange
await in_memory_repository.create_document_content(sample_document_content)
# Act
exists = await in_memory_repository.content_exists(sample_document_content.file_hash)
not_exists = await in_memory_repository.content_exists("nonexistent_hash")
# Assert
assert exists is True
assert not_exists is False
@pytest.mark.asyncio
async def test_i_can_list_document_contents(self, in_memory_repository, sample_document_content,
another_document_content):
"""Test listing document contents with pagination."""
# Arrange
await in_memory_repository.create_document_content(sample_document_content)
await in_memory_repository.create_document_content(another_document_content)
# Act
all_contents = await in_memory_repository.list_document_contents()
limited_contents = await in_memory_repository.list_document_contents(skip=0, limit=1)
# Assert
assert len(all_contents) == 2
assert len(limited_contents) == 1
assert all(isinstance(content, DocumentContent) for content in all_contents)
@pytest.mark.asyncio
async def test_i_can_count_document_contents(self, in_memory_repository, sample_document_content,
another_document_content):
"""Test counting document contents."""
# Arrange
initial_count = await in_memory_repository.count_document_contents()
await in_memory_repository.create_document_content(sample_document_content)
await in_memory_repository.create_document_content(another_document_content)
# Act
final_count = await in_memory_repository.count_document_contents()
# Assert
assert final_count == initial_count + 2

View File

@@ -1,566 +0,0 @@
"""
Test suite for FileDocumentRepository with async/await support.
This module contains comprehensive tests for all FileDocumentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""
import pytest
from datetime import datetime
from typing import Dict, Any
import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from mongomock_motor import AsyncMongoMockClient
from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import FileDocument, FileType
@pytest_asyncio.fixture
async def in_memory_repository():
"""Create an in-memory FileDocumentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
# repo.db = db
# repo.collection = db.files
await repo.initialize()
return repo
@pytest.fixture
def sample_file_document():
"""Sample FileDocument data for testing."""
return FileDocument(
filename="test_document.pdf",
filepath="/path/to/test_document.pdf",
file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
file_type=FileType("pdf"),
detected_at=datetime.now(),
)
@pytest.fixture
def sample_update_data():
"""Sample update data for testing."""
return {
"metadata": {"tags": ["updated", "document"]},
"file_type": FileType("txt"),
}
@pytest.fixture
def multiple_sample_documents():
"""Multiple FileDocument objects for list/search testing."""
base_time = datetime.now()
return [
FileDocument(
filename="document1.pdf",
filepath="/path/to/document1.pdf",
file_hash="hash1" + "0" * 58,
file_type=FileType("pdf"),
detected_at=base_time,
),
FileDocument(
filename="similar_document.pdf",
filepath="/path/to/similar_document.pdf",
file_hash="hash2" + "0" * 58,
file_type=FileType("pdf"),
detected_at=base_time,
),
FileDocument(
filename="completely_different.txt",
filepath="/path/to/completely_different.txt",
file_hash="hash3" + "0" * 58,
file_type=FileType("pdf"),
detected_at=base_time,
)
]
class TestFileDocumentRepositoryInitialization:
"""Tests for repository initialization."""
@pytest.mark.asyncio
async def test_i_can_initialize_repository(self):
"""Test repository initialization."""
# Arrange
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
await repo.initialize()
# Act & Assert (should not raise any exception)
assert repo.db is not None
assert repo.collection is not None
# TODO : check that the indexes are create
class TestFileDocumentRepositoryCreation:
"""Tests for file document creation functionality."""
@pytest.mark.asyncio
async def test_i_can_create_document(self, in_memory_repository, sample_file_document):
"""Test successful file document creation."""
# Act
created_doc = await in_memory_repository.create_document(sample_file_document)
# Assert
assert created_doc is not None
assert created_doc.filename == sample_file_document.filename
assert created_doc.filepath == sample_file_document.filepath
assert created_doc.file_hash == sample_file_document.file_hash
assert created_doc.file_type == sample_file_document.file_type
assert created_doc.id is not None
assert isinstance(created_doc.id, ObjectId)
@pytest.mark.asyncio
async def test_i_can_create_document_without_id(self, in_memory_repository, sample_file_document):
"""Test creating document with _id set to None (should be removed)."""
# Arrange
sample_file_document.id = None
# Act
created_doc = await in_memory_repository.create_document(sample_file_document)
# Assert
assert created_doc is not None
assert created_doc.id is not None
assert isinstance(created_doc.id, ObjectId)
@pytest.mark.asyncio
async def test_i_cannot_create_duplicate_document(self, in_memory_repository, sample_file_document):
"""Test that creating document with duplicate hash raises DuplicateKeyError."""
# Arrange
await in_memory_repository.create_document(sample_file_document)
duplicate_doc = FileDocument(
filename="different_name.pdf",
filepath=sample_file_document.filepath,
file_hash="different_hash" + "0" * 58,
file_type=FileType("pdf"),
detected_at=datetime.now()
)
# Act & Assert
with pytest.raises(DuplicateKeyError) as exc_info:
await in_memory_repository.create_document(duplicate_doc)
assert "already exists" in str(exc_info.value)
@pytest.mark.asyncio
async def test_i_cannot_create_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
"""Test handling of PyMongo errors during document creation."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'insert_one', side_effect=PyMongoError("Database error"))
# Act & Assert
with pytest.raises(ValueError) as exc_info:
await in_memory_repository.create_document(sample_file_document)
assert "Failed to create file document" in str(exc_info.value)
class TestFileDocumentRepositoryFinding:
"""Tests for file document finding functionality."""
@pytest.mark.asyncio
async def test_i_can_find_document_by_valid_id(self, in_memory_repository, sample_file_document):
"""Test finding document by valid ObjectId."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
# Act
found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))
# Assert
assert found_doc is not None
assert found_doc.id == created_doc.id
assert found_doc.filename == created_doc.filename
assert found_doc.file_hash == created_doc.file_hash
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_invalid_id(self, in_memory_repository):
"""Test that invalid ObjectId returns None."""
# Act
found_doc = await in_memory_repository.find_document_by_id("invalid_id")
# Assert
assert found_doc is None
@pytest.mark.asyncio
async def test_i_cannot_find_document_by_nonexistent_id(self, in_memory_repository):
"""Test that nonexistent but valid ObjectId returns None."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
found_doc = await in_memory_repository.find_document_by_id(nonexistent_id)
# Assert
assert found_doc is None
@pytest.mark.asyncio
async def test_i_can_find_document_by_hash(self, in_memory_repository, sample_file_document):
"""Test finding document by file hash."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
# Act
found_doc = await in_memory_repository.find_document_by_hash(sample_file_document.file_hash)
# Assert
assert found_doc is not None
assert found_doc.file_hash == created_doc.file_hash
assert found_doc.id == created_doc.id
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_nonexistent_hash(self, in_memory_repository):
"""Test that nonexistent hash returns None."""
# Act
found_doc = await in_memory_repository.find_document_by_hash("nonexistent_hash")
# Assert
assert found_doc is None
@pytest.mark.asyncio
async def test_i_can_find_document_by_filepath(self, in_memory_repository, sample_file_document):
"""Test finding document by exact filepath."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
# Act
found_doc = await in_memory_repository.find_document_by_filepath(sample_file_document.filepath)
# Assert
assert found_doc is not None
assert found_doc.filepath == created_doc.filepath
assert found_doc.id == created_doc.id
@pytest.mark.asyncio
async def test_i_cannot_find_document_with_nonexistent_filepath(self, in_memory_repository):
"""Test that nonexistent filepath returns None."""
# Act
found_doc = await in_memory_repository.find_document_by_filepath("/nonexistent/path.pdf")
# Assert
assert found_doc is None
class TestFileDocumentRepositoryFuzzySearch:
"""Tests for fuzzy search functionality by filename."""
@pytest.mark.asyncio
async def test_i_can_find_documents_by_exact_name(self, in_memory_repository, multiple_sample_documents):
"""Test finding documents with exact filename match."""
# Arrange
for doc in multiple_sample_documents:
await in_memory_repository.create_document(doc)
# Act
found_docs = await in_memory_repository.find_document_by_name("document1.pdf")
# Assert
assert len(found_docs) == 1
assert found_docs[0].filename == "document1.pdf"
@pytest.mark.asyncio
async def test_i_can_find_documents_by_fuzzy_name(self, in_memory_repository, multiple_sample_documents):
"""Test finding documents with fuzzy matching using default threshold."""
# Arrange
for doc in multiple_sample_documents:
await in_memory_repository.create_document(doc)
# Act
found_docs = await in_memory_repository.find_document_by_name("document")
# Assert
assert len(found_docs) >= 2 # Should find document1.pdf and similar_document.pdf
filenames = [doc.filename for doc in found_docs]
assert "document1.pdf" in filenames
assert "similar_document.pdf" in filenames
@pytest.mark.asyncio
async def test_i_cannot_find_documents_by_name_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during name search."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
# Act
found_docs = await in_memory_repository.find_document_by_name("test")
# Assert
assert found_docs == []
class TestFileDocumentRepositoryListing:
"""Tests for document listing functionality."""
@pytest.mark.asyncio
async def test_i_can_list_documents_with_default_pagination(self, in_memory_repository, multiple_sample_documents):
"""Test listing documents with default pagination."""
# Arrange
for doc in multiple_sample_documents:
await in_memory_repository.create_document(doc)
# Act
docs = await in_memory_repository.list_documents()
# Assert
assert len(docs) == len(multiple_sample_documents)
assert all(isinstance(doc, FileDocument) for doc in docs)
@pytest.mark.asyncio
async def test_i_can_list_documents_with_custom_pagination(self, in_memory_repository, multiple_sample_documents):
"""Test listing documents with custom pagination."""
# Arrange
for doc in multiple_sample_documents:
await in_memory_repository.create_document(doc)
# Act
docs_page1 = await in_memory_repository.list_documents(skip=0, limit=2)
docs_page2 = await in_memory_repository.list_documents(skip=2, limit=2)
# Assert
assert len(docs_page1) == 2
assert len(docs_page2) == 1 # Only 3 total documents
# Ensure no overlap between pages
page1_ids = [doc.id for doc in docs_page1]
page2_ids = [doc.id for doc in docs_page2]
assert len(set(page1_ids).intersection(set(page2_ids))) == 0
@pytest.mark.asyncio
async def test_i_can_list_documents_sorted_by_date(self, in_memory_repository, sample_file_document):
"""Test that documents are sorted by detected_at in descending order."""
# Arrange
from datetime import timedelta
# Create documents with different timestamps
doc1 = sample_file_document.model_copy()
doc1.filename = "oldest.pdf"
doc1.filepath = f"/path/to/{doc1.filename}"
doc1.file_hash = "hash1" + "0" * 58
doc1.detected_at = datetime.now() - timedelta(hours=2)
doc2 = sample_file_document.model_copy()
doc2.filename = "newest.pdf"
doc2.filepath = f"/path/to/{doc2.filename}"
doc2.file_hash = "hash2" + "0" * 58
doc2.detected_at = datetime.now()
await in_memory_repository.create_document(doc1)
await in_memory_repository.create_document(doc2)
# Act
docs = await in_memory_repository.list_documents()
# Assert
assert len(docs) == 2
assert docs[0].filename == "newest.pdf" # Most recent first
assert docs[1].filename == "oldest.pdf"
@pytest.mark.asyncio
async def test_i_can_list_empty_documents(self, in_memory_repository):
"""Test listing documents from empty collection."""
# Act
docs = await in_memory_repository.list_documents()
# Assert
assert docs == []
@pytest.mark.asyncio
async def test_i_cannot_list_documents_with_pymongo_error(self, in_memory_repository, mocker):
"""Test handling of PyMongo errors during document listing."""
# Arrange
mocker.patch.object(in_memory_repository.collection, 'find', side_effect=PyMongoError("Database error"))
# Act
docs = await in_memory_repository.list_documents()
# Assert
assert docs == []
class TestFileDocumentRepositoryUpdate:
"""Tests for document update functionality."""
@pytest.mark.asyncio
async def test_i_can_update_document_successfully(self, in_memory_repository, sample_file_document,
sample_update_data):
"""Test successful document update."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
# Act
updated_doc = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)
# Assert
assert updated_doc is not None
assert updated_doc.file_type == sample_update_data["file_type"]
assert updated_doc.id == created_doc.id
assert updated_doc.filename == created_doc.filename # Unchanged fields remain
@pytest.mark.asyncio
async def test_i_can_update_document_with_partial_data(self, in_memory_repository, sample_file_document):
"""Test updating document with partial data."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
partial_update = {"file_type": FileType("txt")}
# Act
updated_doc = await in_memory_repository.update_document(str(created_doc.id), partial_update)
# Assert
assert updated_doc is not None
assert updated_doc.file_type == FileType("txt")
assert updated_doc.filename == created_doc.filename # Should remain unchanged
assert updated_doc.filepath == created_doc.filepath # Should remain unchanged
@pytest.mark.asyncio
async def test_i_can_update_document_filtering_none_values(self, in_memory_repository, sample_file_document):
"""Test that None values are filtered out from update data."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
update_with_none = {"metadata": {"tags": ["updated", "document"]}, "file_type": None}
# Act
updated_doc = await in_memory_repository.update_document(str(created_doc.id), update_with_none)
# Assert
assert updated_doc is not None
assert updated_doc.metadata == {"tags": ["updated", "document"]}
assert updated_doc.file_type == created_doc.file_type # Should remain unchanged (None filtered out)
@pytest.mark.asyncio
async def test_i_can_update_document_with_empty_data(self, in_memory_repository, sample_file_document):
"""Test updating document with empty data returns current document."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
empty_update = {}
# Act
result = await in_memory_repository.update_document(str(created_doc.id), empty_update)
# Assert
assert result is not None
assert result.filename == created_doc.filename
assert result.file_hash == created_doc.file_hash
assert result.metadata == created_doc.metadata
@pytest.mark.asyncio
async def test_i_cannot_update_document_with_invalid_id(self, in_memory_repository, sample_update_data):
"""Test that updating with invalid ID returns None."""
# Act
result = await in_memory_repository.update_document("invalid_id", sample_update_data)
# Assert
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_update_nonexistent_document(self, in_memory_repository, sample_update_data):
"""Test that updating nonexistent document returns None."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
result = await in_memory_repository.update_document(nonexistent_id, sample_update_data)
# Assert
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_update_document_with_pymongo_error(self, in_memory_repository, sample_file_document,
sample_update_data, mocker):
"""Test handling of PyMongo errors during document update."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
mocker.patch.object(in_memory_repository.collection, 'find_one_and_update',
side_effect=PyMongoError("Database error"))
# Act
result = await in_memory_repository.update_document(str(created_doc.id), sample_update_data)
# Assert
assert result is None
class TestFileDocumentRepositoryDeletion:
"""Tests for document deletion functionality."""
@pytest.mark.asyncio
async def test_i_can_delete_existing_document(self, in_memory_repository, sample_file_document):
"""Test successful document deletion."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
# Act
deletion_result = await in_memory_repository.delete_document(str(created_doc.id))
# Assert
assert deletion_result is True
# Verify document is actually deleted
found_doc = await in_memory_repository.find_document_by_id(str(created_doc.id))
assert found_doc is None
@pytest.mark.asyncio
async def test_i_cannot_delete_document_with_invalid_id(self, in_memory_repository):
"""Test that deleting with invalid ID returns False."""
# Act
result = await in_memory_repository.delete_document("invalid_id")
# Assert
assert result is False
@pytest.mark.asyncio
async def test_i_cannot_delete_nonexistent_document(self, in_memory_repository):
"""Test that deleting nonexistent document returns False."""
# Arrange
nonexistent_id = str(ObjectId())
# Act
result = await in_memory_repository.delete_document(nonexistent_id)
# Assert
assert result is False
@pytest.mark.asyncio
async def test_i_cannot_delete_document_with_pymongo_error(self, in_memory_repository, sample_file_document, mocker):
"""Test handling of PyMongo errors during document deletion."""
# Arrange
created_doc = await in_memory_repository.create_document(sample_file_document)
mocker.patch.object(in_memory_repository.collection, 'delete_one', side_effect=PyMongoError("Database error"))
# Act
result = await in_memory_repository.delete_document(str(created_doc.id))
# Assert
assert result is False
class TestFileDocumentRepositoryUtilities:
"""Tests for utility methods."""
@pytest.mark.asyncio
async def test_i_can_count_documents(self, in_memory_repository, sample_file_document):
"""Test counting documents."""
# Arrange
initial_count = await in_memory_repository.count_documents()
await in_memory_repository.create_document(sample_file_document)
# Act
final_count = await in_memory_repository.count_documents()
# Assert
assert final_count == initial_count + 1
@pytest.mark.asyncio
async def test_i_can_count_zero_documents(self, in_memory_repository):
"""Test counting documents in empty collection."""
# Act
count = await in_memory_repository.count_documents()
# Assert
assert count == 0

View File

@@ -1,697 +0,0 @@
"""
Unit tests for DocumentService using in-memory MongoDB.
Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import pytest
import pytest_asyncio
from unittest.mock import Mock, patch
from datetime import datetime
from bson import ObjectId
from pathlib import Path
from mongomock_motor import AsyncMongoMockClient
from app.services.document_service import DocumentService
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
from app.models.types import PyObjectId
@pytest_asyncio.fixture
async def in_memory_file_repository():
"""Create an in-memory FileDocumentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
await repo.initialize()
return repo
@pytest_asyncio.fixture
async def in_memory_content_repository():
"""Create an in-memory DocumentContentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = DocumentContentRepository(db)
await repo.initialize()
return repo
@pytest_asyncio.fixture
async def in_memory_database():
"""Create an in-memory database for testing."""
client = AsyncMongoMockClient()
return client.test_database
@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
"""Create DocumentService with in-memory repositories."""
with patch('app.services.document_service.get_database', return_value=in_memory_database):
service = DocumentService()
service.file_repository = in_memory_file_repository
service.content_repository = in_memory_content_repository
return service
@pytest.fixture
def sample_file_bytes():
"""Sample file content as bytes."""
return b"This is a test PDF content"
@pytest.fixture
def sample_text_bytes():
"""Sample text file content as bytes."""
return b"This is a test text file content"
@pytest.fixture
def sample_file_hash():
"""Expected SHA256 hash for sample file bytes."""
import hashlib
return hashlib.sha256(b"This is a test PDF content").hexdigest()
@pytest.fixture
def sample_file_document():
"""Sample FileDocument for testing."""
return FileDocument(
id=ObjectId(),
filename="test.pdf",
filepath="/test/test.pdf",
file_type=FileType.PDF,
extraction_method=None,
metadata={},
detected_at=datetime(2024, 1, 15, 10, 30, 0),
file_hash="test_hash"
)
class TestCreateDocument:
"""Tests for create_document method."""
@patch('app.services.document_service.magic.from_buffer')
@patch('app.services.document_service.datetime')
@pytest.mark.asyncio
async def test_i_can_create_document_with_new_content(
self,
mock_datetime,
mock_magic,
document_service,
sample_file_bytes
):
"""Test creating document when content doesn't exist yet."""
# Setup mocks
fixed_time = datetime(2024, 1, 15, 10, 30, 0)
mock_datetime.utcnow.return_value = fixed_time
mock_magic.return_value = "application/pdf"
# Execute
result = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Verify document creation
assert result is not None
assert result.filename == "test.pdf"
assert result.filepath == "/test/test.pdf"
assert result.file_type == FileType.PDF
assert result.detected_at == fixed_time
assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)
# Verify content was created
content = await document_service.content_repository.find_document_content_by_file_hash(
result.file_hash
)
assert content is not None
assert content.file_hash == result.file_hash
assert content.file_size == len(sample_file_bytes)
assert content.mime_type == "application/pdf"
assert content.encoding == "utf-8"
@patch('app.services.document_service.magic.from_buffer')
@patch('app.services.document_service.datetime')
@pytest.mark.asyncio
async def test_i_can_create_document_with_existing_content(
self,
mock_datetime,
mock_magic,
document_service,
sample_file_bytes
):
"""Test creating document when content already exists (deduplication)."""
# Setup mocks
fixed_time = datetime(2024, 1, 15, 10, 30, 0)
mock_datetime.utcnow.return_value = fixed_time
mock_magic.return_value = "application/pdf"
# Create first document
first_doc = await document_service.create_document(
"/test/first.pdf",
sample_file_bytes,
"utf-8"
)
# Create second document with same content
second_doc = await document_service.create_document(
"/test/second.pdf",
sample_file_bytes,
"utf-8"
)
# Verify both documents exist but share same hash
assert first_doc.file_hash == second_doc.file_hash
assert first_doc.filename != second_doc.filename
assert first_doc.filepath != second_doc.filepath
# Verify only one content document exists
all_content = await document_service.content_repository.list_document_content()
content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
assert len(content_for_hash) == 1
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_create_document_with_different_encodings(
self,
mock_magic,
document_service,
sample_text_bytes
):
"""Test creating documents with different text encodings."""
# Setup
mock_magic.return_value = "text/plain"
# Test with different encodings
encodings = ["utf-8", "latin-1", "ascii"]
for i, encoding in enumerate(encodings):
result = await document_service.create_document(
f"/test/test{i}.txt",
sample_text_bytes,
encoding
)
# Verify document was created
assert result is not None
assert result.file_type == FileType.TXT
# Verify content has correct encoding
content = await document_service.content_repository.find_document_content_by_file_hash(
result.file_hash
)
assert content.encoding == encoding
@pytest.mark.asyncio
async def test_i_cannot_create_document_with_unsupported_file_type(
self,
document_service,
sample_file_bytes
):
"""Test that unsupported file types raise ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"):
await document_service.create_document(
"/test/test.xyz", # Unsupported extension
sample_file_bytes,
"utf-8"
)
@pytest.mark.asyncio
async def test_i_cannot_create_document_with_empty_file_path(
self,
document_service,
sample_file_bytes
):
"""Test that empty file path raises ValueError."""
with pytest.raises(ValueError):
await document_service.create_document(
"", # Empty path
sample_file_bytes,
"utf-8"
)
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_create_document_with_empty_bytes(
self,
mock_magic,
document_service
):
"""Test behavior with empty file bytes."""
# Setup
mock_magic.return_value = "text/plain"
# Execute with empty bytes
result = await document_service.create_document(
"/test/empty.txt",
b"", # Empty bytes
"utf-8"
)
# Should still work but with zero file size
assert result is not None
content = await document_service.content_repository.find_document_content_by_file_hash(
result.file_hash
)
assert content.file_size == 0
class TestGetMethods:
"""Tests for document retrieval methods."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_id(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by ID."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_id(created_doc.id)
# Verify
assert result is not None
assert result.id == created_doc.id
assert result.filename == created_doc.filename
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_hash(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by file hash."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_hash(created_doc.file_hash)
# Verify
assert result is not None
assert result.file_hash == created_doc.file_hash
assert result.filename == created_doc.filename
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_by_filepath(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document by file path."""
# Setup
mock_magic.return_value = "application/pdf"
test_path = "/test/unique_test.pdf"
# Create a document first
created_doc = await document_service.create_document(
test_path,
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_by_filepath(test_path)
# Verify
assert result is not None
assert result.filepath == test_path
assert result.id == created_doc.id
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_get_document_with_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test retrieving document with associated content."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute
result = await document_service.get_document_with_content(created_doc.id)
# Verify
assert result is not None
document, content = result
assert document.id == created_doc.id
assert content is not None
assert content.file_hash == created_doc.file_hash
@pytest.mark.asyncio
async def test_i_cannot_get_nonexistent_document_by_id(
self,
document_service
):
"""Test that nonexistent document returns None."""
# Execute with random ObjectId
result = await document_service.get_document_by_id(ObjectId())
# Verify
assert result is None
@pytest.mark.asyncio
async def test_i_cannot_get_nonexistent_document_by_hash(
self,
document_service
):
"""Test that nonexistent document hash returns None."""
# Execute
result = await document_service.get_document_by_hash("nonexistent_hash")
# Verify
assert result is None
class TestPaginationAndCounting:
"""Tests for document listing and counting."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_list_documents_with_pagination(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test document listing with pagination parameters."""
# Setup
mock_magic.return_value = "application/pdf"
# Create multiple documents
for i in range(5):
await document_service.create_document(
f"/test/test{i}.pdf",
sample_file_bytes + bytes(str(i), 'utf-8'), # Make each file unique
"utf-8"
)
# Execute with pagination
result = await document_service.list_documents(skip=1, limit=2)
# Verify
assert len(result) == 2
# Test counting
total_count = await document_service.count_documents()
assert total_count == 5
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_count_documents(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test document counting."""
# Setup
mock_magic.return_value = "text/plain"
# Initially should be 0
initial_count = await document_service.count_documents()
assert initial_count == 0
# Create some documents
for i in range(3):
await document_service.create_document(
f"/test/test{i}.txt",
sample_file_bytes + bytes(str(i), 'utf-8'),
"utf-8"
)
# Execute
final_count = await document_service.count_documents()
# Verify
assert final_count == 3
class TestUpdateAndDelete:
"""Tests for document update and deletion operations."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_update_document_metadata(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test updating document metadata."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Execute update
update_data = {"metadata": {"page_count": 5}}
result = await document_service.update_document(created_doc.id, update_data)
# Verify
assert result is not None
assert result.metadata.get("page_count") == 5
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_delete_document_and_orphaned_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test deleting document with orphaned content cleanup."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Verify content exists
content_before = await document_service.content_repository.find_document_content_by_file_hash(
created_doc.file_hash
)
assert content_before is not None
# Execute deletion
result = await document_service.delete_document(created_doc.id)
# Verify document and content are deleted
assert result is True
deleted_doc = await document_service.get_document_by_id(created_doc.id)
assert deleted_doc is None
content_after = await document_service.content_repository.find_document_content_by_file_hash(
created_doc.file_hash
)
assert content_after is None
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_delete_document_without_affecting_shared_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test deleting document without removing shared content."""
# Setup
mock_magic.return_value = "application/pdf"
# Create two documents with same content
doc1 = await document_service.create_document(
"/test/test1.pdf",
sample_file_bytes,
"utf-8"
)
doc2 = await document_service.create_document(
"/test/test2.pdf",
sample_file_bytes,
"utf-8"
)
# They should share the same hash
assert doc1.file_hash == doc2.file_hash
# Delete first document
result = await document_service.delete_document(doc1.id)
assert result is True
# Verify first document is deleted but content still exists
deleted_doc = await document_service.get_document_by_id(doc1.id)
assert deleted_doc is None
remaining_doc = await document_service.get_document_by_id(doc2.id)
assert remaining_doc is not None
content = await document_service.content_repository.find_document_content_by_file_hash(
doc2.file_hash
)
assert content is not None
class TestUtilityMethods:
"""Tests for utility methods."""
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_check_content_exists(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test checking if content exists by hash."""
# Setup
mock_magic.return_value = "application/pdf"
# Initially content doesn't exist
test_hash = "nonexistent_hash"
exists_before = await document_service.content_exists(test_hash)
assert exists_before is False
# Create a document
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Now content should exist
exists_after = await document_service.content_exists(created_doc.file_hash)
assert exists_after is True
@patch('app.services.document_service.magic.from_buffer')
@pytest.mark.asyncio
async def test_i_can_update_document_content(
self,
mock_magic,
document_service,
sample_file_bytes
):
"""Test updating extracted document content."""
# Setup
mock_magic.return_value = "application/pdf"
# Create a document first
created_doc = await document_service.create_document(
"/test/test.pdf",
sample_file_bytes,
"utf-8"
)
# Update content
new_content = "Updated extracted content"
result = await document_service.update_document_content(
created_doc.file_hash,
new_content
)
# Verify update
assert result is not None
assert result.content == new_content
# Verify persistence
updated_content = await document_service.content_repository.find_document_content_by_file_hash(
created_doc.file_hash
)
assert updated_content.content == new_content
class TestHashCalculation:
"""Tests for file hash calculation utility."""
def test_i_can_calculate_consistent_file_hash(self, document_service):
"""Test that file hash calculation is consistent."""
test_bytes = b"Test content for hashing"
# Calculate hash multiple times
hash1 = document_service._calculate_file_hash(test_bytes)
hash2 = document_service._calculate_file_hash(test_bytes)
# Should be identical
assert hash1 == hash2
assert len(hash1) == 64 # SHA256 produces 64-character hex string
def test_i_get_different_hashes_for_different_content(self, document_service):
"""Test that different content produces different hashes."""
content1 = b"First content"
content2 = b"Second content"
hash1 = document_service._calculate_file_hash(content1)
hash2 = document_service._calculate_file_hash(content2)
assert hash1 != hash2
class TestFileTypeDetection:
"""Tests for file type detection."""
def test_i_can_detect_pdf_file_type(self, document_service):
"""Test PDF file type detection."""
file_type = document_service._detect_file_type("/path/to/document.pdf")
assert file_type == FileType.PDF
def test_i_can_detect_txt_file_type(self, document_service):
"""Test text file type detection."""
file_type = document_service._detect_file_type("/path/to/document.txt")
assert file_type == FileType.TXT
def test_i_can_detect_docx_file_type(self, document_service):
"""Test DOCX file type detection."""
file_type = document_service._detect_file_type("/path/to/document.docx")
assert file_type == FileType.DOCX
def test_i_cannot_detect_unsupported_file_type(self, document_service):
"""Test unsupported file type raises ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"):
document_service._detect_file_type("/path/to/document.xyz")

0
tests/utils/__init__.py Normal file
View File

View File

@@ -14,6 +14,8 @@ def get_doc(filename: str = None):
file_hash="a1b2c3d4e5f6789012345678901234567890abcdef1234567890abcdef123456",
file_type=FileType(os.path.splitext(filename)[1].lstrip(".") or "txt"),
detected_at=datetime.now(),
file_size=1024,
mime_type="application/pdf"
)