Adding document service

This commit is contained in:
2025-09-19 22:59:41 +02:00
parent e8b306ac4a
commit f1b551d243
13 changed files with 1734 additions and 24 deletions

View File

@@ -0,0 +1,311 @@
"""
Test suite for DocumentContentRepository with async/await support.
This module contains comprehensive tests for all DocumentContentRepository methods
using mongomock-motor for in-memory MongoDB testing.
"""
import pytest
import hashlib
from datetime import datetime
import pytest_asyncio
from bson import ObjectId
from pymongo.errors import DuplicateKeyError
from mongomock_motor import AsyncMongoMockClient
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import DocumentContent
@pytest_asyncio.fixture
async def in_memory_repository():
    """Provide an initialized DocumentContentRepository backed by mongomock.

    A new in-memory client is created on every invocation and the
    repository's ``initialize`` coroutine is awaited before it is returned.
    """
    database = AsyncMongoMockClient().test_database
    repository = DocumentContentRepository(database)
    await repository.initialize()
    return repository
@pytest.fixture
def sample_document_content():
    """Build a plain-text DocumentContent whose hash and size derive from its text."""
    text = "This is sample document content for testing purposes."
    raw = text.encode()  # encoded once; feeds both the digest and the size
    digest = hashlib.sha256(raw).hexdigest()
    return DocumentContent(
        file_hash=digest,
        content=text,
        encoding="utf-8",
        file_size=len(raw),
        mime_type="text/plain",
    )
@pytest.fixture
def another_document_content():
    """Build a second plain-text DocumentContent with a different text and hash."""
    text = "This is another sample document with different content."
    raw = text.encode()  # encoded once; feeds both the digest and the size
    digest = hashlib.sha256(raw).hexdigest()
    return DocumentContent(
        file_hash=digest,
        content=text,
        encoding="utf-8",
        file_size=len(raw),
        mime_type="text/plain",
    )
class TestDocumentContentRepositoryCreation:
    """Creation scenarios for DocumentContentRepository."""

    @pytest.mark.asyncio
    async def test_i_can_create_document_content(self, in_memory_repository, sample_document_content):
        """The created record echoes every submitted field and carries an id."""
        expected = sample_document_content

        stored = await in_memory_repository.create_document_content(expected)

        assert stored is not None
        assert stored.file_hash == expected.file_hash
        assert stored.content == expected.content
        assert stored.encoding == expected.encoding
        assert stored.file_size == expected.file_size
        assert stored.mime_type == expected.mime_type
        assert stored.id is not None

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_content_with_duplicate_file_hash(
        self, in_memory_repository, sample_document_content
    ):
        """Creating the same file_hash twice is expected to raise DuplicateKeyError."""
        # First insert establishes the record the second one collides with.
        await in_memory_repository.create_document_content(sample_document_content)

        with pytest.raises(DuplicateKeyError) as raised:
            await in_memory_repository.create_document_content(sample_document_content)

        # Checked after the context manager so the assertion actually runs.
        assert "already exists" in str(raised.value)
class TestDocumentContentRepositoryFinding:
    """Lookup scenarios for DocumentContentRepository (by id and by file hash)."""

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_id(self, in_memory_repository, sample_document_content):
        """A stored content can be fetched back through its stringified id."""
        stored = await in_memory_repository.create_document_content(sample_document_content)
        content_id = str(stored.id)

        fetched = await in_memory_repository.find_document_content_by_id(content_id)

        assert fetched is not None
        assert fetched.id == stored.id
        assert fetched.file_hash == stored.file_hash
        assert fetched.content == stored.content

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_invalid_id(self, in_memory_repository):
        """A string that is not a valid ObjectId is expected to yield None."""
        fetched = await in_memory_repository.find_document_content_by_id("invalid_id")

        assert fetched is None

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_id(self, in_memory_repository):
        """A well-formed ObjectId that was never stored is expected to yield None."""
        unknown_id = str(ObjectId())

        fetched = await in_memory_repository.find_document_content_by_id(unknown_id)

        assert fetched is None

    @pytest.mark.asyncio
    async def test_i_can_find_document_content_by_file_hash(self, in_memory_repository, sample_document_content):
        """A stored content can be fetched back through its file hash."""
        stored = await in_memory_repository.create_document_content(sample_document_content)
        wanted_hash = sample_document_content.file_hash

        fetched = await in_memory_repository.find_document_content_by_file_hash(wanted_hash)

        assert fetched is not None
        assert fetched.file_hash == stored.file_hash
        assert fetched.id == stored.id

    @pytest.mark.asyncio
    async def test_i_cannot_find_document_content_by_nonexistent_file_hash(self, in_memory_repository):
        """An unknown file hash is expected to yield None."""
        fetched = await in_memory_repository.find_document_content_by_file_hash("nonexistent_hash")

        assert fetched is None
class TestDocumentContentRepositoryUpdate:
    """Update scenarios for DocumentContentRepository."""

    @pytest.mark.asyncio
    async def test_i_can_update_document_content(self, in_memory_repository, sample_document_content):
        """Updating several fields changes them and leaves id and file_hash alone."""
        original = await in_memory_repository.create_document_content(sample_document_content)
        changes = {
            "content": "Updated content for testing",
            "encoding": "utf-16",
            "mime_type": "text/html"
        }

        refreshed = await in_memory_repository.update_document_content(str(original.id), changes)

        assert refreshed is not None
        assert refreshed.content == changes["content"]
        assert refreshed.encoding == changes["encoding"]
        assert refreshed.mime_type == changes["mime_type"]
        # Identity fields are not part of the update payload.
        assert refreshed.id == original.id
        assert refreshed.file_hash == original.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_update_document_content_with_invalid_id(self, in_memory_repository):
        """An id that is not a valid ObjectId is expected to yield None."""
        outcome = await in_memory_repository.update_document_content("invalid_id", {"content": "test"})

        assert outcome is None

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_partial_data(self, in_memory_repository, sample_document_content):
        """Updating one field leaves the other fields as they were created."""
        original = await in_memory_repository.create_document_content(sample_document_content)
        changes = {"encoding": "iso-8859-1"}

        refreshed = await in_memory_repository.update_document_content(str(original.id), changes)

        assert refreshed is not None
        assert refreshed.encoding == "iso-8859-1"
        # Fields missing from the payload keep their created values.
        assert refreshed.content == original.content
        assert refreshed.mime_type == original.mime_type

    @pytest.mark.asyncio
    async def test_i_can_update_document_content_with_empty_data(self, in_memory_repository, sample_document_content):
        """An empty update payload is expected to return the content unchanged."""
        original = await in_memory_repository.create_document_content(sample_document_content)
        no_changes = {}

        outcome = await in_memory_repository.update_document_content(str(original.id), no_changes)

        assert outcome is not None
        assert outcome.content == original.content
        assert outcome.encoding == original.encoding
        assert outcome.mime_type == original.mime_type
class TestDocumentContentRepositoryDeletion:
    """Deletion scenarios for DocumentContentRepository."""

    @pytest.mark.asyncio
    async def test_i_can_delete_document_content(self, in_memory_repository, sample_document_content):
        """Deleting a stored content returns True and the record is gone afterwards."""
        stored = await in_memory_repository.create_document_content(sample_document_content)
        content_id = str(stored.id)

        was_deleted = await in_memory_repository.delete_document_content(content_id)

        assert was_deleted is True
        # A follow-up lookup confirms the record no longer exists.
        lookup = await in_memory_repository.find_document_content_by_id(content_id)
        assert lookup is None

    @pytest.mark.asyncio
    async def test_i_cannot_delete_document_content_with_invalid_id(self, in_memory_repository):
        """An id that is not a valid ObjectId is expected to yield False."""
        was_deleted = await in_memory_repository.delete_document_content("invalid_id")

        assert was_deleted is False

    @pytest.mark.asyncio
    async def test_i_cannot_delete_nonexistent_document_content(self, in_memory_repository):
        """A well-formed ObjectId that was never stored is expected to yield False."""
        unknown_id = str(ObjectId())

        was_deleted = await in_memory_repository.delete_document_content(unknown_id)

        assert was_deleted is False
class TestDocumentContentRepositoryUtilities:
    """Existence check, listing and counting scenarios."""

    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(self, in_memory_repository, sample_document_content):
        """content_exists is True for a stored hash and False for an unknown one."""
        await in_memory_repository.create_document_content(sample_document_content)

        known = await in_memory_repository.content_exists(sample_document_content.file_hash)
        unknown = await in_memory_repository.content_exists("nonexistent_hash")

        assert known is True
        assert unknown is False

    @pytest.mark.asyncio
    async def test_i_can_list_document_contents(
        self, in_memory_repository, sample_document_content, another_document_content
    ):
        """Listing returns every stored content, and honours skip/limit."""
        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        everything = await in_memory_repository.list_document_contents()
        first_page = await in_memory_repository.list_document_contents(skip=0, limit=1)

        assert len(everything) == 2
        assert len(first_page) == 1
        assert all(isinstance(item, DocumentContent) for item in everything)

    @pytest.mark.asyncio
    async def test_i_can_count_document_contents(
        self, in_memory_repository, sample_document_content, another_document_content
    ):
        """The count grows by two after two contents are created."""
        count_before = await in_memory_repository.count_document_contents()
        await in_memory_repository.create_document_content(sample_document_content)
        await in_memory_repository.create_document_content(another_document_content)

        count_after = await in_memory_repository.count_document_contents()

        assert count_after == count_before + 2

View File

@@ -23,9 +23,9 @@ async def in_memory_repository():
"""Create an in-memory FileDocumentRepository for testing."""
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository()
repo.db = db
repo.collection = db.files
repo = FileDocumentRepository(db)
# repo.db = db
# repo.collection = db.files
await repo.initialize()
return repo
@@ -87,12 +87,15 @@ class TestFileDocumentRepositoryInitialization:
async def test_i_can_initialize_repository(self):
"""Test repository initialization."""
# Arrange
repo = FileDocumentRepository()
client = AsyncMongoMockClient()
db = client.test_database
repo = FileDocumentRepository(db)
await repo.initialize()
# Act & Assert (should not raise any exception)
assert repo.db is not None
assert repo.collection is not None
# TODO: check that the indexes are created
class TestFileDocumentRepositoryCreation:

View File

@@ -0,0 +1,697 @@
"""
Unit tests for DocumentService using in-memory MongoDB.
Tests the orchestration logic with real MongoDB operations
using mongomock for better integration testing.
"""
import pytest
import pytest_asyncio
from unittest.mock import Mock, patch
from datetime import datetime
from bson import ObjectId
from pathlib import Path
from mongomock_motor import AsyncMongoMockClient
from app.services.document_service import DocumentService
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import FileDocument, DocumentContent, FileType, ExtractionMethod
from app.models.types import PyObjectId
@pytest_asyncio.fixture
async def in_memory_file_repository():
    """Provide an initialized FileDocumentRepository backed by mongomock.

    A new in-memory client is created on every invocation and the
    repository's ``initialize`` coroutine is awaited before it is returned.
    """
    database = AsyncMongoMockClient().test_database
    repository = FileDocumentRepository(database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_content_repository():
    """Provide an initialized DocumentContentRepository backed by mongomock.

    It uses its own in-memory client, created inside this fixture.
    """
    database = AsyncMongoMockClient().test_database
    repository = DocumentContentRepository(database)
    await repository.initialize()
    return repository
@pytest_asyncio.fixture
async def in_memory_database():
    """Provide the ``test_database`` handle of a new mongomock client."""
    mock_client = AsyncMongoMockClient()
    database = mock_client.test_database
    return database
@pytest_asyncio.fixture
async def document_service(in_memory_file_repository, in_memory_content_repository, in_memory_database):
    """Assemble a DocumentService wired to the in-memory repositories.

    ``get_database`` is patched to return the in-memory database while the
    service is constructed and its repositories are replaced by the
    mongomock-backed fixtures.
    """
    database_patch = patch(
        'app.services.document_service.get_database',
        return_value=in_memory_database,
    )
    with database_patch:
        service = DocumentService()
        # Swap in the fixture repositories so tests can inspect them directly.
        service.file_repository = in_memory_file_repository
        service.content_repository = in_memory_content_repository
    return service
@pytest.fixture
def sample_file_bytes():
    """Raw bytes used as the payload of a fake PDF file."""
    payload = b"This is a test PDF content"
    return payload
@pytest.fixture
def sample_text_bytes():
    """Raw bytes used as the payload of a fake text file."""
    payload = b"This is a test text file content"
    return payload
@pytest.fixture
def sample_file_hash():
    """Hex SHA-256 digest of the same bytes literal the sample_file_bytes fixture returns."""
    import hashlib

    hasher = hashlib.sha256()
    hasher.update(b"This is a test PDF content")
    return hasher.hexdigest()
@pytest.fixture
def sample_file_document():
    """Build a PDF FileDocument with a fresh ObjectId and a fixed detection time."""
    document_id = ObjectId()
    detection_time = datetime(2024, 1, 15, 10, 30, 0)
    return FileDocument(
        id=document_id,
        filename="test.pdf",
        filepath="/test/test.pdf",
        file_type=FileType.PDF,
        extraction_method=None,
        metadata={},
        detected_at=detection_time,
        file_hash="test_hash",
    )
class TestCreateDocument:
    """Tests for DocumentService.create_document.

    ``magic.from_buffer`` is patched wherever a document is actually created
    so MIME detection is deterministic; tests that assert on ``detected_at``
    additionally patch the service module's ``datetime``.
    """

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_new_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Creating a document for unseen bytes stores both the document and its content."""
        # Setup mocks (patch injects them bottom-up: datetime first, then magic)
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Execute
        result = await document_service.create_document(
            "/test/test.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify document creation
        assert result is not None
        assert result.filename == "test.pdf"
        assert result.filepath == "/test/test.pdf"
        assert result.file_type == FileType.PDF
        assert result.detected_at == fixed_time
        assert result.file_hash == document_service._calculate_file_hash(sample_file_bytes)

        # Verify content was created
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content is not None
        assert content.file_hash == result.file_hash
        assert content.file_size == len(sample_file_bytes)
        assert content.mime_type == "application/pdf"
        assert content.encoding == "utf-8"

    @patch('app.services.document_service.magic.from_buffer')
    @patch('app.services.document_service.datetime')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_existing_content(
        self,
        mock_datetime,
        mock_magic,
        document_service,
        sample_file_bytes
    ):
        """Two documents with identical bytes share a single content record (deduplication)."""
        # Setup mocks
        fixed_time = datetime(2024, 1, 15, 10, 30, 0)
        mock_datetime.utcnow.return_value = fixed_time
        mock_magic.return_value = "application/pdf"

        # Create first document
        first_doc = await document_service.create_document(
            "/test/first.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Create second document with same content
        second_doc = await document_service.create_document(
            "/test/second.pdf",
            sample_file_bytes,
            "utf-8"
        )

        # Verify both documents exist but share same hash
        assert first_doc.file_hash == second_doc.file_hash
        assert first_doc.filename != second_doc.filename
        assert first_doc.filepath != second_doc.filepath

        # Verify only one content document exists.
        # The repository's listing method is list_document_contents (plural),
        # as exercised by the DocumentContentRepository test suite.
        all_content = await document_service.content_repository.list_document_contents()
        content_for_hash = [c for c in all_content if c.file_hash == first_doc.file_hash]
        assert len(content_for_hash) == 1

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_different_encodings(
        self,
        mock_magic,
        document_service,
        sample_text_bytes
    ):
        """Each created document's content records the encoding it was created with."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Test with different encodings
        encodings = ["utf-8", "latin-1", "ascii"]

        for i, encoding in enumerate(encodings):
            # Give each encoding its own payload: identical bytes share one
            # file hash, and deduplicated content would presumably keep the
            # encoding it was first stored with, failing the assertion below.
            unique_bytes = sample_text_bytes + bytes(str(i), 'utf-8')

            result = await document_service.create_document(
                f"/test/test{i}.txt",
                unique_bytes,
                encoding
            )

            # Verify document was created
            assert result is not None
            assert result.file_type == FileType.TXT

            # Verify content has correct encoding
            content = await document_service.content_repository.find_document_content_by_file_hash(
                result.file_hash
            )
            assert content.encoding == encoding

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_unsupported_file_type(
        self,
        document_service,
        sample_file_bytes
    ):
        """An unsupported file extension is expected to raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            await document_service.create_document(
                "/test/test.xyz",  # Unsupported extension
                sample_file_bytes,
                "utf-8"
            )

    @pytest.mark.asyncio
    async def test_i_cannot_create_document_with_empty_file_path(
        self,
        document_service,
        sample_file_bytes
    ):
        """An empty file path is expected to raise ValueError."""
        with pytest.raises(ValueError):
            await document_service.create_document(
                "",  # Empty path
                sample_file_bytes,
                "utf-8"
            )

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_create_document_with_empty_bytes(
        self,
        mock_magic,
        document_service
    ):
        """Empty file bytes still produce a document, with a zero-size content record."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Execute with empty bytes
        result = await document_service.create_document(
            "/test/empty.txt",
            b"",  # Empty bytes
            "utf-8"
        )

        # Should still work but with zero file size
        assert result is not None
        content = await document_service.content_repository.find_document_content_by_file_hash(
            result.file_hash
        )
        assert content.file_size == 0
class TestGetMethods:
    """Retrieval scenarios for DocumentService (by id, hash, path, and with content)."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_id(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched back through its id."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = await document_service.get_document_by_id(stored.id)

        assert fetched is not None
        assert fetched.id == stored.id
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_hash(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched back through its file hash."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        fetched = await document_service.get_document_by_hash(stored.file_hash)

        assert fetched is not None
        assert fetched.file_hash == stored.file_hash
        assert fetched.filename == stored.filename

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_by_filepath(self, mock_magic, document_service, sample_file_bytes):
        """A created document can be fetched back through its file path."""
        mock_magic.return_value = "application/pdf"
        target_path = "/test/unique_test.pdf"
        stored = await document_service.create_document(target_path, sample_file_bytes, "utf-8")

        fetched = await document_service.get_document_by_filepath(target_path)

        assert fetched is not None
        assert fetched.filepath == target_path
        assert fetched.id == stored.id

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_get_document_with_content(self, mock_magic, document_service, sample_file_bytes):
        """get_document_with_content yields a (document, content) pair for a created document."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        pair = await document_service.get_document_with_content(stored.id)

        assert pair is not None
        fetched_document, fetched_content = pair
        assert fetched_document.id == stored.id
        assert fetched_content is not None
        assert fetched_content.file_hash == stored.file_hash

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_id(self, document_service):
        """A freshly generated ObjectId matches no document, so None is expected."""
        unknown_id = ObjectId()

        fetched = await document_service.get_document_by_id(unknown_id)

        assert fetched is None

    @pytest.mark.asyncio
    async def test_i_cannot_get_nonexistent_document_by_hash(self, document_service):
        """An unknown file hash is expected to yield None."""
        fetched = await document_service.get_document_by_hash("nonexistent_hash")

        assert fetched is None
class TestPaginationAndCounting:
    """Listing and counting scenarios for DocumentService."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_list_documents_with_pagination(self, mock_magic, document_service, sample_file_bytes):
        """skip/limit bound the returned page while the total count covers every document."""
        mock_magic.return_value = "application/pdf"

        # Five documents, each with a distinct payload and path.
        for i in range(5):
            unique_payload = sample_file_bytes + bytes(str(i), 'utf-8')
            await document_service.create_document(f"/test/test{i}.pdf", unique_payload, "utf-8")

        page = await document_service.list_documents(skip=1, limit=2)
        assert len(page) == 2

        total = await document_service.count_documents()
        assert total == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_count_documents(self, mock_magic, document_service, sample_file_bytes):
        """The count starts at zero and reflects the number of documents created."""
        mock_magic.return_value = "text/plain"

        # Nothing has been stored yet.
        count_before = await document_service.count_documents()
        assert count_before == 0

        for i in range(3):
            unique_payload = sample_file_bytes + bytes(str(i), 'utf-8')
            await document_service.create_document(f"/test/test{i}.txt", unique_payload, "utf-8")

        count_after = await document_service.count_documents()
        assert count_after == 3
class TestUpdateAndDelete:
    """Update and deletion scenarios for DocumentService."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_metadata(self, mock_magic, document_service, sample_file_bytes):
        """Updating the metadata of a created document is reflected in the returned document."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        changes = {"metadata": {"page_count": 5}}
        refreshed = await document_service.update_document(stored.id, changes)

        assert refreshed is not None
        assert refreshed.metadata.get("page_count") == 5

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_and_orphaned_content(self, mock_magic, document_service, sample_file_bytes):
        """Deleting the only document for a hash also removes its content record."""
        mock_magic.return_value = "application/pdf"
        content_repository = document_service.content_repository
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        # The content record must exist before the deletion is attempted.
        before = await content_repository.find_document_content_by_file_hash(stored.file_hash)
        assert before is not None

        was_deleted = await document_service.delete_document(stored.id)
        assert was_deleted is True

        # Neither the document nor its content can be found afterwards.
        lookup = await document_service.get_document_by_id(stored.id)
        assert lookup is None
        after = await content_repository.find_document_content_by_file_hash(stored.file_hash)
        assert after is None

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_delete_document_without_affecting_shared_content(
        self, mock_magic, document_service, sample_file_bytes
    ):
        """Deleting one of two documents sharing a hash keeps the other and the content."""
        mock_magic.return_value = "application/pdf"

        # Same bytes under two paths, hence one shared hash.
        first = await document_service.create_document("/test/test1.pdf", sample_file_bytes, "utf-8")
        second = await document_service.create_document("/test/test2.pdf", sample_file_bytes, "utf-8")
        assert first.file_hash == second.file_hash

        was_deleted = await document_service.delete_document(first.id)
        assert was_deleted is True

        gone = await document_service.get_document_by_id(first.id)
        assert gone is None

        survivor = await document_service.get_document_by_id(second.id)
        assert survivor is not None

        shared_content = await document_service.content_repository.find_document_content_by_file_hash(
            second.file_hash
        )
        assert shared_content is not None
class TestUtilityMethods:
    """Scenarios for DocumentService.content_exists and update_document_content."""

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_check_content_exists(self, mock_magic, document_service, sample_file_bytes):
        """content_exists is False for an unknown hash and True once a document is created."""
        mock_magic.return_value = "application/pdf"

        # An arbitrary hash that was never stored.
        unknown_hash = "nonexistent_hash"
        found_before = await document_service.content_exists(unknown_hash)
        assert found_before is False

        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        found_after = await document_service.content_exists(stored.file_hash)
        assert found_after is True

    @patch('app.services.document_service.magic.from_buffer')
    @pytest.mark.asyncio
    async def test_i_can_update_document_content(self, mock_magic, document_service, sample_file_bytes):
        """Updated content is returned by the service and visible through the repository."""
        mock_magic.return_value = "application/pdf"
        stored = await document_service.create_document("/test/test.pdf", sample_file_bytes, "utf-8")

        replacement_text = "Updated extracted content"
        outcome = await document_service.update_document_content(stored.file_hash, replacement_text)

        assert outcome is not None
        assert outcome.content == replacement_text

        # Read the record back to confirm the change was persisted.
        persisted = await document_service.content_repository.find_document_content_by_file_hash(
            stored.file_hash
        )
        assert persisted.content == replacement_text
class TestHashCalculation:
    """Scenarios for DocumentService._calculate_file_hash."""

    def test_i_can_calculate_consistent_file_hash(self, document_service):
        """Hashing the same bytes twice gives the same 64-character digest."""
        payload = b"Test content for hashing"

        first_digest = document_service._calculate_file_hash(payload)
        second_digest = document_service._calculate_file_hash(payload)

        assert first_digest == second_digest
        # A hex-encoded SHA-256 digest is 64 characters long.
        assert len(first_digest) == 64

    def test_i_get_different_hashes_for_different_content(self, document_service):
        """Different byte payloads give different digests."""
        digest_a = document_service._calculate_file_hash(b"First content")
        digest_b = document_service._calculate_file_hash(b"Second content")

        assert digest_a != digest_b
class TestFileTypeDetection:
    """Scenarios for DocumentService._detect_file_type."""

    def test_i_can_detect_pdf_file_type(self, document_service):
        """A ``.pdf`` path is detected as FileType.PDF."""
        detected = document_service._detect_file_type("/path/to/document.pdf")

        assert detected == FileType.PDF

    def test_i_can_detect_txt_file_type(self, document_service):
        """A ``.txt`` path is detected as FileType.TXT."""
        detected = document_service._detect_file_type("/path/to/document.txt")

        assert detected == FileType.TXT

    def test_i_can_detect_docx_file_type(self, document_service):
        """A ``.docx`` path is detected as FileType.DOCX."""
        detected = document_service._detect_file_type("/path/to/document.docx")

        assert detected == FileType.DOCX

    def test_i_cannot_detect_unsupported_file_type(self, document_service):
        """An unknown extension is expected to raise ValueError."""
        unsupported_path = "/path/to/document.xyz"

        with pytest.raises(ValueError, match="Unsupported file type"):
            document_service._detect_file_type(unsupported_path)

View File

@@ -3,7 +3,7 @@ from datetime import datetime
import pytest
from app.models.document import FileDocument, FileType
from app.utils.ducment_matching import fuzzy_matching, subsequence_matching
from app.utils.document_matching import fuzzy_matching, subsequence_matching
def get_doc(filename: str = None):