I can put a new file and create the associated pdf
This commit is contained in:
@@ -40,6 +40,8 @@ services:
|
||||
- ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks
|
||||
- ./volumes/watched_files:/watched_files
|
||||
- ./volumes/objects:/objects
|
||||
- ./volumes/errors:/errors
|
||||
- ./volumes/ignored:/ignored
|
||||
depends_on:
|
||||
- redis
|
||||
- mongodb
|
||||
@@ -62,6 +64,8 @@ services:
|
||||
- ./src/file-processor/app:/app/app # <- Added: shared access file-processor app
|
||||
- ./volumes/watched_files:/watched_files
|
||||
- ./volumes/objects:/objects
|
||||
- ./volumes/errors:/errors
|
||||
- ./volumes/ignored:/ignored
|
||||
depends_on:
|
||||
- redis
|
||||
- mongodb
|
||||
|
||||
@@ -106,3 +106,13 @@ def get_watch_folder() -> str:
|
||||
def get_temp_folder() -> str:
|
||||
"""Directory to store temporary files"""
|
||||
return os.getenv("TEMP_DIRECTORY", "/tmp")
|
||||
|
||||
|
||||
def get_errors_folder() -> str:
|
||||
"""Directory to store temporary files"""
|
||||
return os.getenv("ERRORS_DIRECTORY", "/errors")
|
||||
|
||||
|
||||
def get_ignored_folder() -> str:
|
||||
"""Directory to store temporary files"""
|
||||
return os.getenv("IGNORED_DIRECTORY", "/ignored")
|
||||
|
||||
@@ -130,6 +130,47 @@ class FileDocumentRepository:
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
def find_document_with_pdf_hash(self, file_hash: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
Find file document by file hash with a pdf_file_hash set (not None).
|
||||
|
||||
Args:
|
||||
file_hash (str): SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
FileDocument or None: File document if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
file_doc = self.collection.find_one({"file_hash": file_hash,
|
||||
"pdf_file_hash": {"$ne": None}})
|
||||
if file_doc:
|
||||
return FileDocument(**file_doc)
|
||||
return None
|
||||
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
def find_same_document(self, filename: str, file_hash: str):
|
||||
"""
|
||||
Find document with the same file_name and the same file hash
|
||||
|
||||
Args:
|
||||
filename (str):
|
||||
file_hash (str): SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
FileDocument or None: File document if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
file_doc = self.collection.find_one({"file_hash": file_hash,
|
||||
"filename": filename})
|
||||
if file_doc:
|
||||
return FileDocument(**file_doc)
|
||||
return None
|
||||
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
Find file document by exact filepath.
|
||||
|
||||
@@ -30,7 +30,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
|
||||
dispatching Celery tasks, and managing processing jobs.
|
||||
"""
|
||||
|
||||
SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
|
||||
SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx', '.jpg', '.png', '.jpeg'}
|
||||
|
||||
def __init__(self, document_service: DocumentService, job_service: JobService):
|
||||
"""
|
||||
@@ -59,6 +59,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
|
||||
|
||||
if file_extension not in self.SUPPORTED_EXTENSIONS:
|
||||
logger.info(f"Ignoring unsupported file type: {filepath}")
|
||||
self.document_service.move_to_ignored(filepath, "unsupported file type")
|
||||
return
|
||||
|
||||
logger.info(f"Processing new file: {filepath}")
|
||||
|
||||
@@ -49,6 +49,7 @@ class FileDocument(BaseModel):
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
|
||||
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
@@ -6,7 +6,9 @@ while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
@@ -14,13 +16,16 @@ from typing import List, Optional, Dict, Any
|
||||
import magic
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.config.settings import get_objects_folder
|
||||
from app.config.settings import get_objects_folder, get_temp_folder, get_errors_folder, get_ignored_folder
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
FileType,
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
from app.utils.pdf_converter import convert_to_pdf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentService:
|
||||
@@ -31,7 +36,11 @@ class DocumentService:
|
||||
and their content while ensuring data consistency through transactions.
|
||||
"""
|
||||
|
||||
def __init__(self, database, objects_folder: str = None):
|
||||
def __init__(self, database,
|
||||
objects_folder: str = None,
|
||||
temp_folder: str = None,
|
||||
errors_folder: str = None,
|
||||
ignored_folder: str = None):
|
||||
"""
|
||||
Initialize the document service with repository dependencies.
|
||||
|
||||
@@ -43,6 +52,9 @@ class DocumentService:
|
||||
self.db = database
|
||||
self.document_repository = FileDocumentRepository(self.db)
|
||||
self.objects_folder = objects_folder or get_objects_folder()
|
||||
self.temp_folder = temp_folder or get_temp_folder()
|
||||
self.errors_folder = errors_folder or get_errors_folder()
|
||||
self.ignored_folder = ignored_folder or get_ignored_folder()
|
||||
|
||||
def initialize(self):
|
||||
self.document_repository.initialize()
|
||||
@@ -117,6 +129,39 @@ class DocumentService:
|
||||
|
||||
return path.read_bytes()
|
||||
|
||||
@staticmethod
|
||||
def _get_safe_path(file_path):
|
||||
"""
|
||||
If the path already exists, add a suffix to the filename.
|
||||
Increment the suffix until a safe path is found.
|
||||
:param file_path:
|
||||
:return:
|
||||
"""
|
||||
path = Path(file_path)
|
||||
|
||||
# If the path doesn't exist, return it as is
|
||||
if not path.exists():
|
||||
return file_path
|
||||
|
||||
# Split the filename and extension
|
||||
stem = path.stem
|
||||
suffix = path.suffix
|
||||
directory = path.parent
|
||||
|
||||
# Try incrementing numbers until a unique path is found
|
||||
counter = 1
|
||||
while True:
|
||||
# Create new filename with counter
|
||||
new_filename = f"{stem}_{counter}{suffix}"
|
||||
new_path = os.path.join(directory, new_filename)
|
||||
|
||||
# Check if this new path exists
|
||||
if not os.path.exists(new_path):
|
||||
return new_path
|
||||
|
||||
# Increment counter for next attempt
|
||||
counter += 1
|
||||
|
||||
def _get_document_path(self, file_hash):
|
||||
"""
|
||||
|
||||
@@ -125,6 +170,9 @@ class DocumentService:
|
||||
"""
|
||||
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
|
||||
|
||||
def exists(self, file_hash):
|
||||
return os.path.exists(self._get_document_path(file_hash))
|
||||
|
||||
def save_content_if_needed(self, file_hash, content: bytes):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if os.path.exists(target_path):
|
||||
@@ -136,6 +184,18 @@ class DocumentService:
|
||||
with open(target_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
def move_to_errors(self, document_id, file_path):
|
||||
logger.info(f"Moving file {file_path} to error folder")
|
||||
error_file_name = f"{document_id}_{os.path.basename(file_path)}"
|
||||
error_file_path = self._get_safe_path(os.path.join(self.errors_folder, error_file_name))
|
||||
shutil.move(file_path, error_file_path)
|
||||
|
||||
def move_to_ignored(self, file_path, reason="Unknown"):
|
||||
logger.info(f"Moving file {file_path} to ignored folder")
|
||||
ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path)
|
||||
ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
|
||||
shutil.move(file_path, ignored_file_path)
|
||||
|
||||
def create_document(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -171,7 +231,15 @@ class DocumentService:
|
||||
detected_at = datetime.now()
|
||||
|
||||
try:
|
||||
logger.info(f"Creating Document for {file_path}")
|
||||
# Skip the document if it already exists
|
||||
same_document = self.document_repository.find_same_document(filename, file_hash)
|
||||
if same_document is not None:
|
||||
logger.info(f" Document with same hash already exists. Skipping...")
|
||||
self.move_to_ignored(file_path, f"already exists ({same_document.id})")
|
||||
|
||||
self.save_content_if_needed(file_hash, file_bytes)
|
||||
logger.info(f" Saved content to {self._get_document_path(file_hash)}")
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
@@ -188,6 +256,7 @@ class DocumentService:
|
||||
)
|
||||
|
||||
created_file = self.document_repository.create_document(file_data)
|
||||
logger.info(f" Created document with id '{created_file.id}'")
|
||||
|
||||
return created_file
|
||||
|
||||
@@ -195,6 +264,50 @@ class DocumentService:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
|
||||
def create_pdf(self, document_id: PyObjectId):
|
||||
"""
|
||||
For all files, a controlled pdf version will be created for standard visualization and action
|
||||
:return:
|
||||
"""
|
||||
logger.info(f"Creating PDF document for {document_id}")
|
||||
document = self.get_document_by_id(document_id)
|
||||
if document is None:
|
||||
logger.error(f" Document not found")
|
||||
raise ValueError(f"Document {document_id} not found")
|
||||
|
||||
# try to find another document that has the same hash
|
||||
document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
|
||||
|
||||
# the pdf will be created only if it does not exist yet
|
||||
if (document_with_same_hash is not None and
|
||||
document_with_same_hash.pdf_file_hash and
|
||||
self.exists(document_with_same_hash.pdf_file_hash)):
|
||||
logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
|
||||
self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
|
||||
return True
|
||||
|
||||
# get the content of the file
|
||||
logger.info(f" No document with same hash found and valid pdf found. Will create new pdf")
|
||||
file_bytes = self.get_document_content_by_hash(document.file_hash)
|
||||
if file_bytes is None:
|
||||
logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
|
||||
return False
|
||||
|
||||
# create the pdf file
|
||||
temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
|
||||
pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file))
|
||||
self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file))
|
||||
logger.info(f" Created new pdf file with hash {pdf_file_hash}")
|
||||
|
||||
# remove the temporary file
|
||||
os.remove(temp_pdf_file)
|
||||
logger.info(f" Removed temporary pdf file {temp_pdf_file}")
|
||||
|
||||
# update the document
|
||||
self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
|
||||
|
||||
return True
|
||||
|
||||
def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
|
||||
"""
|
||||
Retrieve a document by its ID.
|
||||
@@ -219,6 +332,9 @@ class DocumentService:
|
||||
"""
|
||||
return self.document_repository.find_document_by_hash(file_hash)
|
||||
|
||||
def get_document_with_pdf_hash(self, file_hash) -> Optional[FileDocument]:
|
||||
return self.document_repository.find_document_with_pdf_hash(file_hash)
|
||||
|
||||
def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
Retrieve a document by its file path.
|
||||
|
||||
@@ -111,7 +111,9 @@ class JobService:
|
||||
current_job = self.repository.find_job_by_id(job_id)
|
||||
|
||||
# Validate status transition
|
||||
if current_job.status != ProcessingStatus.PROCESSING:
|
||||
if current_job.status in (ProcessingStatus.PENDING,
|
||||
ProcessingStatus.COMPLETED,
|
||||
ProcessingStatus.FAILED):
|
||||
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
|
||||
|
||||
# Update status
|
||||
@@ -141,7 +143,7 @@ class JobService:
|
||||
current_job = self.repository.find_job_by_id(job_id)
|
||||
|
||||
# Validate status transition
|
||||
if current_job.status != ProcessingStatus.PROCESSING:
|
||||
if current_job.status in (ProcessingStatus.PENDING, ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
|
||||
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
|
||||
|
||||
# Update status with error message
|
||||
@@ -151,6 +153,11 @@ class JobService:
|
||||
error_message
|
||||
)
|
||||
|
||||
def update_job_status(self, job_id: PyObjectId,
|
||||
status: ProcessingStatus,
|
||||
error_message: str = None) -> ProcessingJob:
|
||||
return self.repository.update_job_status(job_id, status, error_message)
|
||||
|
||||
def delete_job(self, job_id: PyObjectId) -> bool:
|
||||
"""
|
||||
Delete a job from the database.
|
||||
|
||||
@@ -20,12 +20,19 @@ def detect_file_type(file_path: str) -> str:
|
||||
UnsupportedFileTypeError: If file type is not supported.
|
||||
"""
|
||||
mime = magic.from_file(file_path, mime=True)
|
||||
extension = Path(file_path).suffix
|
||||
if mime.startswith("text/"):
|
||||
return "text"
|
||||
elif mime.startswith("image/"):
|
||||
return "image"
|
||||
elif mime in ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",):
|
||||
return "word"
|
||||
elif mime == "application/pdf":
|
||||
return "pdf"
|
||||
elif mime == "application/vnd.ms-powerpoint":
|
||||
return "powerpoint"
|
||||
elif mime == "application/octet-stream" and extension in (".jpg", ".jpeg", ".png", ".gif"):
|
||||
return "image"
|
||||
else:
|
||||
raise UnsupportedFileTypeError(f"Unsupported file type: {mime}")
|
||||
|
||||
|
||||
@@ -6,14 +6,14 @@ and update processing job statuses throughout the task lifecycle.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
from app.config import settings
|
||||
from app.database.connection import get_database
|
||||
from app.models.job import ProcessingStatus
|
||||
from app.services.document_service import DocumentService
|
||||
from app.services.job_service import JobService
|
||||
from tasks.common.document_utils import save_as_object
|
||||
from tasks.common.pdf_converter import convert_to_pdf
|
||||
from tasks.main import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -26,7 +26,8 @@ def get_services():
|
||||
return document_service, job_service
|
||||
|
||||
|
||||
@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
|
||||
#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
|
||||
@celery_app.task(bind=True)
|
||||
def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Process a document file and extract its content.
|
||||
@@ -46,41 +47,31 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
Raises:
|
||||
Exception: Any processing error (will trigger retry)
|
||||
"""
|
||||
task_id = self.request.id
|
||||
task_id = self.request.id
|
||||
logger.info(f"Starting document processing task {task_id} for file: {filepath}")
|
||||
|
||||
# get services
|
||||
document_service, job_service = get_services()
|
||||
|
||||
job = None
|
||||
document = None
|
||||
try:
|
||||
# Step 1: Create the document and a new job record for the document
|
||||
document = document_service.create_document(filepath)
|
||||
job = job_service.create_job(task_id=task_id, document_id=document.id)
|
||||
job_service.mark_job_as_started(job_id=job.id)
|
||||
logger.info(f"Task {task_id} created for document {document.id} with file path: {filepath} and job id: {job.id}")
|
||||
logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
|
||||
|
||||
logger.info(f"Job {task_id} marked as PROCESSING")
|
||||
|
||||
raw_file_hash = save_as_object(filepath)
|
||||
logger.info(f"Job {task_id} saved document as object: {raw_file_hash}")
|
||||
logger.info(f"Task {task_id} : Creating associated PDF")
|
||||
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
|
||||
document_service.create_pdf(document.id)
|
||||
|
||||
# Step 4: Create the pdf version of the document
|
||||
pdf_file_hash = convert_to_pdf(filepath, raw_file_hash)
|
||||
logger.info(f"Job {task_id} saved PDF with hash: {pdf_file_hash}")
|
||||
|
||||
|
||||
|
||||
# Step 3: Mark job as started
|
||||
|
||||
# Step 4: Create the pdf version of the document
|
||||
pdf_file_path = convert_to_pdf(filepath, settings.get_temp_folder())
|
||||
digest = save_as_object(pdf_file_path)
|
||||
logger.info(f"Job {task_id} internal PDF file created: {digest}")
|
||||
# remove the file from the watch folder
|
||||
os.remove(filepath)
|
||||
|
||||
# Step x: Mark job as completed
|
||||
job_service.mark_job_as_completed(job_id=job.id)
|
||||
logger.info(f"Job {task_id} marked as COMPLETED")
|
||||
logger.info(f"Task {task_id} marked as COMPLETED")
|
||||
|
||||
return {
|
||||
"task_id": task_id,
|
||||
@@ -99,6 +90,11 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
logger.info(f"Job {task_id} marked as FAILED")
|
||||
else:
|
||||
logger.error(f"Failed to process {filepath}. error = {str(e)}")
|
||||
|
||||
if document is not None:
|
||||
document_service.move_to_errors(document.id, filepath)
|
||||
logger.info(f"Moved file {filepath} to errors/{document.id}")
|
||||
|
||||
except Exception as job_error:
|
||||
logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
|
||||
|
||||
|
||||
@@ -41,15 +41,10 @@ celery_app.conf.update(
|
||||
|
||||
def global_init(**kwargs):
|
||||
"""Initialize global variables."""
|
||||
logger.info(f"{'*' * 20}")
|
||||
logger.info(f"{'*' * 45}")
|
||||
logger.info(f"{'--' * 5}" + " Starting MyDocManager worker " + f"{'--' * 5}")
|
||||
logger.info(f"{'*' * 20}")
|
||||
tmp_folder = settings.get_temp_folder()
|
||||
if not os.path.exists(tmp_folder):
|
||||
logger.info(f"Creating temporary folder: {tmp_folder}")
|
||||
os.makedirs(tmp_folder)
|
||||
else:
|
||||
logger.info(f"Temporary folder already exists: {os.path.abspath(tmp_folder)}")
|
||||
logger.info(f"{'*' * 45}")
|
||||
|
||||
|
||||
global_init()
|
||||
|
||||
|
||||
@@ -568,3 +568,137 @@ class TestFileTypeDetection:
|
||||
"""Test unsupported file type raises ValueError."""
|
||||
with pytest.raises(ValueError, match="Unsupported file type"):
|
||||
document_service._detect_file_type("/path/to/document.xyz")
|
||||
|
||||
|
||||
class TestCreatePdf:
|
||||
"""Tests for create_pdf method."""
|
||||
|
||||
@patch('app.services.document_service.convert_to_pdf')
|
||||
@patch('app.services.document_service.magic.from_buffer')
|
||||
def test_i_can_create_pdf_successfully(
|
||||
self,
|
||||
mock_magic,
|
||||
mock_convert_to_pdf,
|
||||
document_service,
|
||||
sample_file_bytes
|
||||
):
|
||||
"""Test creating PDF from an existing document."""
|
||||
# Setup
|
||||
mock_magic.return_value = "text/plain"
|
||||
|
||||
# Create a document first
|
||||
created_doc = document_service.create_document(
|
||||
"/test/test.txt",
|
||||
sample_file_bytes,
|
||||
"utf-8"
|
||||
)
|
||||
|
||||
# Mock the PDF conversion
|
||||
pdf_path = os.path.join(document_service.temp_folder, "converted.pdf")
|
||||
mock_convert_to_pdf.return_value = pdf_path
|
||||
|
||||
# Write a sample PDF file that the conversion would create
|
||||
pdf_content = b"This is PDF content"
|
||||
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
|
||||
with open(pdf_path, "wb") as f:
|
||||
f.write(pdf_content)
|
||||
|
||||
# Execute
|
||||
result = document_service.create_pdf(created_doc.id)
|
||||
|
||||
# Verify
|
||||
assert result is True
|
||||
|
||||
# Get the updated document
|
||||
updated_doc = document_service.get_document_by_id(created_doc.id)
|
||||
assert updated_doc.pdf_file_hash is not None
|
||||
|
||||
# Verify the PDF content was saved
|
||||
pdf_hash = document_service._calculate_file_hash(pdf_content)
|
||||
assert updated_doc.pdf_file_hash == pdf_hash
|
||||
|
||||
# Verify convert_to_pdf was called with correct arguments
|
||||
doc_path = document_service._get_document_path(created_doc.file_hash)
|
||||
mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
|
||||
|
||||
# Verify content exists on disk
|
||||
validate_file_saved(document_service, pdf_hash, pdf_content)
|
||||
|
||||
# Verify PDF hash was added to document
|
||||
updated_doc = document_service.get_document_by_id(created_doc.id)
|
||||
pdf_hash = document_service._calculate_file_hash(pdf_content)
|
||||
assert updated_doc.pdf_file_hash == pdf_hash
|
||||
|
||||
@patch('app.services.document_service.convert_to_pdf')
|
||||
@patch('app.services.document_service.magic.from_buffer')
|
||||
def test_i_can_reuse_existing_pdf(
|
||||
self,
|
||||
mock_magic,
|
||||
mock_convert_to_pdf,
|
||||
document_service,
|
||||
sample_file_bytes
|
||||
):
|
||||
"""Test that if PDF already exists, it doesn't recreate it."""
|
||||
# Setup
|
||||
mock_magic.return_value = "text/plain"
|
||||
|
||||
# Create a document first
|
||||
created_doc = document_service.create_document(
|
||||
"/test/test.txt",
|
||||
sample_file_bytes,
|
||||
"utf-8"
|
||||
)
|
||||
|
||||
# Create a fake PDF file and update the document
|
||||
pdf_content = b"This is PDF content"
|
||||
pdf_hash = document_service._calculate_file_hash(pdf_content)
|
||||
document_service.save_content_if_needed(pdf_hash, pdf_content)
|
||||
document_service.update_document(created_doc.id, {"pdf_file_hash": pdf_hash})
|
||||
|
||||
# Execute
|
||||
result = document_service.create_pdf(created_doc.id)
|
||||
|
||||
# Verify
|
||||
assert result is True
|
||||
|
||||
# Verify convert_to_pdf was NOT called
|
||||
mock_convert_to_pdf.assert_not_called()
|
||||
|
||||
def test_i_cannot_create_pdf_for_nonexistent_document(
|
||||
self,
|
||||
document_service
|
||||
):
|
||||
"""Test behavior when document ID doesn't exist."""
|
||||
# Execute with random ObjectId
|
||||
result = document_service.create_pdf(ObjectId())
|
||||
|
||||
# Verify
|
||||
assert result is False
|
||||
|
||||
@patch('app.services.document_service.magic.from_buffer')
|
||||
def test_i_cannot_create_pdf_when_file_content_missing(
|
||||
self,
|
||||
mock_magic,
|
||||
document_service,
|
||||
sample_file_bytes
|
||||
):
|
||||
"""Test behavior when file content doesn't exist."""
|
||||
# Setup
|
||||
mock_magic.return_value = "text/plain"
|
||||
|
||||
# Create a document
|
||||
created_doc = document_service.create_document(
|
||||
"/test/test.txt",
|
||||
sample_file_bytes,
|
||||
"utf-8"
|
||||
)
|
||||
|
||||
# Simulate missing content by removing file
|
||||
file_path = document_service._get_document_path(created_doc.file_hash)
|
||||
os.remove(file_path)
|
||||
|
||||
# Execute
|
||||
result = document_service.create_pdf(created_doc.id)
|
||||
|
||||
# Verify
|
||||
assert result is False
|
||||
|
||||
@@ -417,6 +417,25 @@ class TestUpdateStatus:
|
||||
# Verify exception details
|
||||
assert exc_info.value.current_status == ProcessingStatus.FAILED
|
||||
assert exc_info.value.target_status == ProcessingStatus.FAILED
|
||||
|
||||
def test_i_can_update_job_status(
|
||||
self,
|
||||
job_service,
|
||||
sample_document_id,
|
||||
sample_task_id
|
||||
):
|
||||
"""Test that failed job cannot be marked as failed again."""
|
||||
# Create, start, and fail a job
|
||||
created_job = job_service.create_job(sample_document_id, sample_task_id)
|
||||
job_service.mark_job_as_started(created_job.id)
|
||||
|
||||
# Execute without error message
|
||||
result = job_service.update_job_status(created_job.id, ProcessingStatus.SAVING_OBJECT)
|
||||
|
||||
# Verify status transition
|
||||
assert result is not None
|
||||
assert result.status == ProcessingStatus.SAVING_OBJECT
|
||||
assert result.error_message is None
|
||||
|
||||
|
||||
class TestDeleteJob:
|
||||
|
||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tasks.common.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
|
||||
from app.utils.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -20,10 +20,10 @@ def test_i_can_convert_text_to_pdf(temp_dir):
|
||||
input_txt.write_text("Hello World!\nThis is a test.")
|
||||
|
||||
converter = TextToPdfConverter(str(input_txt), output_dir=temp_dir)
|
||||
output_pdf = converter.convert()
|
||||
converter.convert()
|
||||
|
||||
assert Path(output_pdf).exists()
|
||||
assert output_pdf.endswith(".pdf")
|
||||
assert Path(converter.output_path).exists()
|
||||
assert str(converter.output_path).endswith(".pdf")
|
||||
|
||||
|
||||
def test_i_can_convert_image_to_pdf(temp_dir):
|
||||
@@ -34,10 +34,10 @@ def test_i_can_convert_image_to_pdf(temp_dir):
|
||||
image.save(input_img)
|
||||
|
||||
converter = ImageToPdfConverter(str(input_img), output_dir=temp_dir)
|
||||
output_pdf = converter.convert()
|
||||
converter.convert()
|
||||
|
||||
assert Path(output_pdf).exists()
|
||||
assert output_pdf.endswith(".pdf")
|
||||
assert Path(converter.output_path).exists()
|
||||
assert str(converter.output_path).endswith(".pdf")
|
||||
|
||||
|
||||
def test_i_can_convert_word_to_pdf(temp_dir):
|
||||
@@ -49,7 +49,7 @@ def test_i_can_convert_word_to_pdf(temp_dir):
|
||||
doc.save(input_docx)
|
||||
|
||||
converter = WordToPdfConverter(str(input_docx), output_dir=temp_dir)
|
||||
output_pdf = converter.convert()
|
||||
converter.convert()
|
||||
|
||||
assert Path(output_pdf).exists()
|
||||
assert output_pdf.endswith(".pdf")
|
||||
assert Path(converter.output_path).exists()
|
||||
assert str(converter.output_path).endswith(".pdf")
|
||||
Reference in New Issue
Block a user