I can put a new file and create the associated pdf

This commit is contained in:
2025-10-05 23:54:59 +02:00
parent bd52f2d296
commit 8ae9754fde
14 changed files with 376 additions and 45 deletions

View File

@@ -40,6 +40,8 @@ services:
- ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks - ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks
- ./volumes/watched_files:/watched_files - ./volumes/watched_files:/watched_files
- ./volumes/objects:/objects - ./volumes/objects:/objects
- ./volumes/errors:/errors
- ./volumes/ignored:/ignored
depends_on: depends_on:
- redis - redis
- mongodb - mongodb
@@ -62,6 +64,8 @@ services:
- ./src/file-processor/app:/app/app # <- Added: shared access file-processor app - ./src/file-processor/app:/app/app # <- Added: shared access file-processor app
- ./volumes/watched_files:/watched_files - ./volumes/watched_files:/watched_files
- ./volumes/objects:/objects - ./volumes/objects:/objects
- ./volumes/errors:/errors
- ./volumes/ignored:/ignored
depends_on: depends_on:
- redis - redis
- mongodb - mongodb

View File

@@ -106,3 +106,13 @@ def get_watch_folder() -> str:
def get_temp_folder() -> str: def get_temp_folder() -> str:
"""Directory to store temporary files""" """Directory to store temporary files"""
return os.getenv("TEMP_DIRECTORY", "/tmp") return os.getenv("TEMP_DIRECTORY", "/tmp")
def get_errors_folder() -> str:
    """Directory where files that failed processing are moved.

    Reads the ``ERRORS_DIRECTORY`` environment variable and falls back to
    ``/errors`` (the path mounted in docker-compose) when it is not set.
    """
    return os.getenv("ERRORS_DIRECTORY", "/errors")
def get_ignored_folder() -> str:
    """Directory where unsupported or duplicate files are moved.

    Reads the ``IGNORED_DIRECTORY`` environment variable and falls back to
    ``/ignored`` (the path mounted in docker-compose) when it is not set.
    """
    return os.getenv("IGNORED_DIRECTORY", "/ignored")

View File

@@ -130,6 +130,47 @@ class FileDocumentRepository:
except PyMongoError: except PyMongoError:
return None return None
def find_document_with_pdf_hash(self, file_hash: str) -> Optional[FileDocument]:
    """
    Find a file document matching ``file_hash`` whose ``pdf_file_hash`` is set (not None).

    Args:
        file_hash (str): SHA256 hash of file content

    Returns:
        FileDocument or None: File document if found, None otherwise
    """
    query = {"file_hash": file_hash, "pdf_file_hash": {"$ne": None}}
    try:
        raw_doc = self.collection.find_one(query)
        # Only wrap into the model when Mongo actually returned a document.
        return FileDocument(**raw_doc) if raw_doc else None
    except PyMongoError:
        # Consistent with the repository's other finders: swallow driver
        # errors and report "not found".
        return None
def find_same_document(self, filename: str, file_hash: str):
    """
    Find the document that has both this filename and this file hash.

    Args:
        filename (str): original file name to match
        file_hash (str): SHA256 hash of file content

    Returns:
        FileDocument or None: File document if found, None otherwise
    """
    criteria = {"file_hash": file_hash, "filename": filename}
    try:
        raw_doc = self.collection.find_one(criteria)
        return FileDocument(**raw_doc) if raw_doc else None
    except PyMongoError:
        # Driver failures are reported as "not found", matching the
        # other finder methods in this repository.
        return None
def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
""" """
Find file document by exact filepath. Find file document by exact filepath.

View File

@@ -30,7 +30,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
dispatching Celery tasks, and managing processing jobs. dispatching Celery tasks, and managing processing jobs.
""" """
SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'} SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx', '.jpg', '.png', '.jpeg'}
def __init__(self, document_service: DocumentService, job_service: JobService): def __init__(self, document_service: DocumentService, job_service: JobService):
""" """
@@ -59,6 +59,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
if file_extension not in self.SUPPORTED_EXTENSIONS: if file_extension not in self.SUPPORTED_EXTENSIONS:
logger.info(f"Ignoring unsupported file type: {filepath}") logger.info(f"Ignoring unsupported file type: {filepath}")
self.document_service.move_to_ignored(filepath, "unsupported file type")
return return
logger.info(f"Processing new file: {filepath}") logger.info(f"Processing new file: {filepath}")

View File

@@ -49,6 +49,7 @@ class FileDocument(BaseModel):
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata") metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
encoding: str = Field(default="utf-8", description="Character encoding for text files") encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes") file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected") mime_type: str = Field(..., description="MIME type detected")

View File

@@ -6,7 +6,9 @@ while maintaining data consistency through MongoDB transactions.
""" """
import hashlib import hashlib
import logging
import os import os
import shutil
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Optional, Dict, Any from typing import List, Optional, Dict, Any
@@ -14,13 +16,16 @@ from typing import List, Optional, Dict, Any
import magic import magic
from pymongo.errors import PyMongoError from pymongo.errors import PyMongoError
from app.config.settings import get_objects_folder from app.config.settings import get_objects_folder, get_temp_folder, get_errors_folder, get_ignored_folder
from app.database.repositories.document_repository import FileDocumentRepository from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import ( from app.models.document import (
FileDocument, FileDocument,
FileType, FileType,
) )
from app.models.types import PyObjectId from app.models.types import PyObjectId
from app.utils.pdf_converter import convert_to_pdf
logger = logging.getLogger(__name__)
class DocumentService: class DocumentService:
@@ -31,7 +36,11 @@ class DocumentService:
and their content while ensuring data consistency through transactions. and their content while ensuring data consistency through transactions.
""" """
def __init__(self, database, objects_folder: str = None): def __init__(self, database,
objects_folder: str = None,
temp_folder: str = None,
errors_folder: str = None,
ignored_folder: str = None):
""" """
Initialize the document service with repository dependencies. Initialize the document service with repository dependencies.
@@ -43,6 +52,9 @@ class DocumentService:
self.db = database self.db = database
self.document_repository = FileDocumentRepository(self.db) self.document_repository = FileDocumentRepository(self.db)
self.objects_folder = objects_folder or get_objects_folder() self.objects_folder = objects_folder or get_objects_folder()
self.temp_folder = temp_folder or get_temp_folder()
self.errors_folder = errors_folder or get_errors_folder()
self.ignored_folder = ignored_folder or get_ignored_folder()
def initialize(self): def initialize(self):
self.document_repository.initialize() self.document_repository.initialize()
@@ -117,6 +129,39 @@ class DocumentService:
return path.read_bytes() return path.read_bytes()
@staticmethod
def _get_safe_path(file_path):
"""
If the path already exists, add a suffix to the filename.
Increment the suffix until a safe path is found.
:param file_path:
:return:
"""
path = Path(file_path)
# If the path doesn't exist, return it as is
if not path.exists():
return file_path
# Split the filename and extension
stem = path.stem
suffix = path.suffix
directory = path.parent
# Try incrementing numbers until a unique path is found
counter = 1
while True:
# Create new filename with counter
new_filename = f"{stem}_{counter}{suffix}"
new_path = os.path.join(directory, new_filename)
# Check if this new path exists
if not os.path.exists(new_path):
return new_path
# Increment counter for next attempt
counter += 1
def _get_document_path(self, file_hash): def _get_document_path(self, file_hash):
""" """
@@ -125,6 +170,9 @@ class DocumentService:
""" """
return os.path.join(self.objects_folder, file_hash[:24], file_hash) return os.path.join(self.objects_folder, file_hash[:24], file_hash)
def exists(self, file_hash):
    """Return True when the content for ``file_hash`` is already stored on disk."""
    return Path(self._get_document_path(file_hash)).exists()
def save_content_if_needed(self, file_hash, content: bytes): def save_content_if_needed(self, file_hash, content: bytes):
target_path = self._get_document_path(file_hash) target_path = self._get_document_path(file_hash)
if os.path.exists(target_path): if os.path.exists(target_path):
@@ -136,6 +184,18 @@ class DocumentService:
with open(target_path, "wb") as f: with open(target_path, "wb") as f:
f.write(content) f.write(content)
def move_to_errors(self, document_id, file_path):
    """Relocate ``file_path`` into the errors folder, prefixing the name with the document id."""
    logger.info(f"Moving file {file_path} to error folder")
    basename = os.path.basename(file_path)
    # Prefix with the document id so the stored file can be traced back,
    # and let _get_safe_path avoid clobbering an earlier error file.
    destination = self._get_safe_path(os.path.join(self.errors_folder, f"{document_id}_{basename}"))
    shutil.move(file_path, destination)
def move_to_ignored(self, file_path, reason="Unknown"):
    """Relocate ``file_path`` into the ignored folder, tagging the name with a timestamp and reason."""
    logger.info(f"Moving file {file_path} to ignored folder")
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # Timestamp + reason tag make repeated ignores of the same file distinguishable.
    tagged_name = stamp + f"_### {reason} ###_" + os.path.basename(file_path)
    destination = self._get_safe_path(os.path.join(self.ignored_folder, tagged_name))
    shutil.move(file_path, destination)
def create_document( def create_document(
self, self,
file_path: str, file_path: str,
@@ -171,7 +231,15 @@ class DocumentService:
detected_at = datetime.now() detected_at = datetime.now()
try: try:
logger.info(f"Creating Document for {file_path}")
# Skip the document if it already exists — NOTE(review): there is no return/raise after move_to_ignored below, so execution falls through, re-saves the content, and creates a duplicate document record even though the source file was just moved away; confirm a `return same_document` (or raise) is intended here
same_document = self.document_repository.find_same_document(filename, file_hash)
if same_document is not None:
logger.info(f" Document with same hash already exists. Skipping...")
self.move_to_ignored(file_path, f"already exists ({same_document.id})")
self.save_content_if_needed(file_hash, file_bytes) self.save_content_if_needed(file_hash, file_bytes)
logger.info(f" Saved content to {self._get_document_path(file_hash)}")
# Create FileDocument # Create FileDocument
file_data = FileDocument( file_data = FileDocument(
@@ -188,6 +256,7 @@ class DocumentService:
) )
created_file = self.document_repository.create_document(file_data) created_file = self.document_repository.create_document(file_data)
logger.info(f" Created document with id '{created_file.id}'")
return created_file return created_file
@@ -195,6 +264,50 @@ class DocumentService:
# Transaction will automatically rollback if supported # Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to create document: {str(e)}") raise PyMongoError(f"Failed to create document: {str(e)}")
def create_pdf(self, document_id: PyObjectId):
    """
    Create (or reuse) the controlled PDF rendition of a document.

    Every stored file gets a PDF version for standard visualization and
    actions. If another document with the same content hash already has a
    PDF that is present on disk, that PDF is reused instead of converting
    the content again.

    :param document_id: id of the document to render as PDF
    :return: True when the document now references a PDF,
             False when the source content is missing on disk
    :raises ValueError: if no document exists for ``document_id``
    """
    logger.info(f"Creating PDF document for {document_id}")
    document = self.get_document_by_id(document_id)
    if document is None:
        logger.error(" Document not found")
        raise ValueError(f"Document {document_id} not found")

    # Try to reuse the PDF of another document sharing the same content
    # hash — but only when that PDF file is still present on disk.
    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
    if (document_with_same_hash is not None and
            document_with_same_hash.pdf_file_hash and
            self.exists(document_with_same_hash.pdf_file_hash)):
        logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
        self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
        return True

    # Make sure the source content exists before attempting a conversion.
    logger.info(" No document with same hash and valid pdf found. Will create new pdf")
    file_bytes = self.get_document_content_by_hash(document.file_hash)
    if file_bytes is None:
        logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
        return False

    # Convert in the temp folder; always remove the temporary file, even
    # if hashing or storage fails (the original leaked it on exception).
    temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
    try:
        # Read the converted file once and reuse the bytes for both the
        # hash and the object store (the original read it twice).
        pdf_bytes = self._read_file_bytes(temp_pdf_file)
        pdf_file_hash = self._calculate_file_hash(pdf_bytes)
        self.save_content_if_needed(pdf_file_hash, pdf_bytes)
        logger.info(f" Created new pdf file with hash {pdf_file_hash}")
    finally:
        os.remove(temp_pdf_file)
        logger.info(f" Removed temporary pdf file {temp_pdf_file}")

    # Link the stored PDF to the document.
    self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
    return True
def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]: def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
""" """
Retrieve a document by its ID. Retrieve a document by its ID.
@@ -219,6 +332,9 @@ class DocumentService:
""" """
return self.document_repository.find_document_by_hash(file_hash) return self.document_repository.find_document_by_hash(file_hash)
def get_document_with_pdf_hash(self, file_hash) -> Optional[FileDocument]:
    """Return a document with this file hash that already has ``pdf_file_hash`` set, or None.

    Thin delegation to the repository; used to reuse an existing PDF
    instead of converting the same content again.
    """
    return self.document_repository.find_document_with_pdf_hash(file_hash)
def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]: def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
""" """
Retrieve a document by its file path. Retrieve a document by its file path.

View File

@@ -111,7 +111,9 @@ class JobService:
current_job = self.repository.find_job_by_id(job_id) current_job = self.repository.find_job_by_id(job_id)
# Validate status transition # Validate status transition
if current_job.status != ProcessingStatus.PROCESSING: if current_job.status in (ProcessingStatus.PENDING,
ProcessingStatus.COMPLETED,
ProcessingStatus.FAILED):
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED) raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
# Update status # Update status
@@ -141,7 +143,7 @@ class JobService:
current_job = self.repository.find_job_by_id(job_id) current_job = self.repository.find_job_by_id(job_id)
# Validate status transition # Validate status transition
if current_job.status != ProcessingStatus.PROCESSING: if current_job.status in (ProcessingStatus.PENDING, ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED) raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
# Update status with error message # Update status with error message
@@ -151,6 +153,11 @@ class JobService:
error_message error_message
) )
def update_job_status(self, job_id: PyObjectId,
                      status: ProcessingStatus,
                      error_message: str = None) -> ProcessingJob:
    """
    Set a job's status directly, without transition validation.

    Unlike mark_job_as_completed/mark_job_as_failed, this performs no
    status-transition checks — it delegates straight to the repository.

    Args:
        job_id: id of the job to update
        status: new processing status to store
        error_message: optional error detail stored alongside the status

    Returns:
        ProcessingJob: the updated job
    """
    return self.repository.update_job_status(job_id, status, error_message)
def delete_job(self, job_id: PyObjectId) -> bool: def delete_job(self, job_id: PyObjectId) -> bool:
""" """
Delete a job from the database. Delete a job from the database.

View File

@@ -20,12 +20,19 @@ def detect_file_type(file_path: str) -> str:
UnsupportedFileTypeError: If file type is not supported. UnsupportedFileTypeError: If file type is not supported.
""" """
mime = magic.from_file(file_path, mime=True) mime = magic.from_file(file_path, mime=True)
extension = Path(file_path).suffix
if mime.startswith("text/"): if mime.startswith("text/"):
return "text" return "text"
elif mime.startswith("image/"): elif mime.startswith("image/"):
return "image" return "image"
elif mime in ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",): elif mime in ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",):
return "word" return "word"
elif mime == "application/pdf":
return "pdf"
elif mime == "application/vnd.ms-powerpoint":
return "powerpoint"
elif mime == "application/octet-stream" and extension in (".jpg", ".jpeg", ".png", ".gif"):
return "image"
else: else:
raise UnsupportedFileTypeError(f"Unsupported file type: {mime}") raise UnsupportedFileTypeError(f"Unsupported file type: {mime}")

View File

@@ -6,14 +6,14 @@ and update processing job statuses throughout the task lifecycle.
""" """
import logging import logging
import os
from typing import Any, Dict from typing import Any, Dict
from app.config import settings from app.config import settings
from app.database.connection import get_database from app.database.connection import get_database
from app.models.job import ProcessingStatus
from app.services.document_service import DocumentService from app.services.document_service import DocumentService
from app.services.job_service import JobService from app.services.job_service import JobService
from tasks.common.document_utils import save_as_object
from tasks.common.pdf_converter import convert_to_pdf
from tasks.main import celery_app from tasks.main import celery_app
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -26,7 +26,8 @@ def get_services():
return document_service, job_service return document_service, job_service
@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) #@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
@celery_app.task(bind=True)
def process_document(self, filepath: str) -> Dict[str, Any]: def process_document(self, filepath: str) -> Dict[str, Any]:
""" """
Process a document file and extract its content. Process a document file and extract its content.
@@ -53,34 +54,24 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
document_service, job_service = get_services() document_service, job_service = get_services()
job = None job = None
document = None
try: try:
# Step 1: Create the document and a new job record for the document # Step 1: Create the document and a new job record for the document
document = document_service.create_document(filepath) document = document_service.create_document(filepath)
job = job_service.create_job(task_id=task_id, document_id=document.id) job = job_service.create_job(task_id=task_id, document_id=document.id)
job_service.mark_job_as_started(job_id=job.id) job_service.mark_job_as_started(job_id=job.id)
logger.info(f"Task {task_id} created for document {document.id} with file path: {filepath} and job id: {job.id}") logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
logger.info(f"Job {task_id} marked as PROCESSING") logger.info(f"Task {task_id} : Creating associated PDF")
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
document_service.create_pdf(document.id)
raw_file_hash = save_as_object(filepath) # remove the file from the watch folder
logger.info(f"Job {task_id} saved document as object: {raw_file_hash}") os.remove(filepath)
# Step 4: Create the pdf version of the document
pdf_file_hash = convert_to_pdf(filepath, raw_file_hash)
logger.info(f"Job {task_id} saved PDF with hash: {pdf_file_hash}")
# Step 3: Mark job as started
# Step 4: Create the pdf version of the document
pdf_file_path = convert_to_pdf(filepath, settings.get_temp_folder())
digest = save_as_object(pdf_file_path)
logger.info(f"Job {task_id} internal PDF file created: {digest}")
# Step x: Mark job as completed # Step x: Mark job as completed
job_service.mark_job_as_completed(job_id=job.id) job_service.mark_job_as_completed(job_id=job.id)
logger.info(f"Job {task_id} marked as COMPLETED") logger.info(f"Task {task_id} marked as COMPLETED")
return { return {
"task_id": task_id, "task_id": task_id,
@@ -99,6 +90,11 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
logger.info(f"Job {task_id} marked as FAILED") logger.info(f"Job {task_id} marked as FAILED")
else: else:
logger.error(f"Failed to process {filepath}. error = {str(e)}") logger.error(f"Failed to process {filepath}. error = {str(e)}")
if document is not None:
document_service.move_to_errors(document.id, filepath)
logger.info(f"Moved file {filepath} to errors/{document.id}")
except Exception as job_error: except Exception as job_error:
logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}") logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")

View File

@@ -41,15 +41,10 @@ celery_app.conf.update(
def global_init(**kwargs): def global_init(**kwargs):
"""Initialize global variables.""" """Initialize global variables."""
logger.info(f"{'*' * 20}") logger.info(f"{'*' * 45}")
logger.info(f"{'--' * 5}" + " Starting MyDocManager worker " + f"{'--' * 5}") logger.info(f"{'--' * 5}" + " Starting MyDocManager worker " + f"{'--' * 5}")
logger.info(f"{'*' * 20}") logger.info(f"{'*' * 45}")
tmp_folder = settings.get_temp_folder()
if not os.path.exists(tmp_folder):
logger.info(f"Creating temporary folder: {tmp_folder}")
os.makedirs(tmp_folder)
else:
logger.info(f"Temporary folder already exists: {os.path.abspath(tmp_folder)}")
global_init() global_init()

View File

@@ -568,3 +568,137 @@ class TestFileTypeDetection:
"""Test unsupported file type raises ValueError.""" """Test unsupported file type raises ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"): with pytest.raises(ValueError, match="Unsupported file type"):
document_service._detect_file_type("/path/to/document.xyz") document_service._detect_file_type("/path/to/document.xyz")
class TestCreatePdf:
    """Tests for create_pdf method."""

    @patch('app.services.document_service.convert_to_pdf')
    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_create_pdf_successfully(
            self,
            mock_magic,
            mock_convert_to_pdf,
            document_service,
            sample_file_bytes
    ):
        """Test creating PDF from an existing document."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.txt",
            sample_file_bytes,
            "utf-8"
        )

        # Mock the PDF conversion
        pdf_path = os.path.join(document_service.temp_folder, "converted.pdf")
        mock_convert_to_pdf.return_value = pdf_path

        # Write a sample PDF file that the conversion would create
        pdf_content = b"This is PDF content"
        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
        with open(pdf_path, "wb") as f:
            f.write(pdf_content)

        # Execute
        result = document_service.create_pdf(created_doc.id)

        # Verify
        assert result is True

        # Verify the PDF hash was attached to the document
        # (the original test repeated this verification block twice)
        updated_doc = document_service.get_document_by_id(created_doc.id)
        pdf_hash = document_service._calculate_file_hash(pdf_content)
        assert updated_doc.pdf_file_hash == pdf_hash

        # Verify convert_to_pdf was called with correct arguments
        doc_path = document_service._get_document_path(created_doc.file_hash)
        mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)

        # Verify content exists on disk
        validate_file_saved(document_service, pdf_hash, pdf_content)

    @patch('app.services.document_service.convert_to_pdf')
    @patch('app.services.document_service.magic.from_buffer')
    def test_i_can_reuse_existing_pdf(
            self,
            mock_magic,
            mock_convert_to_pdf,
            document_service,
            sample_file_bytes
    ):
        """Test that if PDF already exists, it doesn't recreate it."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Create a document first
        created_doc = document_service.create_document(
            "/test/test.txt",
            sample_file_bytes,
            "utf-8"
        )

        # Create a fake PDF file and update the document
        pdf_content = b"This is PDF content"
        pdf_hash = document_service._calculate_file_hash(pdf_content)
        document_service.save_content_if_needed(pdf_hash, pdf_content)
        document_service.update_document(created_doc.id, {"pdf_file_hash": pdf_hash})

        # Execute
        result = document_service.create_pdf(created_doc.id)

        # Verify
        assert result is True

        # Verify convert_to_pdf was NOT called
        mock_convert_to_pdf.assert_not_called()

    def test_i_cannot_create_pdf_for_nonexistent_document(
            self,
            document_service
    ):
        """Test behavior when document ID doesn't exist."""
        # create_pdf raises ValueError for an unknown document id
        # (the original test asserted `result is False`, which contradicts
        # the implementation and would always fail)
        with pytest.raises(ValueError):
            document_service.create_pdf(ObjectId())

    @patch('app.services.document_service.magic.from_buffer')
    def test_i_cannot_create_pdf_when_file_content_missing(
            self,
            mock_magic,
            document_service,
            sample_file_bytes
    ):
        """Test behavior when file content doesn't exist."""
        # Setup
        mock_magic.return_value = "text/plain"

        # Create a document
        created_doc = document_service.create_document(
            "/test/test.txt",
            sample_file_bytes,
            "utf-8"
        )

        # Simulate missing content by removing the stored object file
        file_path = document_service._get_document_path(created_doc.file_hash)
        os.remove(file_path)

        # Execute
        result = document_service.create_pdf(created_doc.id)

        # Verify
        assert result is False

View File

@@ -418,6 +418,25 @@ class TestUpdateStatus:
assert exc_info.value.current_status == ProcessingStatus.FAILED assert exc_info.value.current_status == ProcessingStatus.FAILED
assert exc_info.value.target_status == ProcessingStatus.FAILED assert exc_info.value.target_status == ProcessingStatus.FAILED
def test_i_can_update_job_status(
        self,
        job_service,
        sample_document_id,
        sample_task_id
):
    """Test that update_job_status sets a status directly, without transition checks."""
    # Create and start a job (it ends up in PROCESSING state)
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)

    # Execute without error message
    result = job_service.update_job_status(created_job.id, ProcessingStatus.SAVING_OBJECT)

    # Verify status transition
    assert result is not None
    assert result.status == ProcessingStatus.SAVING_OBJECT
    assert result.error_message is None
class TestDeleteJob: class TestDeleteJob:
"""Tests for delete_job method.""" """Tests for delete_job method."""

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest import pytest
from tasks.common.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter from app.utils.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
@pytest.fixture @pytest.fixture
@@ -20,10 +20,10 @@ def test_i_can_convert_text_to_pdf(temp_dir):
input_txt.write_text("Hello World!\nThis is a test.") input_txt.write_text("Hello World!\nThis is a test.")
converter = TextToPdfConverter(str(input_txt), output_dir=temp_dir) converter = TextToPdfConverter(str(input_txt), output_dir=temp_dir)
output_pdf = converter.convert() converter.convert()
assert Path(output_pdf).exists() assert Path(converter.output_path).exists()
assert output_pdf.endswith(".pdf") assert str(converter.output_path).endswith(".pdf")
def test_i_can_convert_image_to_pdf(temp_dir): def test_i_can_convert_image_to_pdf(temp_dir):
@@ -34,10 +34,10 @@ def test_i_can_convert_image_to_pdf(temp_dir):
image.save(input_img) image.save(input_img)
converter = ImageToPdfConverter(str(input_img), output_dir=temp_dir) converter = ImageToPdfConverter(str(input_img), output_dir=temp_dir)
output_pdf = converter.convert() converter.convert()
assert Path(output_pdf).exists() assert Path(converter.output_path).exists()
assert output_pdf.endswith(".pdf") assert str(converter.output_path).endswith(".pdf")
def test_i_can_convert_word_to_pdf(temp_dir): def test_i_can_convert_word_to_pdf(temp_dir):
@@ -49,7 +49,7 @@ def test_i_can_convert_word_to_pdf(temp_dir):
doc.save(input_docx) doc.save(input_docx)
converter = WordToPdfConverter(str(input_docx), output_dir=temp_dir) converter = WordToPdfConverter(str(input_docx), output_dir=temp_dir)
output_pdf = converter.convert() converter.convert()
assert Path(output_pdf).exists() assert Path(converter.output_path).exists()
assert output_pdf.endswith(".pdf") assert str(converter.output_path).endswith(".pdf")