I can put a new file and create the associated PDF
@@ -40,6 +40,8 @@ services:
       - ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks
       - ./volumes/watched_files:/watched_files
       - ./volumes/objects:/objects
+      - ./volumes/errors:/errors
+      - ./volumes/ignored:/ignored
     depends_on:
       - redis
       - mongodb
@@ -62,6 +64,8 @@ services:
       - ./src/file-processor/app:/app/app # <- Added: shared access file-processor app
       - ./volumes/watched_files:/watched_files
       - ./volumes/objects:/objects
+      - ./volumes/errors:/errors
+      - ./volumes/ignored:/ignored
     depends_on:
      - redis
      - mongodb
@@ -106,3 +106,13 @@ def get_watch_folder() -> str:
 def get_temp_folder() -> str:
     """Directory to store temporary files"""
     return os.getenv("TEMP_DIRECTORY", "/tmp")
+
+
+def get_errors_folder() -> str:
+    """Directory to store files that failed processing"""
+    return os.getenv("ERRORS_DIRECTORY", "/errors")
+
+
+def get_ignored_folder() -> str:
+    """Directory to store ignored files"""
+    return os.getenv("IGNORED_DIRECTORY", "/ignored")
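For context, a small usage sketch (hypothetical; run inside the worker container): the defaults line up with the volume mounts added to docker-compose above, and the ERRORS_DIRECTORY / IGNORED_DIRECTORY environment variables override them.

from app.config.settings import get_errors_folder, get_ignored_folder

# With no environment overrides these resolve to the container mount points.
print(get_errors_folder())   # /errors
print(get_ignored_folder())  # /ignored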
@@ -130,6 +130,47 @@ class FileDocumentRepository:
         except PyMongoError:
             return None
 
+    def find_document_with_pdf_hash(self, file_hash: str) -> Optional[FileDocument]:
+        """
+        Find file document by file hash with a pdf_file_hash set (not None).
+
+        Args:
+            file_hash (str): SHA256 hash of file content
+
+        Returns:
+            FileDocument or None: File document if found, None otherwise
+        """
+        try:
+            file_doc = self.collection.find_one({"file_hash": file_hash,
+                                                 "pdf_file_hash": {"$ne": None}})
+            if file_doc:
+                return FileDocument(**file_doc)
+            return None
+
+        except PyMongoError:
+            return None
+
+    def find_same_document(self, filename: str, file_hash: str):
+        """
+        Find a document with the same filename and the same file hash
+
+        Args:
+            filename (str): Name of the file
+            file_hash (str): SHA256 hash of file content
+
+        Returns:
+            FileDocument or None: File document if found, None otherwise
+        """
+        try:
+            file_doc = self.collection.find_one({"file_hash": file_hash,
+                                                 "filename": filename})
+            if file_doc:
+                return FileDocument(**file_doc)
+            return None
+
+        except PyMongoError:
+            return None
+
     def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
         """
         Find file document by exact filepath.
@@ -30,7 +30,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
     dispatching Celery tasks, and managing processing jobs.
     """
 
-    SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
+    SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx', '.jpg', '.png', '.jpeg'}
 
     def __init__(self, document_service: DocumentService, job_service: JobService):
         """
@@ -59,6 +59,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
 
         if file_extension not in self.SUPPORTED_EXTENSIONS:
             logger.info(f"Ignoring unsupported file type: {filepath}")
+            self.document_service.move_to_ignored(filepath, "unsupported file type")
             return
 
         logger.info(f"Processing new file: {filepath}")
@@ -49,6 +49,7 @@ class FileDocument(BaseModel):
     metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
     detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
     file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
+    pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
     encoding: str = Field(default="utf-8", description="Character encoding for text files")
     file_size: int = Field(..., ge=0, description="File size in bytes")
     mime_type: str = Field(..., description="MIME type detected")
@@ -6,7 +6,9 @@ while maintaining data consistency through MongoDB transactions.
 """
 
 import hashlib
+import logging
 import os
+import shutil
 from datetime import datetime
 from pathlib import Path
 from typing import List, Optional, Dict, Any
@@ -14,13 +16,16 @@ from typing import List, Optional, Dict, Any
 import magic
 from pymongo.errors import PyMongoError
 
-from app.config.settings import get_objects_folder
+from app.config.settings import get_objects_folder, get_temp_folder, get_errors_folder, get_ignored_folder
 from app.database.repositories.document_repository import FileDocumentRepository
 from app.models.document import (
     FileDocument,
     FileType,
 )
 from app.models.types import PyObjectId
+from app.utils.pdf_converter import convert_to_pdf
+
+logger = logging.getLogger(__name__)
 
 
 class DocumentService:
@@ -31,7 +36,11 @@ class DocumentService:
     and their content while ensuring data consistency through transactions.
     """
 
-    def __init__(self, database, objects_folder: str = None):
+    def __init__(self, database,
+                 objects_folder: str = None,
+                 temp_folder: str = None,
+                 errors_folder: str = None,
+                 ignored_folder: str = None):
         """
         Initialize the document service with repository dependencies.
@@ -43,6 +52,9 @@ class DocumentService:
         self.db = database
         self.document_repository = FileDocumentRepository(self.db)
         self.objects_folder = objects_folder or get_objects_folder()
+        self.temp_folder = temp_folder or get_temp_folder()
+        self.errors_folder = errors_folder or get_errors_folder()
+        self.ignored_folder = ignored_folder or get_ignored_folder()
 
     def initialize(self):
         self.document_repository.initialize()
@@ -117,6 +129,39 @@ class DocumentService:
 
         return path.read_bytes()
 
+    @staticmethod
+    def _get_safe_path(file_path):
+        """
+        If the path already exists, add a suffix to the filename.
+        Increment the suffix until a safe path is found.
+        :param file_path:
+        :return:
+        """
+        path = Path(file_path)
+
+        # If the path doesn't exist, return it as is
+        if not path.exists():
+            return file_path
+
+        # Split the filename and extension
+        stem = path.stem
+        suffix = path.suffix
+        directory = path.parent
+
+        # Try incrementing numbers until a unique path is found
+        counter = 1
+        while True:
+            # Create new filename with counter
+            new_filename = f"{stem}_{counter}{suffix}"
+            new_path = os.path.join(directory, new_filename)
+
+            # Check if this new path exists
+            if not os.path.exists(new_path):
+                return new_path
+
+            # Increment counter for next attempt
+            counter += 1
+
     def _get_document_path(self, file_hash):
         """
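A brief illustration of the collision handling added in _get_safe_path (hypothetical paths; the helper is a staticmethod, so no database is needed):

from app.services.document_service import DocumentService

# If /errors/report.pdf does not exist, the path is returned unchanged.
# If it exists, the helper returns /errors/report_1.pdf, then _2, and so on.
safe_path = DocumentService._get_safe_path("/errors/report.pdf")
print(safe_path)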
@@ -125,6 +170,9 @@ class DocumentService:
         """
         return os.path.join(self.objects_folder, file_hash[:24], file_hash)
 
+    def exists(self, file_hash):
+        return os.path.exists(self._get_document_path(file_hash))
+
     def save_content_if_needed(self, file_hash, content: bytes):
         target_path = self._get_document_path(file_hash)
         if os.path.exists(target_path):
@@ -136,6 +184,18 @@ class DocumentService:
         with open(target_path, "wb") as f:
             f.write(content)
 
+    def move_to_errors(self, document_id, file_path):
+        logger.info(f"Moving file {file_path} to error folder")
+        error_file_name = f"{document_id}_{os.path.basename(file_path)}"
+        error_file_path = self._get_safe_path(os.path.join(self.errors_folder, error_file_name))
+        shutil.move(file_path, error_file_path)
+
+    def move_to_ignored(self, file_path, reason="Unknown"):
+        logger.info(f"Moving file {file_path} to ignored folder")
+        ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path)
+        ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
+        shutil.move(file_path, ignored_file_path)
+
     def create_document(
         self,
         file_path: str,
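A hedged sketch of how the two new helpers are meant to be used (hypothetical path; the folder locations fall back to the settings getters added above):

from app.database.connection import get_database
from app.services.document_service import DocumentService

document_service = DocumentService(get_database())

# Parks the file under /ignored with a timestamp and the reason in the new name.
document_service.move_to_ignored("/watched_files/archive.zip", "unsupported file type")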
@@ -171,7 +231,15 @@ class DocumentService:
         detected_at = datetime.now()
 
         try:
+            logger.info(f"Creating Document for {file_path}")
+            # Skip the document if it already exists
+            same_document = self.document_repository.find_same_document(filename, file_hash)
+            if same_document is not None:
+                logger.info(f" Document with same hash already exists. Skipping...")
+                self.move_to_ignored(file_path, f"already exists ({same_document.id})")
+
             self.save_content_if_needed(file_hash, file_bytes)
+            logger.info(f" Saved content to {self._get_document_path(file_hash)}")
 
             # Create FileDocument
             file_data = FileDocument(
@@ -188,6 +256,7 @@ class DocumentService:
             )
 
             created_file = self.document_repository.create_document(file_data)
+            logger.info(f" Created document with id '{created_file.id}'")
 
             return created_file
@@ -195,6 +264,50 @@ class DocumentService:
             # Transaction will automatically rollback if supported
             raise PyMongoError(f"Failed to create document: {str(e)}")
 
+    def create_pdf(self, document_id: PyObjectId):
+        """
+        For all files, a controlled pdf version will be created for standard visualization and action
+        :return:
+        """
+        logger.info(f"Creating PDF document for {document_id}")
+        document = self.get_document_by_id(document_id)
+        if document is None:
+            logger.error(f" Document not found")
+            raise ValueError(f"Document {document_id} not found")
+
+        # try to find another document that has the same hash
+        document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
+
+        # the pdf will be created only if it does not exist yet
+        if (document_with_same_hash is not None and
+                document_with_same_hash.pdf_file_hash and
+                self.exists(document_with_same_hash.pdf_file_hash)):
+            logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
+            self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
+            return True
+
+        # get the content of the file
+        logger.info(f" No document with same hash found and valid pdf found. Will create new pdf")
+        file_bytes = self.get_document_content_by_hash(document.file_hash)
+        if file_bytes is None:
+            logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
+            return False
+
+        # create the pdf file
+        temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
+        pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file))
+        self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file))
+        logger.info(f" Created new pdf file with hash {pdf_file_hash}")
+
+        # remove the temporary file
+        os.remove(temp_pdf_file)
+        logger.info(f" Removed temporary pdf file {temp_pdf_file}")
+
+        # update the document
+        self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
+
+        return True
+
     def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
         """
         Retrieve a document by its ID.
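Putting the new pieces together, a minimal end-to-end sketch (assumes MongoDB is reachable and the path below points at a real file; error handling omitted):

from app.database.connection import get_database
from app.services.document_service import DocumentService

document_service = DocumentService(get_database())
document_service.initialize()

# Store the raw file as a content-addressed object and register it in MongoDB,
# then create (or reuse) the associated PDF and record its hash on the document.
document = document_service.create_document("/watched_files/report.docx")
document_service.create_pdf(document.id)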
@@ -219,6 +332,9 @@ class DocumentService:
         """
         return self.document_repository.find_document_by_hash(file_hash)
 
+    def get_document_with_pdf_hash(self, file_hash) -> Optional[FileDocument]:
+        return self.document_repository.find_document_with_pdf_hash(file_hash)
+
     def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
         """
         Retrieve a document by its file path.
@@ -111,7 +111,9 @@ class JobService:
         current_job = self.repository.find_job_by_id(job_id)
 
         # Validate status transition
-        if current_job.status != ProcessingStatus.PROCESSING:
+        if current_job.status in (ProcessingStatus.PENDING,
+                                  ProcessingStatus.COMPLETED,
+                                  ProcessingStatus.FAILED):
             raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
 
         # Update status
@@ -141,7 +143,7 @@ class JobService:
         current_job = self.repository.find_job_by_id(job_id)
 
         # Validate status transition
-        if current_job.status != ProcessingStatus.PROCESSING:
+        if current_job.status in (ProcessingStatus.PENDING, ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
             raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
 
         # Update status with error message
@@ -151,6 +153,11 @@ class JobService:
             error_message
         )
 
+    def update_job_status(self, job_id: PyObjectId,
+                          status: ProcessingStatus,
+                          error_message: str = None) -> ProcessingJob:
+        return self.repository.update_job_status(job_id, status, error_message)
+
     def delete_job(self, job_id: PyObjectId) -> bool:
         """
         Delete a job from the database.
@@ -20,12 +20,19 @@ def detect_file_type(file_path: str) -> str:
         UnsupportedFileTypeError: If file type is not supported.
     """
     mime = magic.from_file(file_path, mime=True)
+    extension = Path(file_path).suffix
     if mime.startswith("text/"):
         return "text"
     elif mime.startswith("image/"):
         return "image"
     elif mime in ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",):
         return "word"
+    elif mime == "application/pdf":
+        return "pdf"
+    elif mime == "application/vnd.ms-powerpoint":
+        return "powerpoint"
+    elif mime == "application/octet-stream" and extension in (".jpg", ".jpeg", ".png", ".gif"):
+        return "image"
     else:
         raise UnsupportedFileTypeError(f"Unsupported file type: {mime}")
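Roughly what the extended detection is expected to return (illustrative only; the import path is an assumption, and actual results depend on what libmagic reports):

from app.utils.file_type import detect_file_type  # module path assumed

print(detect_file_type("/watched_files/scan.pdf"))    # "pdf"
print(detect_file_type("/watched_files/slides.ppt"))  # "powerpoint"
print(detect_file_type("/watched_files/photo.jpg"))   # "image", even when magic reports application/octet-stream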
@@ -6,14 +6,14 @@ and update processing job statuses throughout the task lifecycle.
 """
 
 import logging
+import os
 from typing import Any, Dict
 
 from app.config import settings
 from app.database.connection import get_database
+from app.models.job import ProcessingStatus
 from app.services.document_service import DocumentService
 from app.services.job_service import JobService
-from tasks.common.document_utils import save_as_object
-from tasks.common.pdf_converter import convert_to_pdf
 from tasks.main import celery_app
 
 logger = logging.getLogger(__name__)
@@ -26,7 +26,8 @@ def get_services():
     return document_service, job_service
 
 
-@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
+#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
+@celery_app.task(bind=True)
 def process_document(self, filepath: str) -> Dict[str, Any]:
     """
     Process a document file and extract its content.
@@ -46,41 +47,31 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
     Raises:
         Exception: Any processing error (will trigger retry)
     """
     task_id = self.request.id
     logger.info(f"Starting document processing task {task_id} for file: {filepath}")
 
     # get services
     document_service, job_service = get_services()
 
     job = None
+    document = None
     try:
         # Step 1: Create the document and a new job record for the document
         document = document_service.create_document(filepath)
         job = job_service.create_job(task_id=task_id, document_id=document.id)
         job_service.mark_job_as_started(job_id=job.id)
-        logger.info(f"Task {task_id} created for document {document.id} with file path: {filepath} and job id: {job.id}")
+        logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
 
-        logger.info(f"Job {task_id} marked as PROCESSING")
-        raw_file_hash = save_as_object(filepath)
-        logger.info(f"Job {task_id} saved document as object: {raw_file_hash}")
+        logger.info(f"Task {task_id} : Creating associated PDF")
+        job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
+        document_service.create_pdf(document.id)
 
-        # Step 4: Create the pdf version of the document
-        pdf_file_hash = convert_to_pdf(filepath, raw_file_hash)
-        logger.info(f"Job {task_id} saved PDF with hash: {pdf_file_hash}")
-
-
-
-        # Step 3: Mark job as started
-
-        # Step 4: Create the pdf version of the document
-        pdf_file_path = convert_to_pdf(filepath, settings.get_temp_folder())
-        digest = save_as_object(pdf_file_path)
-        logger.info(f"Job {task_id} internal PDF file created: {digest}")
+        # remove the file from the watch folder
+        os.remove(filepath)
 
         # Step x: Mark job as completed
         job_service.mark_job_as_completed(job_id=job.id)
-        logger.info(f"Job {task_id} marked as COMPLETED")
+        logger.info(f"Task {task_id} marked as COMPLETED")
 
         return {
             "task_id": task_id,
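For orientation, how the reworked task would typically be queued (the task module path is an assumption; the worker then creates the document, generates the PDF, and removes the file from the watch folder):

from tasks.document_tasks import process_document  # module path assumed

# Fire-and-forget: a Celery worker picks this up from the redis broker.
process_document.delay("/watched_files/incoming/report.docx")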
@@ -99,6 +90,11 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
                 logger.info(f"Job {task_id} marked as FAILED")
             else:
                 logger.error(f"Failed to process {filepath}. error = {str(e)}")
 
+            if document is not None:
+                document_service.move_to_errors(document.id, filepath)
+                logger.info(f"Moved file {filepath} to errors/{document.id}")
+
         except Exception as job_error:
             logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
@@ -41,15 +41,10 @@ celery_app.conf.update(
 
 def global_init(**kwargs):
     """Initialize global variables."""
-    logger.info(f"{'*' * 20}")
+    logger.info(f"{'*' * 45}")
     logger.info(f"{'--' * 5}" + " Starting MyDocManager worker " + f"{'--' * 5}")
-    logger.info(f"{'*' * 20}")
-    tmp_folder = settings.get_temp_folder()
-    if not os.path.exists(tmp_folder):
-        logger.info(f"Creating temporary folder: {tmp_folder}")
-        os.makedirs(tmp_folder)
-    else:
-        logger.info(f"Temporary folder already exists: {os.path.abspath(tmp_folder)}")
+    logger.info(f"{'*' * 45}")
 
 
 global_init()
@@ -568,3 +568,137 @@ class TestFileTypeDetection:
         """Test unsupported file type raises ValueError."""
         with pytest.raises(ValueError, match="Unsupported file type"):
             document_service._detect_file_type("/path/to/document.xyz")
+
+
+class TestCreatePdf:
+    """Tests for create_pdf method."""
+
+    @patch('app.services.document_service.convert_to_pdf')
+    @patch('app.services.document_service.magic.from_buffer')
+    def test_i_can_create_pdf_successfully(
+            self,
+            mock_magic,
+            mock_convert_to_pdf,
+            document_service,
+            sample_file_bytes
+    ):
+        """Test creating PDF from an existing document."""
+        # Setup
+        mock_magic.return_value = "text/plain"
+
+        # Create a document first
+        created_doc = document_service.create_document(
+            "/test/test.txt",
+            sample_file_bytes,
+            "utf-8"
+        )
+
+        # Mock the PDF conversion
+        pdf_path = os.path.join(document_service.temp_folder, "converted.pdf")
+        mock_convert_to_pdf.return_value = pdf_path
+
+        # Write a sample PDF file that the conversion would create
+        pdf_content = b"This is PDF content"
+        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
+        with open(pdf_path, "wb") as f:
+            f.write(pdf_content)
+
+        # Execute
+        result = document_service.create_pdf(created_doc.id)
+
+        # Verify
+        assert result is True
+
+        # Get the updated document
+        updated_doc = document_service.get_document_by_id(created_doc.id)
+        assert updated_doc.pdf_file_hash is not None
+
+        # Verify the PDF content was saved
+        pdf_hash = document_service._calculate_file_hash(pdf_content)
+        assert updated_doc.pdf_file_hash == pdf_hash
+
+        # Verify convert_to_pdf was called with correct arguments
+        doc_path = document_service._get_document_path(created_doc.file_hash)
+        mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
+
+        # Verify content exists on disk
+        validate_file_saved(document_service, pdf_hash, pdf_content)
+
+        # Verify PDF hash was added to document
+        updated_doc = document_service.get_document_by_id(created_doc.id)
+        pdf_hash = document_service._calculate_file_hash(pdf_content)
+        assert updated_doc.pdf_file_hash == pdf_hash
+
+    @patch('app.services.document_service.convert_to_pdf')
+    @patch('app.services.document_service.magic.from_buffer')
+    def test_i_can_reuse_existing_pdf(
+            self,
+            mock_magic,
+            mock_convert_to_pdf,
+            document_service,
+            sample_file_bytes
+    ):
+        """Test that if PDF already exists, it doesn't recreate it."""
+        # Setup
+        mock_magic.return_value = "text/plain"
+
+        # Create a document first
+        created_doc = document_service.create_document(
+            "/test/test.txt",
+            sample_file_bytes,
+            "utf-8"
+        )
+
+        # Create a fake PDF file and update the document
+        pdf_content = b"This is PDF content"
+        pdf_hash = document_service._calculate_file_hash(pdf_content)
+        document_service.save_content_if_needed(pdf_hash, pdf_content)
+        document_service.update_document(created_doc.id, {"pdf_file_hash": pdf_hash})
+
+        # Execute
+        result = document_service.create_pdf(created_doc.id)
+
+        # Verify
+        assert result is True
+
+        # Verify convert_to_pdf was NOT called
+        mock_convert_to_pdf.assert_not_called()
+
+    def test_i_cannot_create_pdf_for_nonexistent_document(
+            self,
+            document_service
+    ):
+        """Test behavior when document ID doesn't exist."""
+        # Execute with random ObjectId
+        result = document_service.create_pdf(ObjectId())
+
+        # Verify
+        assert result is False
+
+    @patch('app.services.document_service.magic.from_buffer')
+    def test_i_cannot_create_pdf_when_file_content_missing(
+            self,
+            mock_magic,
+            document_service,
+            sample_file_bytes
+    ):
+        """Test behavior when file content doesn't exist."""
+        # Setup
+        mock_magic.return_value = "text/plain"
+
+        # Create a document
+        created_doc = document_service.create_document(
+            "/test/test.txt",
+            sample_file_bytes,
+            "utf-8"
+        )
+
+        # Simulate missing content by removing file
+        file_path = document_service._get_document_path(created_doc.file_hash)
+        os.remove(file_path)
+
+        # Execute
+        result = document_service.create_pdf(created_doc.id)
+
+        # Verify
+        assert result is False
@@ -417,6 +417,25 @@ class TestUpdateStatus:
         # Verify exception details
         assert exc_info.value.current_status == ProcessingStatus.FAILED
         assert exc_info.value.target_status == ProcessingStatus.FAILED
 
+    def test_i_can_update_job_status(
+            self,
+            job_service,
+            sample_document_id,
+            sample_task_id
+    ):
+        """Test that a job status can be updated directly."""
+        # Create and start a job
+        created_job = job_service.create_job(sample_document_id, sample_task_id)
+        job_service.mark_job_as_started(created_job.id)
+
+        # Execute without error message
+        result = job_service.update_job_status(created_job.id, ProcessingStatus.SAVING_OBJECT)
+
+        # Verify status transition
+        assert result is not None
+        assert result.status == ProcessingStatus.SAVING_OBJECT
+        assert result.error_message is None
+
+
 class TestDeleteJob:
@@ -4,7 +4,7 @@ from pathlib import Path
 
 import pytest
 
-from tasks.common.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
+from app.utils.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
 
 
 @pytest.fixture
@@ -20,10 +20,10 @@ def test_i_can_convert_text_to_pdf(temp_dir):
     input_txt.write_text("Hello World!\nThis is a test.")
 
     converter = TextToPdfConverter(str(input_txt), output_dir=temp_dir)
-    output_pdf = converter.convert()
+    converter.convert()
 
-    assert Path(output_pdf).exists()
-    assert output_pdf.endswith(".pdf")
+    assert Path(converter.output_path).exists()
+    assert str(converter.output_path).endswith(".pdf")
 
 
 def test_i_can_convert_image_to_pdf(temp_dir):
@@ -34,10 +34,10 @@ def test_i_can_convert_image_to_pdf(temp_dir):
     image.save(input_img)
 
     converter = ImageToPdfConverter(str(input_img), output_dir=temp_dir)
-    output_pdf = converter.convert()
+    converter.convert()
 
-    assert Path(output_pdf).exists()
-    assert output_pdf.endswith(".pdf")
+    assert Path(converter.output_path).exists()
+    assert str(converter.output_path).endswith(".pdf")
 
 
 def test_i_can_convert_word_to_pdf(temp_dir):
@@ -49,7 +49,7 @@ def test_i_can_convert_word_to_pdf(temp_dir):
     doc.save(input_docx)
 
     converter = WordToPdfConverter(str(input_docx), output_dir=temp_dir)
-    output_pdf = converter.convert()
+    converter.convert()
 
-    assert Path(output_pdf).exists()
-    assert output_pdf.endswith(".pdf")
+    assert Path(converter.output_path).exists()
+    assert str(converter.output_path).endswith(".pdf")