Working default workflow (file -> celery -> redis -> worker)
@@ -59,6 +59,7 @@ services:
       - PYTHONPATH=/app
     volumes:
       - ./src/worker:/app
+      - ./src/file-processor/app:/app/app # <- Added: shared access file-processor app
       - ./volumes/watched_files:/watched_files
     depends_on:
       - redis
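Note on the added mount: combined with PYTHONPATH=/app, exposing ./src/file-processor/app at /app/app is what lets the worker container import the file-processor code as the app package (the tasks below rely on app.config, app.database and app.services). A minimal sanity check, assuming this container layout; the script itself is illustrative:

    # Run inside the worker container: both imports resolve from the mounted file-processor code.
    from app.config import settings
    from app.database.connection import get_database

    print(settings.get_objects_folder())  # accessor used by the worker task later in this commit
    print(get_database())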
@@ -6,9 +6,10 @@ in MongoDB with proper error handling and type safety.
 """

 from typing import Optional, List

 from bson import ObjectId
-from pymongo.errors import DuplicateKeyError, PyMongoError
 from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
+from pymongo.errors import DuplicateKeyError, PyMongoError

 from app.database.connection import get_extra_args
 from app.models.document import FileDocument
@@ -40,7 +41,6 @@ class FileDocumentRepository:
         """Initialize file repository with database connection."""
         self.db = database
         self.collection: AsyncIOMotorCollection = self.db.documents
-        self._ensure_indexes()

     async def initialize(self):
         """
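A plausible reading of dropping self._ensure_indexes() from __init__ (the commit itself does not say): Motor collection calls are coroutines, so index creation cannot happen synchronously in a constructor and fits better in the async initialize(). An illustrative sketch with a hypothetical unique index, not the repository's actual index spec:

    from motor.motor_asyncio import AsyncIOMotorDatabase


    async def ensure_indexes(db: AsyncIOMotorDatabase) -> None:
        # hypothetical example: unique index on the stored file path
        await db.documents.create_index("filepath", unique=True)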
@@ -64,7 +64,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
         logger.info(f"Processing new file: {filepath}")

         try:
-            from tasks.document_processing import process_document
+            from tasks.main import process_document
             celery_result = process_document.delay(filepath)
             celery_task_id = celery_result.id
             logger.info(f"Dispatched Celery task with ID: {celery_task_id}")
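For context, a minimal sketch (not the project's full handler) of how a watchdog event handler hands a newly created file to Celery; process_document is the task imported from tasks.main as in the hunk above:

    import logging

    from watchdog.events import FileSystemEventHandler

    from tasks.main import process_document

    logger = logging.getLogger(__name__)


    class DemoFileHandler(FileSystemEventHandler):
        """Dispatch a Celery task for every file created in the watched folder."""

        def on_created(self, event):
            if event.is_directory:
                return
            # .delay() enqueues the task on the Redis broker and returns an AsyncResult
            celery_result = process_document.delay(event.src_path)
            logger.info(f"Dispatched Celery task with ID: {celery_result.id}")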
@@ -35,12 +35,12 @@ class JobService:
         await self.repository.initialize()
         return self

-    async def create_job(self, file_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
+    async def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
         """
         Create a new processing job.

         Args:
-            file_id: Reference to the file document
+            document_id: Reference to the file document
             task_id: Optional Celery task UUID

         Returns:
@@ -49,7 +49,7 @@ class JobService:
         Raises:
             JobRepositoryError: If database operation fails
         """
-        return await self.repository.create_job(file_id, task_id)
+        return await self.repository.create_job(document_id, task_id)

     async def get_job_by_id(self, job_id: PyObjectId) -> ProcessingJob:
         """
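The renamed keyword matches how the worker-side code later in this commit calls the service; usage as it appears there, inside an async task body:

    job = await job_service.create_job(task_id=task_id, document_id=document.id)
    await job_service.mark_job_as_started(job_id=job.id)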
@@ -3,6 +3,12 @@ FROM python:3.12-slim
 # Set working directory
 WORKDIR /app

+# Install libmagic
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    file \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
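The libmagic1 system package is presumably needed by python-magic (added to requirements below), which is a thin wrapper around the shared library and cannot import without it. A quick check once the image is built:

    # python-magic needs the shared library that the libmagic1 package installs
    import magic

    print(magic.from_file("/etc/hostname", mime=True))  # e.g. "text/plain"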
@@ -1,4 +1,12 @@
+bcrypt==4.3.0
 celery==5.5.3
-redis==6.4.0
+email-validator==2.3.0
+fastapi==0.116.1
+httptools==0.6.4
+motor==3.7.1
 pymongo==4.15.0
+pydantic==2.11.9
+redis==6.4.0
+uvicorn==0.35.0
+python-magic==0.4.27
+watchdog==6.0.0
@@ -8,81 +8,14 @@ and update processing job statuses throughout the task lifecycle.
 import logging
 from typing import Any, Dict

-from tasks.main import app as celery_app
+from app.config import settings
+from app.database.connection import get_database
+from app.services.document_service import DocumentService

 logger = logging.getLogger(__name__)


-# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
-# def process_document(self, document_service, job_service, filepath: str) -> Dict[str, Any]:
-#     """
-#     Process a document file and extract its content.
-#
-#     This task:
-#     1. Updates the processing job status to PROCESSING
-#     2. Performs document content extraction
-#     3. Updates job status to COMPLETED or FAILED based on result
-#
-#     Args:
-#         self : Celery task instance
-#         job_service : Instance of JobService
-#         document_service : Instance of DocumentService
-#         filepath: Full path to the document file to process
-#
-#     Returns:
-#         Dictionary containing processing results
-#
-#     Raises:
-#         Exception: Any processing error (will trigger retry)
-#     """
-#     task_id = self.request.id
-#     logger.info(f"Starting document processing task {task_id} for file: {filepath}")
-#
-#     try:
-#         # Step 1: Mark job as started
-#         await job_service.mark_job_as_started(task_id=task_id)
-#         logger.info(f"Job {task_id} marked as PROCESSING")
-#
-#         # Step 2: Process the document (extract content, OCR, etc.)
-#         document = await self.document_service.create_document(filepath)
-#         logger.info(f"Created document record with ID: {document.id}")
-#
-#         result = document_service.extract_document_content(filepath)
-#         logger.info(f"Document content extracted successfully for task {task_id}")
-#
-#         # Step 3: Mark job as completed
-#         await job_service.mark_job_as_completed(task_id=task_id)
-#         logger.info(f"Job {task_id} marked as COMPLETED")
-#
-#         return {
-#             "task_id": task_id,
-#             "filepath": filepath,
-#             "status": "completed",
-#             "content_length": len(result.get("content", "")),
-#             "extraction_method": result.get("extraction_method"),
-#             "processing_time": result.get("processing_time")
-#         }
-#
-#     except Exception as e:
-#         error_message = f"Document processing failed: {str(e)}"
-#         logger.error(f"Task {task_id} failed: {error_message}")
-#
-#         try:
-#             # Mark job as failed
-#             job_service.mark_job_as_failed(task_id=task_id, error_message=error_message)
-#             logger.info(f"Job {task_id} marked as FAILED")
-#         except Exception as job_error:
-#             logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
-#
-#         # Re-raise the exception to trigger Celery retry mechanism
-#         raise
-
-
-@celery_app.task(name="tasks.document_processing.process_document",
-                 bind=True,
-                 autoretry_for=(Exception,),
-                 retry_kwargs={'max_retries': 3, 'countdown': 60})
-def process_document(self, filepath: str) -> Dict[str, Any]:
+async def process_document_async(self, filepath: str) -> Dict[str, Any]:
     """
     Process a document file and extract its content.

@@ -93,8 +26,6 @@ def process_document(self, filepath: str) -> Dict[str, Any]:

     Args:
         self : Celery task instance
-        job_service : Instance of JobService
-        document_service : Instance of DocumentService
         filepath: Full path to the document file to process

     Returns:
@@ -106,74 +37,47 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
     task_id = self.request.id
     logger.info(f"Starting document processing task {task_id} for file: {filepath}")

-
-@celery_app.task(bind=True)
-def cleanup_old_processing_jobs(self, days_old: int = 30) -> Dict[str, Any]:
-    """
-    Clean up old processing jobs from the database.
-
-    This maintenance task removes completed and failed jobs older than
-    the specified number of days.
-
-    Args:
-        days_old: Number of days after which to clean up jobs
-
-    Returns:
-        Dictionary containing cleanup statistics
-    """
-    task_id = self.request.id
-    logger.info(f"Starting cleanup task {task_id} for jobs older than {days_old} days")
-
-    job_service = JobService()
-
-    try:
-        # Perform cleanup
-        cleanup_result = job_service.cleanup_old_jobs(days_old=days_old)
-
-        logger.info(
-            f"Cleanup task {task_id} completed: "
-            f"deleted {cleanup_result['deleted_count']} jobs"
-        )
-
-        return {
-            "task_id": task_id,
-            "status": "completed",
-            "deleted_count": cleanup_result["deleted_count"],
-            "days_old": days_old
-        }
-
-    except Exception as e:
-        error_message = f"Cleanup task failed: {str(e)}"
-        logger.error(f"Cleanup task {task_id} failed: {error_message}")
-        raise
-
-
-@celery_app.task(bind=True)
-def get_processing_statistics(self) -> Dict[str, Any]:
-    """
-    Generate processing statistics for monitoring.
-
-    Returns:
-        Dictionary containing current processing statistics
-    """
-    task_id = self.request.id
-    logger.info(f"Generating processing statistics for task {task_id}")
-
-    job_service = JobService()
-
-    try:
-        stats = job_service.get_processing_statistics()
-
-        logger.info(f"Statistics generated for task {task_id}")
-
-        return {
-            "task_id": task_id,
-            "status": "completed",
-            "statistics": stats,
-            "timestamp": stats.get("generated_at")
-        }
-
-    except Exception as e:
-        error_message = f"Statistics generation failed: {str(e)}"
-        logger.error(f"Statistics task {task_id} failed: {error_message}")
+    database = get_database()
+    document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
+    from app.services.job_service import JobService
+    job_service = JobService(database=database)
+
+    job = None
+
+    try:
+        # Step 1: Insert the document in DB
+        document = await document_service.create_document(filepath)
+        logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
+
+        # Step 2: Create a new job record for the document
+        job = await job_service.create_job(task_id=task_id, document_id=document.id)
+
+        # Step 3: Mark job as started
+        await job_service.mark_job_as_started(job_id=job.id)
+        logger.info(f"Job {task_id} marked as PROCESSING")
+
+        # Step 4: Mark job as completed
+        await job_service.mark_job_as_completed(job_id=job.id)
+        logger.info(f"Job {task_id} marked as COMPLETED")
+
+        return {
+            "task_id": task_id,
+            "filepath": filepath,
+            "status": "completed",
+        }
+
+    except Exception as e:
+        error_message = f"Document processing failed: {str(e)}"
+        logger.error(f"Task {task_id} failed: {error_message}")
+
+        try:
+            # Mark job as failed
+            if job is not None:
+                await job_service.mark_job_as_failed(job_id=job.id, error_message=error_message)
+                logger.info(f"Job {task_id} marked as FAILED")
+            else:
+                logger.error(f"Failed to process {filepath}. error = {str(e)}")
+        except Exception as job_error:
+            logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
+
+        # Re-raise the exception to trigger Celery retry mechanism
         raise
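Worth noting about the error path above: the final bare raise combines with autoretry_for=(Exception,) on the wrapping task (see tasks/main.py below), so a failed document is retried up to three times with a 60-second countdown. A self-contained sketch of that retry behaviour, with illustrative names:

    from celery import Celery

    demo_app = Celery("demo", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")


    @demo_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={"max_retries": 3, "countdown": 60})
    def flaky_task(self):
        # any exception escaping the body is caught by Celery and the task is re-queued
        raise RuntimeError("fails, then is retried automatically")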
@@ -3,25 +3,26 @@ Celery worker for MyDocManager document processing tasks.

 This module contains all Celery tasks for processing documents.
 """
+import asyncio
 import os
-import time

 from celery import Celery

+from tasks.document_processing import process_document_async
+
 # Environment variables
 REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
 MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")

 # Initialize Celery app
-app = Celery(
+celery_app = Celery(
     "mydocmanager_worker",
     broker=REDIS_URL,
-    backend=REDIS_URL
+    backend=REDIS_URL,
 )

 # Celery configuration
-app.conf.update(
+celery_app.conf.update(
     task_serializer="json",
     accept_content=["json"],
     result_serializer="json",
@@ -33,82 +34,10 @@ app.conf.update(
 )


-@app.task(bind=True)
-def test_task(self, message: str):
-    """
-    Test task for validating worker functionality.
-
-    Args:
-        message: Test message to process
-
-    Returns:
-        dict: Task result with processing information
-    """
-    try:
-        print(f"[WORKER] Starting test task with message: {message}")
-
-        # Simulate some work
-        for i in range(5):
-            print(f"[WORKER] Processing step {i + 1}/5...")
-            time.sleep(1)
-
-            # Update task progress
-            self.update_state(
-                state="PROGRESS",
-                meta={
-                    "current": i + 1,
-                    "total": 5,
-                    "message": f"Processing step {i + 1}"
-                }
-            )
-
-        result = {
-            "status": "completed",
-            "message": f"Successfully processed: {message}",
-            "processed_at": time.time(),
-            "worker_id": self.request.id
-        }
-
-        print(f"[WORKER] Test task completed successfully: {result}")
-        return result
-
-    except Exception as exc:
-        print(f"[WORKER] Test task failed: {str(exc)}")
-        raise self.retry(exc=exc, countdown=60, max_retries=3)
-
-
-@app.task(bind=True)
-def process_document_task(self, file_path: str):
-    """
-    Placeholder task for document processing.
-
-    Args:
-        file_path: Path to the document to process
-
-    Returns:
-        dict: Processing result
-    """
-    try:
-        print(f"[WORKER] Starting document processing for: {file_path}")
-
-        # Placeholder for document processing logic
-        time.sleep(2)  # Simulate processing time
-
-        result = {
-            "status": "completed",
-            "file_path": file_path,
-            "processed_at": time.time(),
-            "content": f"Placeholder content for {file_path}",
-            "worker_id": self.request.id
-        }
-
-        print(f"[WORKER] Document processing completed: {file_path}")
-        return result
-
-    except Exception as exc:
-        print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}")
-        raise self.retry(exc=exc, countdown=60, max_retries=3)
+@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
+def process_document(self, filepath: str):
+    return asyncio.run(process_document_async(self, filepath))


 if __name__ == "__main__":
-    app.start()
+    celery_app.start()
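The new process_document is a thin synchronous shim over the async implementation: Celery calls the plain function, which runs the coroutine to completion with asyncio.run(), creating and closing a fresh event loop per task invocation. A minimal generic sketch of the same bridge, with illustrative names:

    import asyncio


    async def _do_work_async(filepath: str) -> dict:
        await asyncio.sleep(0)  # stand-in for awaited DB / service calls
        return {"filepath": filepath, "status": "completed"}


    def do_work(filepath: str) -> dict:
        # asyncio.run() starts a new event loop, runs the coroutine, then closes the loop
        return asyncio.run(_do_work_async(filepath))


    if __name__ == "__main__":
        print(do_work("/watched_files/example.pdf"))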