Working on tasks
@@ -8,7 +8,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
-COPY tasks/ .
+COPY . .
 
 # Command will be overridden by docker-compose
 CMD ["celery", "-A", "main", "worker", "--loglevel=info"]
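
The CMD above is only a fallback; as the comment notes, docker-compose overrides it per service. A minimal sketch of such an override (the service name, build path, and redis dependency are assumptions for illustration, not taken from this commit):

    services:
      worker:
        build: ./src/worker
        command: celery -A main worker --loglevel=info
        depends_on:
          - redis
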
src/worker/tasks/document_processing.py  (179 lines, new file)
@@ -0,0 +1,179 @@
"""
|
||||
Celery tasks for document processing with ProcessingJob status management.
|
||||
|
||||
This module contains Celery tasks that handle document content extraction
|
||||
and update processing job statuses throughout the task lifecycle.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
|
||||
from tasks.main import app as celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
# def process_document(self, document_service, job_service, filepath: str) -> Dict[str, Any]:
#     """
#     Process a document file and extract its content.
#
#     This task:
#     1. Updates the processing job status to PROCESSING
#     2. Performs document content extraction
#     3. Updates job status to COMPLETED or FAILED based on result
#
#     Args:
#         self: Celery task instance
#         job_service: Instance of JobService
#         document_service: Instance of DocumentService
#         filepath: Full path to the document file to process
#
#     Returns:
#         Dictionary containing processing results
#
#     Raises:
#         Exception: Any processing error (will trigger retry)
#     """
#     task_id = self.request.id
#     logger.info(f"Starting document processing task {task_id} for file: {filepath}")
#
#     try:
#         # Step 1: Mark job as started
#         await job_service.mark_job_as_started(task_id=task_id)
#         logger.info(f"Job {task_id} marked as PROCESSING")
#
#         # Step 2: Process the document (extract content, OCR, etc.)
#         document = await self.document_service.create_document(filepath)
#         logger.info(f"Created document record with ID: {document.id}")
#
#         result = document_service.extract_document_content(filepath)
#         logger.info(f"Document content extracted successfully for task {task_id}")
#
#         # Step 3: Mark job as completed
#         await job_service.mark_job_as_completed(task_id=task_id)
#         logger.info(f"Job {task_id} marked as COMPLETED")
#
#         return {
#             "task_id": task_id,
#             "filepath": filepath,
#             "status": "completed",
#             "content_length": len(result.get("content", "")),
#             "extraction_method": result.get("extraction_method"),
#             "processing_time": result.get("processing_time")
#         }
#
#     except Exception as e:
#         error_message = f"Document processing failed: {str(e)}"
#         logger.error(f"Task {task_id} failed: {error_message}")
#
#         try:
#             # Mark job as failed
#             job_service.mark_job_as_failed(task_id=task_id, error_message=error_message)
#             logger.info(f"Job {task_id} marked as FAILED")
#         except Exception as job_error:
#             logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
#
#         # Re-raise the exception to trigger Celery retry mechanism
#         raise

@celery_app.task(name="tasks.document_processing.process_document",
                 bind=True,
                 autoretry_for=(Exception,),
                 retry_kwargs={'max_retries': 3, 'countdown': 60})
def process_document(self, filepath: str) -> Dict[str, Any]:
    """
    Process a document file and extract its content.

    This task:
    1. Updates the processing job status to PROCESSING
    2. Performs document content extraction
    3. Updates job status to COMPLETED or FAILED based on result

    Args:
        self: Celery task instance
        filepath: Full path to the document file to process

    Returns:
        Dictionary containing processing results

    Raises:
        Exception: Any processing error (will trigger retry)
    """
    task_id = self.request.id
    logger.info(f"Starting document processing task {task_id} for file: {filepath}")

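
# A possible synchronous completion of the process_document body above,
# sketched from the commented-out draft earlier in this file. The service
# constructors, method names, and the DocumentService import path are
# assumptions based on that draft, not part of this commit.
def _process_document_sketch(self, filepath: str) -> Dict[str, Any]:
    from services.document_service import DocumentService  # assumed path

    task_id = self.request.id
    job_service = JobService()            # assumed no-arg constructor
    document_service = DocumentService()  # assumed no-arg constructor

    try:
        # Step 1: mark the ProcessingJob as PROCESSING
        job_service.mark_job_as_started(task_id=task_id)

        # Step 2: extract the document content
        result = document_service.extract_document_content(filepath)

        # Step 3: mark the ProcessingJob as COMPLETED
        job_service.mark_job_as_completed(task_id=task_id)
        return {
            "task_id": task_id,
            "filepath": filepath,
            "status": "completed",
            "content_length": len(result.get("content", "")),
        }
    except Exception as exc:
        job_service.mark_job_as_failed(
            task_id=task_id,
            error_message=f"Document processing failed: {exc}",
        )
        raise  # re-raise so autoretry_for can reschedule the task
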
@celery_app.task(bind=True)
def cleanup_old_processing_jobs(self, days_old: int = 30) -> Dict[str, Any]:
    """
    Clean up old processing jobs from the database.

    This maintenance task removes completed and failed jobs older than
    the specified number of days.

    Args:
        days_old: Number of days after which to clean up jobs

    Returns:
        Dictionary containing cleanup statistics
    """
    task_id = self.request.id
    logger.info(f"Starting cleanup task {task_id} for jobs older than {days_old} days")

    job_service = JobService()

    try:
        # Perform cleanup
        cleanup_result = job_service.cleanup_old_jobs(days_old=days_old)

        logger.info(
            f"Cleanup task {task_id} completed: "
            f"deleted {cleanup_result['deleted_count']} jobs"
        )

        return {
            "task_id": task_id,
            "status": "completed",
            "deleted_count": cleanup_result["deleted_count"],
            "days_old": days_old
        }

    except Exception as e:
        error_message = f"Cleanup task failed: {str(e)}"
        logger.error(f"Cleanup task {task_id} failed: {error_message}")
        raise

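
# A cleanup task like this is usually triggered on a schedule via celery
# beat. A minimal beat entry, sketched for illustration; the schedule key,
# timing, and days_old value are assumptions, not part of this commit.
from celery.schedules import crontab

celery_app.conf.beat_schedule = {
    "cleanup-old-processing-jobs": {
        "task": cleanup_old_processing_jobs.name,  # resolves to the registered task name
        "schedule": crontab(hour=3, minute=0),     # run daily at 03:00
        "kwargs": {"days_old": 30},
    },
}
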
@celery_app.task(bind=True)
def get_processing_statistics(self) -> Dict[str, Any]:
    """
    Generate processing statistics for monitoring.

    Returns:
        Dictionary containing current processing statistics
    """
    task_id = self.request.id
    logger.info(f"Generating processing statistics for task {task_id}")

    job_service = JobService()

    try:
        stats = job_service.get_processing_statistics()

        logger.info(f"Statistics generated for task {task_id}")

        return {
            "task_id": task_id,
            "status": "completed",
            "statistics": stats,
            "timestamp": stats.get("generated_at")
        }

    except Exception as e:
        error_message = f"Statistics generation failed: {str(e)}"
        logger.error(f"Statistics task {task_id} failed: {error_message}")
        raise
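
Callers enqueue these tasks through the Celery app rather than invoking them directly. A minimal sketch of a producer-side call, assuming the worker container built above is running (the file path is illustrative):

    from tasks.document_processing import process_document, get_processing_statistics

    # Queue a document for processing; returns an AsyncResult immediately.
    result = process_document.delay("/data/uploads/example.pdf")
    print(result.id)  # this Celery task id doubles as the ProcessingJob task_id

    # Block briefly for a statistics snapshot.
    stats = get_processing_statistics.delay().get(timeout=10)
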
@@ -6,6 +6,7 @@ This module contains all Celery tasks for processing documents.
 
 import os
+import time
 
 from celery import Celery
 
 # Environment variables
@@ -110,4 +111,4 @@ def process_document_task(self, file_path: str):
 
 
 if __name__ == "__main__":
-    app.start()
+    app.start()