Working on tasks

2025-09-21 22:51:34 +02:00
parent 98c43feadf
commit 34f7854b3c
14 changed files with 617 additions and 169 deletions

View File

@@ -8,7 +8,7 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
-COPY tasks/ .
+COPY . .
# Command will be overridden by docker-compose
CMD ["celery", "-A", "main", "worker", "--loglevel=info"]

View File

@@ -0,0 +1,179 @@
"""
Celery tasks for document processing with ProcessingJob status management.
This module contains Celery tasks that handle document content extraction
and update processing job statuses throughout the task lifecycle.
"""
import logging
from typing import Any, Dict

from tasks.main import app as celery_app
# JobService is instantiated by the maintenance tasks below; its import is not
# visible in this diff, so the path used here is an assumption.
from tasks.job_service import JobService

logger = logging.getLogger(__name__)
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
# def process_document(self, document_service, job_service, filepath: str) -> Dict[str, Any]:
# """
# Process a document file and extract its content.
#
# This task:
# 1. Updates the processing job status to PROCESSING
# 2. Performs document content extraction
# 3. Updates job status to COMPLETED or FAILED based on result
#
# Args:
# self : Celery task instance
# job_service : Instance of JobService
# document_service : Instance of DocumentService
# filepath: Full path to the document file to process
#
# Returns:
# Dictionary containing processing results
#
# Raises:
# Exception: Any processing error (will trigger retry)
# """
# task_id = self.request.id
# logger.info(f"Starting document processing task {task_id} for file: {filepath}")
#
# try:
# # Step 1: Mark job as started
# await job_service.mark_job_as_started(task_id=task_id)
# logger.info(f"Job {task_id} marked as PROCESSING")
#
# # Step 2: Process the document (extract content, OCR, etc.)
# document = await self.document_service.create_document(filepath)
# logger.info(f"Created document record with ID: {document.id}")
#
# result = document_service.extract_document_content(filepath)
# logger.info(f"Document content extracted successfully for task {task_id}")
#
# # Step 3: Mark job as completed
# await job_service.mark_job_as_completed(task_id=task_id)
# logger.info(f"Job {task_id} marked as COMPLETED")
#
# return {
# "task_id": task_id,
# "filepath": filepath,
# "status": "completed",
# "content_length": len(result.get("content", "")),
# "extraction_method": result.get("extraction_method"),
# "processing_time": result.get("processing_time")
# }
#
# except Exception as e:
# error_message = f"Document processing failed: {str(e)}"
# logger.error(f"Task {task_id} failed: {error_message}")
#
# try:
# # Mark job as failed
# job_service.mark_job_as_failed(task_id=task_id, error_message=error_message)
# logger.info(f"Job {task_id} marked as FAILED")
# except Exception as job_error:
# logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
#
# # Re-raise the exception to trigger Celery retry mechanism
# raise
@celery_app.task(name="tasks.document_processing.process_document",
                 bind=True,
                 autoretry_for=(Exception,),
                 retry_kwargs={'max_retries': 3, 'countdown': 60})
def process_document(self, filepath: str) -> Dict[str, Any]:
    """
    Process a document file and extract its content.

    This task:
    1. Updates the processing job status to PROCESSING
    2. Performs document content extraction
    3. Updates job status to COMPLETED or FAILED based on result

    Args:
        filepath: Full path to the document file to process

    Returns:
        Dictionary containing processing results

    Raises:
        Exception: Any processing error (will trigger retry)
    """
    task_id = self.request.id
    logger.info(f"Starting document processing task {task_id} for file: {filepath}")


@celery_app.task(bind=True)
def cleanup_old_processing_jobs(self, days_old: int = 30) -> Dict[str, Any]:
    """
    Clean up old processing jobs from the database.

    This maintenance task removes completed and failed jobs older than
    the specified number of days.

    Args:
        days_old: Number of days after which to clean up jobs

    Returns:
        Dictionary containing cleanup statistics
    """
    task_id = self.request.id
    logger.info(f"Starting cleanup task {task_id} for jobs older than {days_old} days")

    job_service = JobService()

    try:
        # Perform cleanup
        cleanup_result = job_service.cleanup_old_jobs(days_old=days_old)

        logger.info(
            f"Cleanup task {task_id} completed: "
            f"deleted {cleanup_result['deleted_count']} jobs"
        )

        return {
            "task_id": task_id,
            "status": "completed",
            "deleted_count": cleanup_result["deleted_count"],
            "days_old": days_old
        }

    except Exception as e:
        error_message = f"Cleanup task failed: {str(e)}"
        logger.error(f"Cleanup task {task_id} failed: {error_message}")
        raise


@celery_app.task(bind=True)
def get_processing_statistics(self) -> Dict[str, Any]:
    """
    Generate processing statistics for monitoring.

    Returns:
        Dictionary containing current processing statistics
    """
    task_id = self.request.id
    logger.info(f"Generating processing statistics for task {task_id}")

    job_service = JobService()

    try:
        stats = job_service.get_processing_statistics()
        logger.info(f"Statistics generated for task {task_id}")

        return {
            "task_id": task_id,
            "status": "completed",
            "statistics": stats,
            "timestamp": stats.get("generated_at")
        }

    except Exception as e:
        error_message = f"Statistics generation failed: {str(e)}"
        logger.error(f"Statistics task {task_id} failed: {error_message}")
        raise
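
The new task is registered under an explicit name ("tasks.document_processing.process_document"), so a producer can enqueue work without importing the task function. A minimal sketch of such a call, assuming the shared Celery app from tasks.main is importable on the producer side; the file path is hypothetical:

from tasks.main import app as celery_app

# Enqueue by the registered task name so the producer does not need to import
# the task module itself.
result = celery_app.send_task(
    "tasks.document_processing.process_document",
    args=["/data/incoming/report.pdf"],  # hypothetical path
)
print(f"Queued document processing task {result.id}")

# Alternative, if tasks.document_processing is importable on this side:
# from tasks.document_processing import process_document
# result = process_document.delay("/data/incoming/report.pdf")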

View File

@@ -6,6 +6,7 @@ This module contains all Celery tasks for processing documents.
import os
import time
from celery import Celery
# Environment variables
@@ -110,4 +111,4 @@ def process_document_task(self, file_path: str):
if __name__ == "__main__":
-    app.start()
+    app.start()
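
The worker container starts with celery -A main worker, so the main module has to expose a configured Celery app that pulls its settings from environment variables and registers the new task module. A minimal sketch of what that wiring could look like, assuming a Redis broker and these variable names; the nightly beat entry for cleanup_old_processing_jobs is likewise an assumption, not part of this commit:

import os

from celery import Celery
from celery.schedules import crontab

# Broker/backend URLs from the environment (variable names and defaults assumed)
BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://redis:6379/0")
RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://redis:6379/1")

app = Celery(
    "main",
    broker=BROKER_URL,
    backend=RESULT_BACKEND,
    include=["tasks.document_processing"],  # make the new tasks discoverable
)

# Hypothetical beat entry: run the maintenance task every night at 03:00,
# addressed by the default generated name of cleanup_old_processing_jobs.
app.conf.beat_schedule = {
    "cleanup-old-processing-jobs": {
        "task": "tasks.document_processing.cleanup_old_processing_jobs",
        "schedule": crontab(hour=3, minute=0),
        "kwargs": {"days_old": 30},
    },
}

if __name__ == "__main__":
    app.start()

Running that schedule would also require a beat process (for example celery -A main beat) alongside the worker defined in the Dockerfile.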