Working on tasks
@@ -8,7 +8,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
-COPY tasks/ .
+COPY . .
 
 # Command will be overridden by docker-compose
 CMD ["celery", "-A", "main", "worker", "--loglevel=info"]
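
The CMD above is only a fallback; as the comment notes, docker-compose overrides it per service. A minimal sketch of such an override (the service name, build path, and redis dependency are assumptions for illustration, not taken from this commit):

    services:
      worker:
        build: ./src/worker
        command: celery -A main worker --loglevel=info
        depends_on:
          - redis
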
src/worker/tasks/document_processing.py  (179 lines, new file)
@@ -0,0 +1,179 @@
"""
|
||||
Celery tasks for document processing with ProcessingJob status management.
|
||||
|
||||
This module contains Celery tasks that handle document content extraction
|
||||
and update processing job statuses throughout the task lifecycle.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
|
||||
from tasks.main import app as celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
# def process_document(self, document_service, job_service, filepath: str) -> Dict[str, Any]:
#     """
#     Process a document file and extract its content.
#
#     This task:
#     1. Updates the processing job status to PROCESSING
#     2. Performs document content extraction
#     3. Updates job status to COMPLETED or FAILED based on result
#
#     Args:
#         self: Celery task instance
#         job_service: Instance of JobService
#         document_service: Instance of DocumentService
#         filepath: Full path to the document file to process
#
#     Returns:
#         Dictionary containing processing results
#
#     Raises:
#         Exception: Any processing error (will trigger retry)
#     """
#     task_id = self.request.id
#     logger.info(f"Starting document processing task {task_id} for file: {filepath}")
#
#     try:
#         # Step 1: Mark job as started
#         await job_service.mark_job_as_started(task_id=task_id)
#         logger.info(f"Job {task_id} marked as PROCESSING")
#
#         # Step 2: Process the document (extract content, OCR, etc.)
#         document = await self.document_service.create_document(filepath)
#         logger.info(f"Created document record with ID: {document.id}")
#
#         result = document_service.extract_document_content(filepath)
#         logger.info(f"Document content extracted successfully for task {task_id}")
#
#         # Step 3: Mark job as completed
#         await job_service.mark_job_as_completed(task_id=task_id)
#         logger.info(f"Job {task_id} marked as COMPLETED")
#
#         return {
#             "task_id": task_id,
#             "filepath": filepath,
#             "status": "completed",
#             "content_length": len(result.get("content", "")),
#             "extraction_method": result.get("extraction_method"),
#             "processing_time": result.get("processing_time")
#         }
#
#     except Exception as e:
#         error_message = f"Document processing failed: {str(e)}"
#         logger.error(f"Task {task_id} failed: {error_message}")
#
#         try:
#             # Mark job as failed
#             job_service.mark_job_as_failed(task_id=task_id, error_message=error_message)
#             logger.info(f"Job {task_id} marked as FAILED")
#         except Exception as job_error:
#             logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
#
#         # Re-raise the exception to trigger Celery retry mechanism
#         raise

@celery_app.task(name="tasks.document_processing.process_document",
                 bind=True,
                 autoretry_for=(Exception,),
                 retry_kwargs={'max_retries': 3, 'countdown': 60})
def process_document(self, filepath: str) -> Dict[str, Any]:
    """
    Process a document file and extract its content.

    This task:
    1. Updates the processing job status to PROCESSING
    2. Performs document content extraction
    3. Updates job status to COMPLETED or FAILED based on result

    Args:
        self: Celery task instance
        filepath: Full path to the document file to process

    Returns:
        Dictionary containing processing results

    Raises:
        Exception: Any processing error (will trigger retry)
    """
    task_id = self.request.id
    logger.info(f"Starting document processing task {task_id} for file: {filepath}")

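
# A possible synchronous completion of the process_document body above,
# sketched from the commented-out draft earlier in this file. The service
# constructors, method names, and the DocumentService import path are
# assumptions based on that draft, not part of this commit.
def _process_document_sketch(self, filepath: str) -> Dict[str, Any]:
    from services.document_service import DocumentService  # assumed path

    task_id = self.request.id
    job_service = JobService()            # assumed no-arg constructor
    document_service = DocumentService()  # assumed no-arg constructor

    try:
        # Step 1: mark the ProcessingJob as PROCESSING
        job_service.mark_job_as_started(task_id=task_id)

        # Step 2: extract the document content
        result = document_service.extract_document_content(filepath)

        # Step 3: mark the ProcessingJob as COMPLETED
        job_service.mark_job_as_completed(task_id=task_id)
        return {
            "task_id": task_id,
            "filepath": filepath,
            "status": "completed",
            "content_length": len(result.get("content", "")),
        }
    except Exception as exc:
        job_service.mark_job_as_failed(
            task_id=task_id,
            error_message=f"Document processing failed: {exc}",
        )
        raise  # re-raise so autoretry_for can reschedule the task
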
@celery_app.task(bind=True)
def cleanup_old_processing_jobs(self, days_old: int = 30) -> Dict[str, Any]:
    """
    Clean up old processing jobs from the database.

    This maintenance task removes completed and failed jobs older than
    the specified number of days.

    Args:
        days_old: Number of days after which to clean up jobs

    Returns:
        Dictionary containing cleanup statistics
    """
    task_id = self.request.id
    logger.info(f"Starting cleanup task {task_id} for jobs older than {days_old} days")

    job_service = JobService()

    try:
        # Perform cleanup
        cleanup_result = job_service.cleanup_old_jobs(days_old=days_old)

        logger.info(
            f"Cleanup task {task_id} completed: "
            f"deleted {cleanup_result['deleted_count']} jobs"
        )

        return {
            "task_id": task_id,
            "status": "completed",
            "deleted_count": cleanup_result["deleted_count"],
            "days_old": days_old
        }

    except Exception as e:
        error_message = f"Cleanup task failed: {str(e)}"
        logger.error(f"Cleanup task {task_id} failed: {error_message}")
        raise

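
# A cleanup task like this is usually triggered on a schedule via celery
# beat. A minimal beat entry, sketched for illustration; the schedule key,
# timing, and days_old value are assumptions, not part of this commit.
from celery.schedules import crontab

celery_app.conf.beat_schedule = {
    "cleanup-old-processing-jobs": {
        "task": cleanup_old_processing_jobs.name,  # resolves to the registered task name
        "schedule": crontab(hour=3, minute=0),     # run daily at 03:00
        "kwargs": {"days_old": 30},
    },
}
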
@celery_app.task(bind=True)
def get_processing_statistics(self) -> Dict[str, Any]:
    """
    Generate processing statistics for monitoring.

    Returns:
        Dictionary containing current processing statistics
    """
    task_id = self.request.id
    logger.info(f"Generating processing statistics for task {task_id}")

    job_service = JobService()

    try:
        stats = job_service.get_processing_statistics()

        logger.info(f"Statistics generated for task {task_id}")

        return {
            "task_id": task_id,
            "status": "completed",
            "statistics": stats,
            "timestamp": stats.get("generated_at")
        }

    except Exception as e:
        error_message = f"Statistics generation failed: {str(e)}"
        logger.error(f"Statistics task {task_id} failed: {error_message}")
        raise
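
Callers enqueue these tasks through the Celery app rather than invoking them directly. A minimal sketch of a producer-side call, assuming the worker container built above is running (the file path is illustrative):

    from tasks.document_processing import process_document, get_processing_statistics

    # Queue a document for processing; returns an AsyncResult immediately.
    result = process_document.delay("/data/uploads/example.pdf")
    print(result.id)  # this Celery task id doubles as the ProcessingJob task_id

    # Block briefly for a statistics snapshot.
    stats = get_processing_statistics.delay().get(timeout=10)
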
@@ -6,6 +6,7 @@ This module contains all Celery tasks for processing documents.
 
 import os
+import time
 
 from celery import Celery
 
 # Environment variables
@@ -110,4 +111,4 @@ def process_document_task(self, file_path: str):
 
 
 if __name__ == "__main__":
-    app.start()
+    app.start()