Thumbnails generated and displayed in the front end

2025-10-07 00:16:49 +02:00
parent 79bfae4ba8
commit 477d6bf538
19 changed files with 860 additions and 54 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,7 @@ click-didyoumean==0.3.1
 click-plugins==1.1.1.2
 click-repl==0.3.0
 cryptography==46.0.1
+Deprecated==1.2.18
 dnspython==2.8.0
 ecdsa==0.19.1
 email-validator==2.3.0
@@ -32,6 +33,7 @@ mongomock==4.3.0
 mongomock-motor==0.0.36
 motor==3.7.1
 packaging==25.0
+pikepdf==9.11.0
 pillow==11.3.0
 pipdeptree==2.28.0
 pluggy==1.6.0
@@ -44,6 +46,7 @@ pydantic_core==2.33.2
 Pygments==2.19.2
 PyJWT==2.10.1
 pymongo==4.15.1
+PyMuPDF==1.26.4
 pypandoc==1.15
 pytest==8.4.2
 pytest-asyncio==1.2.0
@@ -72,4 +75,5 @@ watchdog==6.0.0
 watchfiles==1.1.0
 wcwidth==0.2.13
 websockets==15.0.1
+wrapt==1.17.3
 zipp==3.23.0
--- a/src/file-processor/app/api/dependencies.py
+++ b/src/file-processor/app/api/dependencies.py
@@ -9,6 +9,7 @@ from app.database.connection import get_database
 from app.models.auth import UserRole
 from app.models.user import UserInDB
 from app.services.auth_service import AuthService
+from app.services.document_service import DocumentService
 from app.services.user_service import UserService

 security = HTTPBearer()
@@ -25,6 +26,12 @@ def get_user_service() -> UserService:
  return UserService(database)


+def get_document_service() -> DocumentService:
+  """
+  Dependency that provides a DocumentService instance.
+
+  Returns:
+      A DocumentService constructed with the handle returned by get_database()
+  """
+  return DocumentService(get_database())
+
+
 def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    user_service: UserService = Depends(get_user_service)
@@ -79,7 +86,7 @@ def get_current_user(
  return user


-def   get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
+def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
  """
  Dependency to ensure current user has admin role.

--- a/src/file-processor/app/api/routes/document.py
+++ b/src/file-processor/app/api/routes/document.py
@@ -0,0 +1,241 @@
+"""
+Document API routes.
+
+This module provides REST endpoints for document management operations.
+"""
+
+import logging
+import os
+from typing import List, Optional
+
+import fitz  # PyMuPDF
+from fastapi import APIRouter, Depends, HTTPException, Query, status, Path
+from starlette.responses import Response
+
+from app.api.dependencies import get_document_service, get_current_user
+from app.models.document import DocumentResponse, FileDocument
+from app.services.document_service import DocumentService
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["Documents"])
+
+
+def _count_pdf_pages(pdf_file_path: str) -> int:
+  """
+  Return how many pages PyMuPDF reports for a PDF file.
+
+  Any failure while opening or reading the file is logged as a warning and
+  reported as a page count of 0 instead of being raised.
+
+  Args:
+      pdf_file_path: Location of the PDF file on disk
+
+  Returns:
+      The document's page count, or 0 when the file could not be read
+  """
+  try:
+    with fitz.open(pdf_file_path) as pdf:
+      page_total = pdf.page_count
+  except Exception as e:
+    logger.warning(f"Could not count pages for PDF {pdf_file_path}: {e}")
+    page_total = 0
+  return page_total
+
+
+def _build_object_url(file_hash: Optional[str]) -> Optional[str]:
+  """
+  Turn a stored file hash into the relative URL that serves the object.
+
+  Args:
+      file_hash: Hash identifying the stored object; may be None or empty
+
+  Returns:
+      "/api/objects/<file_hash>", or None when no hash was given
+  """
+  return f"/api/objects/{file_hash}" if file_hash else None
+
+
+def _extract_metadata_field(metadata: dict, field_name: str) -> List[str]:
+  """
+  Read a list-valued entry from a metadata dictionary as strings.
+
+  Args:
+      metadata: Document metadata dictionary
+      field_name: Key to look up in the metadata
+
+  Returns:
+      The entry's items converted with str(); an empty list when the key is
+      missing or its value is not a list
+  """
+  raw_value = metadata.get(field_name, [])
+  if not isinstance(raw_value, list):
+    return []
+  return list(map(str, raw_value))
+
+
+def _map_file_document_to_response(
+    document: FileDocument,
+    document_service: DocumentService
+) -> DocumentResponse:
+  """
+  Convert a stored FileDocument into the DocumentResponse sent to clients.
+
+  Args:
+      document: FileDocument instance to convert
+      document_service: Service used to locate the document's PDF on disk
+
+  Returns:
+      DocumentResponse built from the document's fields
+  """
+  pdf_hash = document.pdf_file_hash
+
+  # The page count is read from the stored PDF; it stays 0 when there is none
+  if pdf_hash and document_service.exists(pdf_hash):
+    page_count = _count_pdf_pages(document_service.get_document_path(pdf_hash))
+  else:
+    page_count = 0
+
+  # detected_at is optional: an empty string stands in for a missing timestamp
+  if document.detected_at:
+    created_at = document.detected_at.isoformat()
+  else:
+    created_at = ""
+
+  as_dict = {
+      "id": str(document.id),
+      "name": document.filename,
+      "original_file_type": document.file_type.value.upper(),
+      "created_at": created_at,
+      "file_size": document.file_size,
+      "page_count": page_count,
+      "thumbnail_url": _build_object_url(document.thumbnail_file_hash),
+      "pdf_url": _build_object_url(pdf_hash),
+      "tags": _extract_metadata_field(document.metadata, "tags"),
+      "categories": _extract_metadata_field(document.metadata, "categories")
+  }
+  logger.info(f"Document: {as_dict}")
+
+  return DocumentResponse(**as_dict)
+
+
+@router.get("/documents", response_model=List[DocumentResponse])
+def list_documents(
+    skip: int = Query(0, ge=0, description="Number of documents to skip"),
+    limit: int = Query(100, ge=1, le=1000, description="Maximum number of documents to return"),
+    UserInDB=Depends(get_current_user),
+    document_service: DocumentService = Depends(get_document_service)
+) -> List[DocumentResponse]:
+  """
+  Retrieve a paginated list of documents.
+
+  Args:
+      skip: Number of documents to skip for pagination
+      limit: Maximum number of documents to return
+      UserInDB: Value resolved by the get_current_user dependency. It is not
+          used in the body; declaring it makes the dependency run for this
+          route.
+      document_service: Document service instance
+
+  Returns:
+      List of documents in API response format
+
+  Raises:
+      HTTPException: 500 if listing or mapping the documents fails
+  """
+  # NOTE(review): the `UserInDB` parameter is named like the user model class
+  # and has no annotation; consider renaming it (e.g. `current_user`) once
+  # all callers are checked.
+  try:
+    documents = document_service.list_documents(skip=skip, limit=limit)
+
+    # Map to response format
+    return [
+        _map_file_document_to_response(doc, document_service)
+        for doc in documents
+    ]
+
+  except Exception as e:
+    # logger.exception keeps the traceback in the log; `from e` keeps the
+    # original error attached to the HTTP error as its cause.
+    logger.exception(f"Failed to list documents: {e}")
+    raise HTTPException(
+      status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+      detail="Failed to retrieve documents"
+    ) from e
+
+
+@router.get("/objects/{file_hash}")
+async def get_object_by_hash(
+    file_hash: str = Path(..., description="SHA256 hash of the object to retrieve"),
+    document_service: DocumentService = Depends(get_document_service)
+):
+  """
+  Serve object content by its hash.
+
+  This endpoint serves files (original documents, PDFs, thumbnails) by their
+  SHA256 hash. It supports all file types stored in the objects folder.
+
+  Args:
+      file_hash: SHA256 hash of the object
+      document_service: Document service dependency
+
+  Returns:
+      Response carrying the object bytes, the MIME type detected from the
+      content, and a one-hour public Cache-Control header
+
+  Raises:
+      HTTPException: 404 if the object is unknown, missing on disk or has no
+          content; 500 if reading or serving it fails
+  """
+  # NOTE(review): unlike list_documents, this route declares no
+  # get_current_user dependency - confirm that serving objects without
+  # authentication is intended.
+  try:
+    # Check if object exists
+    if not document_service.exists(file_hash):
+      raise HTTPException(
+        status_code=status.HTTP_404_NOT_FOUND,
+        detail="Object not found"
+      )
+
+    # Get file path
+    file_path = document_service.get_document_path(file_hash)
+
+    # Verify file exists on disk
+    if not os.path.exists(file_path):
+      logger.error(f"Object {file_hash} registered but file not found at {file_path}")
+      raise HTTPException(
+        status_code=status.HTTP_404_NOT_FOUND,
+        detail="Object file not found on disk"
+      )
+
+    # Determine media type based on file content
+    try:
+      file_content = document_service.get_document_content_by_hash(file_hash)
+      if not file_content:
+        raise HTTPException(
+          status_code=status.HTTP_404_NOT_FOUND,
+          detail="Object content not available"
+        )
+
+      # Detect MIME type
+      import magic
+      mime_type = magic.from_buffer(file_content, mime=True)
+
+      # Return file content with appropriate headers
+      return Response(
+        content=file_content,
+        media_type=mime_type,
+        headers={
+            "Content-Length": str(len(file_content)),
+            "Cache-Control": "public, max-age=3600"  # Cache for 1 hour
+        }
+      )
+
+    except HTTPException:
+      # HTTPException is an Exception subclass: without this clause the 404
+      # raised above would be caught below and reported as a 500.
+      raise
+    except Exception as e:
+      logger.error(f"Error reading object content for hash {file_hash}: {str(e)}")
+      raise HTTPException(
+        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        detail="Failed to read object content"
+      ) from e
+
+  except HTTPException:
+    # Re-raise HTTP exceptions as-is
+    raise
+  except Exception as e:
+    logger.error(f"Unexpected error serving object {file_hash}: {str(e)}")
+    raise HTTPException(
+      status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+      detail="Internal server error while serving object"
+    ) from e
--- a/src/file-processor/app/database/connection.py
+++ b/src/file-processor/app/database/connection.py
@@ -4,7 +4,7 @@ MongoDB database connection management.
 This module handles MongoDB connection with fail-fast approach.
 The application will terminate if MongoDB is not accessible at startup.
 """
-
+import logging
 import sys
 from typing import Optional

@@ -13,11 +13,14 @@ from pymongo.database import Database
 from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError

 from app.config.settings import get_mongodb_url, get_mongodb_database_name
+from app.utils.security import safe_connection_string

 # Global variables for singleton pattern
 _client: Optional[MongoClient] = None
 _database: Optional[Database] = None

+logger = logging.getLogger(__name__)
+

 def create_mongodb_client() -> MongoClient:
  """
@@ -43,16 +46,16 @@ def create_mongodb_client() -> MongoClient:
    # Test connection by running admin command
    client.admin.command('ping')
    
-    print(f"Successfully connected to MongoDB at {mongodb_url}")
+    logger.info(f"Successfully connected to MongoDB at {safe_connection_string(mongodb_url)}")
    return client
  
  except (ConnectionFailure, ServerSelectionTimeoutError) as e:
-    print(f"ERROR: Failed to connect to MongoDB at {mongodb_url}")
-    print(f"Connection error: {str(e)}")
-    print("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
+    logger.error(f"ERROR: Failed to connect to MongoDB at {safe_connection_string(mongodb_url)}")
+    logger.error(f"Connection error: {str(e)}")
+    logger.error("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
    sys.exit(1)
  except Exception as e:
-    print(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
+    logger.error(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
    sys.exit(1)


@@ -74,7 +77,7 @@ def get_database() -> Database:
    
    database_name = get_mongodb_database_name()
    _database = _client[database_name]
-    print(f"Connected to database: {database_name}")
+    logger.info(f"Connected to database: {database_name}")
  
  return _database

@@ -92,7 +95,7 @@ def close_database_connection():
    _client.close()
    _client = None
    _database = None
-    print("MongoDB connection closed")
+    logger.info("MongoDB connection closed")


 def get_mongodb_client() -> Optional[MongoClient]:
--- a/src/file-processor/app/main.py
+++ b/src/file-processor/app/main.py
@@ -17,6 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware

 from app.api.routes.auth import router as auth_router
 from app.api.routes.users import router as users_router
+from app.api.routes.document import router as documents_router
 from app.config import settings
 from app.database.connection import get_database
 from app.file_watcher import create_file_watcher, FileWatcher
@@ -111,7 +112,7 @@ app.add_middleware(
 # Include routers
 app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
 app.include_router(users_router, prefix="/users", tags=["User Management"])
-# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
+app.include_router(documents_router, prefix="/api", tags=["Documents"])
 # app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])


--- a/src/file-processor/app/models/document.py
+++ b/src/file-processor/app/models/document.py
@@ -7,10 +7,9 @@ stored in MongoDB collections.

 from datetime import datetime
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional

-from bson import ObjectId
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field, field_validator, ConfigDict

 from app.models.types import PyObjectId

@@ -50,6 +49,7 @@ class FileDocument(BaseModel):
  detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
  file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
  pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
+  thumbnail_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the thumbnail")
  encoding: str = Field(default="utf-8", description="Character encoding for text files")
  file_size: int = Field(..., ge=0, description="File size in bytes")
  mime_type: str = Field(..., description="MIME type detected")
@@ -69,3 +69,28 @@ class FileDocument(BaseModel):
    if not v.strip():
      raise ValueError("Filename cannot be empty")
    return v.strip()
+
+
+def _snake_to_camel(field_name: str) -> str:
+  """Turn a snake_case name into camelCase, leaving the first word untouched."""
+  first_word, *other_words = field_name.split('_')
+  return first_word + ''.join(word.capitalize() for word in other_words)
+
+
+class DocumentResponse(BaseModel):
+  """
+  Response model for document API endpoints.
+
+  Describes a document in the shape the frontend application consumes.
+  Every field gets a camelCase alias generated from its snake_case name, and
+  populate_by_name lets the model also be built from the snake_case names.
+  """
+
+  model_config = ConfigDict(alias_generator=_snake_to_camel, populate_by_name=True)
+
+  id: str = Field(..., description="Document unique identifier")
+  name: str = Field(..., description="Document filename")
+  original_file_type: str = Field(..., description="Original file type before conversion")
+  created_at: str = Field(..., description="ISO timestamp when document was created")
+  file_size: int = Field(..., description="File size in bytes")
+  page_count: int = Field(..., description="Number of pages in the document")
+  thumbnail_url: Optional[str] = Field(default=None, description="URL to document thumbnail")
+  pdf_url: Optional[str] = Field(default=None, description="URL to PDF version of document")
+  tags: List[str] = Field(default_factory=list, description="Document tags")
+  categories: List[str] = Field(default_factory=list, description="Document categories")
--- a/src/file-processor/app/models/job.py
+++ b/src/file-processor/app/models/job.py
@@ -16,6 +16,7 @@ class ProcessingStatus(str, Enum):
  COMPLETED = "completed"
  SAVING_OBJECT = "saving_object"
  SAVING_PDF = "saving_pdf"
+  CREATING_THUMBNAIL = "creating_thumbnail"
  FAILED = "failed"


--- a/src/file-processor/app/services/document_service.py
+++ b/src/file-processor/app/services/document_service.py
@@ -24,10 +24,22 @@ from app.models.document import (
 )
 from app.models.types import PyObjectId
 from app.utils.pdf_converter import convert_to_pdf
+from app.utils.pdf_thumbmail import PDFThumbnailGenerator
+from app.utils.security import generate_uuid_filename

 logger = logging.getLogger(__name__)


+class DocumentAlreadyExists(Exception):
+  """Raised when a document with the same name and hash is already stored."""
+
+  def __init__(self, message):
+    # Hand the message to Exception so str(exc) and exc.args carry it however
+    # the exception was constructed (positionally or by keyword).
+    super().__init__(message)
+    self.message = message
+
+
+class DocumentProcessingError(Exception):
+  """Raised when a processing step (pdf or thumbnail creation) cannot complete."""
+
+  def __init__(self, message):
+    # Hand the message to Exception so str(exc) and exc.args carry it however
+    # the exception was constructed (positionally or by keyword).
+    super().__init__(message)
+    self.message = message
+
+
 class DocumentService:
  """
  Service for orchestrated document and content management.
@@ -162,7 +174,7 @@ class DocumentService:
      # Increment counter for next attempt
      counter += 1
  
-  def _get_document_path(self, file_hash):
+  def get_document_path(self, file_hash):
    """

    :param file_hash:
@@ -171,10 +183,12 @@ class DocumentService:
    return os.path.join(self.objects_folder, file_hash[:24], file_hash)
  
  def exists(self, file_hash):
-    return os.path.exists(self._get_document_path(file_hash))
+    if file_hash is None:
+      return False
+    return os.path.exists(self.get_document_path(file_hash))
  
  def save_content_if_needed(self, file_hash, content: bytes):
-    target_path = self._get_document_path(file_hash)
+    target_path = self.get_document_path(file_hash)
    if os.path.exists(target_path):
      return
    
@@ -192,7 +206,8 @@ class DocumentService:
  
  def move_to_ignored(self, file_path, reason="Unknown"):
    logger.info(f"Moving file {file_path} to ignored folder")
-    ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path)
+    ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(
+      file_path)
    ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
    shutil.move(file_path, ignored_file_path)
  
@@ -231,15 +246,16 @@ class DocumentService:
    detected_at = datetime.now()
    
    try:
-      logger.info(f"Creating Document for {file_path}")
+      logger.info(f'Creating Document for "{file_path}"')
      # Skip the document if it already exists
      same_document = self.document_repository.find_same_document(filename, file_hash)
      if same_document is not None:
        logger.info(f"  Document with same hash already exists. Skipping...")
        self.move_to_ignored(file_path, f"already exists ({same_document.id})")
+        raise DocumentAlreadyExists(f"Document with same hash already exists ({same_document.id})")
      
      self.save_content_if_needed(file_hash, file_bytes)
-      logger.info(f"  Saved content to {self._get_document_path(file_hash)}")
+      logger.info(f"  Saved content to {self.get_document_path(file_hash)}")
      
      # Create FileDocument
      file_data = FileDocument(
@@ -255,11 +271,13 @@ class DocumentService:
        mime_type=mime_type
      )
      
-      created_file = self.document_repository.create_document(file_data)
-      logger.info(f"  Created document with id '{created_file.id}'")
+      created_document = self.document_repository.create_document(file_data)
+      logger.info(f"  Created document with id '{created_document.id}'")
      
-      return created_file
+      return created_document
    
+    except DocumentAlreadyExists as e:
+      raise e
    except Exception as e:
      # Transaction will automatically rollback if supported
      raise PyMongoError(f"Failed to create document: {str(e)}")
@@ -273,40 +291,69 @@ class DocumentService:
    document = self.get_document_by_id(document_id)
    if document is None:
      logger.error(f"  Document not found")
-      raise ValueError(f"Document {document_id} not found")
+      raise DocumentProcessingError(f"Document {document_id} not found.")
    
    # try to find another document that has the same hash
    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
    
    # the pdf will be created only if it does not exist yet
-    if (document_with_same_hash is not None and
-        document_with_same_hash.pdf_file_hash and
-        self.exists(document_with_same_hash.pdf_file_hash)):
-      logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
+    if document_with_same_hash and self.exists(document_with_same_hash.pdf_file_hash):
+      logger.info(f'Found document with same hash. Will use pdf "{document_with_same_hash.pdf_file_hash}".')
      self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
-      return True
+      return
    
    # get the content of the file
-    logger.info(f"  No document with same hash found and valid pdf found. Will create new pdf")
+    logger.info(f"  No document with same hash and valid pdf found. Will create new pdf content.")
    file_bytes = self.get_document_content_by_hash(document.file_hash)
    if file_bytes is None:
-      logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
-      return False
+      logger.error(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
+      raise DocumentProcessingError(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
    
    # create the pdf file
-    temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
+    temp_pdf_file = convert_to_pdf(self.get_document_path(document.file_hash), self.temp_folder)
    pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file))
    self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file))
-    logger.info(f"  Created new pdf file with hash {pdf_file_hash}")
-    
-    # remove the temporary file
-    os.remove(temp_pdf_file)
-    logger.info(f"  Removed temporary pdf file {temp_pdf_file}")
+    os.remove(temp_pdf_file)  # remove the temporary file
+    logger.info(f'  Created new pdf file with hash "{pdf_file_hash}"')
    
    # update the document
    self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
+  
+  def create_thumbnail(self, document_id: PyObjectId):
+    """
+    Create (or reuse) the thumbnail of a document and record its hash.
+
+    If the document returned by get_document_with_pdf_hash() already has a
+    thumbnail stored in the objects folder, that thumbnail is reused.
+    Otherwise a 200px wide PNG of the first PDF page is rendered into the
+    temp folder, stored under its content hash, and the temporary file is
+    removed.
+
+    Args:
+        document_id: Identifier of the document to create the thumbnail for
+
+    Raises:
+        DocumentProcessingError: If the document or its PDF file is not found
+    """
+    logger.info(f'Creating thumbnail document for "{document_id}"')
+    document = self.get_document_by_id(document_id)
+    if document is None:
+      logger.error(f"  Document not found !")
+      raise DocumentProcessingError(f"Document {document_id} not found.")
     
-    return True
+    # try to find another document that has the same hash
+    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
+
+    # We will use the thumbnail of the pdf if it exists
+    if document_with_same_hash and self.exists(document_with_same_hash.thumbnail_file_hash):
+      logger.info(f"  Found document with same hash. Will use thumbnail {document_with_same_hash.thumbnail_file_hash}")
+      self.update_document(document_id, {"thumbnail_file_hash": document_with_same_hash.thumbnail_file_hash})
+      return
+
+    logger.info(f"  No document with same hash and valid thumbnail found. Will create new thumbnail")
+
+    if not self.exists(document.pdf_file_hash):
+      logger.error(f"  PDF file not found.")
+      raise DocumentProcessingError(f"PDF file for document {document_id} not found")
+
+    tmp_thumbnail_path = os.path.join(self.temp_folder, f"{generate_uuid_filename()}.png")
+    try:
+      # create the thumbnail
+      with PDFThumbnailGenerator(self.get_document_path(document.pdf_file_hash)) as gen:
+        gen.create_thumbnail(tmp_thumbnail_path, page_num=0, width=200)
+
+      # read the rendered image once and reuse the bytes for hashing and storing
+      thumbnail_bytes = self._read_file_bytes(tmp_thumbnail_path)
+      thumbnail_file_hash = self._calculate_file_hash(thumbnail_bytes)
+
+      # save the thumbnail to the objects folder
+      self.save_content_if_needed(thumbnail_file_hash, thumbnail_bytes)
+    finally:
+      # never leave the temporary image behind, even when rendering, hashing
+      # or saving failed
+      if os.path.exists(tmp_thumbnail_path):
+        os.remove(tmp_thumbnail_path)
+
+    # update the document
+    self.update_document(document_id, {"thumbnail_file_hash": thumbnail_file_hash})
+    logger.info(f"  Created thumbnail {thumbnail_file_hash}")
  
  def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
    """
@@ -348,7 +395,7 @@ class DocumentService:
    return self.document_repository.find_document_by_filepath(filepath)
  
  def get_document_content_by_hash(self, file_hash):
-    target_path = self._get_document_path(file_hash)
+    target_path = self.get_document_path(file_hash)
    if not os.path.exists(target_path):
      return None
    
@@ -439,7 +486,7 @@ class DocumentService:
      # If no other files reference this content, delete it
      if not remaining_files:
        try:
-          os.remove(self._get_document_path(document.file_hash))
+          os.remove(self.get_document_path(document.file_hash))
        except Exception:
          pass
      
--- a/src/file-processor/app/utils/pdf_annotation.py
+++ b/src/file-processor/app/utils/pdf_annotation.py
@@ -0,0 +1,241 @@
+import fitz  # PyMuPDF
+
+
+class PDFAnnotator:
+  """
+  Thin wrapper around a PyMuPDF document for adding and managing annotations.
+
+  The PDF is opened in the constructor and stays open until close() is
+  called or the instance leaves a `with` block. Changes are only written to
+  disk by save().
+  """
+
+  def __init__(self, pdf_path):
+    """
+    Open the PDF that will be annotated.
+
+    Args:
+        pdf_path: Path to the PDF file
+    """
+    self.doc = fitz.open(pdf_path)
+
+  def add_highlight(self, rect, page_num=0, color=(1, 1, 0)):
+    """
+    Add highlight annotation
+
+    Args:
+        rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
+        page_num: Page number (0-indexed), default first page
+        color: RGB tuple (0-1 range), default yellow
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_highlight_annot(rect)
+    annot.set_colors(stroke=color)
+    annot.update()
+    return annot
+
+  def add_rectangle(self, rect, page_num=0, color=(1, 0, 0), width=2):
+    """
+    Add rectangle annotation (border only)
+
+    Args:
+        rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
+        page_num: Page number (0-indexed), default first page
+        color: RGB tuple (0-1 range), default red
+        width: Line width in points
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_rect_annot(rect)
+    annot.set_colors(stroke=color)
+    annot.set_border(width=width)
+    annot.update()
+    return annot
+
+  def add_text_note(self, point, text, page_num=0, icon="Note"):
+    """
+    Add sticky note annotation
+
+    Args:
+        point: (x, y) position tuple
+        text: Note content string
+        page_num: Page number (0-indexed), default first page
+        icon: "Note", "Comment", "Help", "Insert", "Key", etc.
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_text_annot(point, text, icon=icon)
+    annot.update()
+    return annot
+
+  def add_free_text(self, rect, text, page_num=0, fontsize=12,
+                    color=(0, 0, 0)):
+    """
+    Add free text annotation (visible text box)
+
+    Args:
+        rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
+        text: Text content string
+        page_num: Page number (0-indexed), default first page
+        fontsize: Font size in points
+        color: Text color RGB tuple (0-1 range)
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_freetext_annot(
+      rect,
+      text,
+      fontsize=fontsize,
+      text_color=color
+    )
+    annot.update()
+    return annot
+
+  def add_arrow(self, start_point, end_point, page_num=0,
+                color=(1, 0, 0), width=2):
+    """
+    Add arrow annotation
+
+    Args:
+        start_point: (x, y) tuple for arrow start
+        end_point: (x, y) tuple for arrow end
+        page_num: Page number (0-indexed), default first page
+        color: Arrow color RGB tuple (0-1 range), default red
+        width: Line width in points
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_line_annot(start_point, end_point)
+    annot.set_colors(stroke=color)
+    annot.set_border(width=width)
+    # No decoration at the start, closed arrow head at the end. The named
+    # constants are used because the raw value 1 is the *square* line end,
+    # not an arrow.
+    annot.set_line_ends(fitz.PDF_ANNOT_LE_NONE, fitz.PDF_ANNOT_LE_CLOSED_ARROW)
+    annot.update()
+    return annot
+
+  def add_stamp(self, rect, page_num=0, stamp_type=0):
+    """
+    Add stamp annotation
+
+    Args:
+        rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
+        page_num: Page number (0-indexed), default first page
+        stamp_type: Integer for stamp type (the fitz.STAMP_* constants):
+                   0=Approved, 1=AsIs, 2=Confidential,
+                   3=Departmental, 4=Experimental, 5=Expired,
+                   6=Final, 7=ForComment, 8=ForPublicRelease,
+                   9=NotApproved, 10=NotForPublicRelease, 11=Sold,
+                   12=TopSecret, 13=Draft
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_stamp_annot(rect, stamp=stamp_type)
+    annot.update()
+    return annot
+
+  def add_redaction(self, rect, page_num=0, fill_color=(0, 0, 0)):
+    """
+    Add redaction annotation (marks area for redaction)
+    Note: Use apply_redactions() to permanently remove content
+
+    Args:
+        rect: (x0, y0, x1, y1) area to redact, tuple or fitz.Rect
+        page_num: Page number (0-indexed), default first page
+        fill_color: RGB tuple (0-1 range) for redacted area, default black
+
+    Returns:
+        The created annotation
+    """
+    page = self.doc[page_num]
+    annot = page.add_redact_annot(rect, fill=fill_color)
+    annot.update()
+    return annot
+
+  def apply_redactions(self, page_num=0, images=2, graphics=2, text=2):
+    """
+    Apply all redaction annotations on a page (permanent removal)
+
+    Args:
+        page_num: Page number (0-indexed), default first page
+        images: 2=remove, 1=blank, 0=ignore
+        graphics: 2=remove, 1=blank, 0=ignore
+        text: 2=remove, 1=blank, 0=ignore
+
+    Returns:
+        True if redactions were applied, False otherwise
+    """
+    page = self.doc[page_num]
+    # Only call into PyMuPDF when the page actually carries redaction marks
+    has_redactions = any(
+      annot.type[0] == fitz.PDF_ANNOT_REDACT for annot in page.annots()
+    )
+
+    if has_redactions:
+      page.apply_redactions(images=images, graphics=graphics, text=text)
+      return True
+    return False
+
+  def get_all_annotations(self, page_num=0):
+    """
+    Retrieve all annotations from a page
+
+    Args:
+        page_num: Page number (0-indexed), default first page
+
+    Returns:
+        List of dicts with the keys 'type', 'rect', 'content', 'author',
+        'created' and 'colors', one dict per annotation
+    """
+    page = self.doc[page_num]
+    annotations = []
+
+    for annot in page.annots():
+      info = {
+          'type': annot.type[1],  # Annotation type name
+          'rect': annot.rect,
+          'content': annot.info.get('content', ''),
+          'author': annot.info.get('title', ''),
+          'created': annot.info.get('creationDate', ''),
+          'colors': annot.colors
+      }
+      annotations.append(info)
+
+    return annotations
+
+  def remove_all_annotations(self, page_num=0):
+    """
+    Remove all annotations from a page
+
+    Args:
+        page_num: Page number (0-indexed), default first page
+    """
+    page = self.doc[page_num]
+    for annot in page.annots():
+      page.delete_annot(annot)
+
+  def save(self, output_path):
+    """Save the annotated PDF"""
+    self.doc.save(output_path)
+
+  def close(self):
+    """Close the underlying PyMuPDF document"""
+    self.doc.close()
+
+  def __enter__(self):
+    return self
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    # Always release the document; exceptions from the with-body propagate
+    self.close()
+
+
+# Example usage: annotate the first page of input.pdf and write a copy
+if __name__ == "__main__":
+  with PDFAnnotator("input.pdf") as annotator:
+    # Yellow highlight
+    annotator.add_highlight(rect=(100, 100, 300, 120), page_num=0, color=(1, 1, 0))
+
+    # Red rectangle outline, 3 points wide
+    annotator.add_rectangle(rect=(100, 150, 300, 250), page_num=0, color=(1, 0, 0), width=3)
+
+    # Sticky note shown with the "Comment" icon
+    annotator.add_text_note(point=(400, 100), text="This is important!", page_num=0, icon="Comment")
+
+    # Visible red text box
+    annotator.add_free_text(
+      rect=(100, 300, 400, 350),
+      text="DRAFT VERSION",
+      page_num=0,
+      fontsize=20,
+      color=(1, 0, 0)
+    )
+
+    # Blue arrow pointing at something
+    annotator.add_arrow(
+      start_point=(450, 100),
+      end_point=(500, 200),
+      page_num=0,
+      color=(0, 0, 1),
+      width=2
+    )
+
+    # "Approved" stamp (stamp type 0)
+    annotator.add_stamp(rect=(450, 300, 550, 350), page_num=0, stamp_type=0)
+
+    # Mark an area for redaction (black box over sensitive info), then apply it
+    annotator.add_redaction(rect=(100, 400, 300, 420), page_num=0)
+    annotator.apply_redactions(page_num=0)
+
+    # Report what is now on the page
+    annots = annotator.get_all_annotations(page_num=0)
+    print(f"Found {len(annots)} annotations:")
+    for a in annots:
+      print(f"  - {a['type']} at {a['rect']}")
+
+    # Write the annotated PDF
+    annotator.save("output_annotated.pdf")
--- a/src/file-processor/app/utils/pdf_converter.py
+++ b/src/file-processor/app/utils/pdf_converter.py
@@ -127,6 +127,15 @@ class TextToPdfConverter(BaseConverter):
    return self


+class PdfToPdfConverter(BaseConverter):
+  """
+  Converter for PDF files to PDF.
+
+  The input is already a PDF, so the "conversion" is a plain file copy from
+  ``self.input_path`` to ``self.output_path``.
+  """
+
+  def convert(self) -> Self:
+    """
+    Copy the source PDF to the output location.
+
+    Returns:
+        Self: This converter, so calls can be chained
+
+    Raises:
+        OSError: If the source cannot be read or the destination cannot be written
+    """
+    # Imported here so this block does not depend on the file's import section
+    import shutil
+
+    try:
+      # Standard-library copy instead of a shell "cp" command line: paths containing
+      # spaces or shell metacharacters are handled safely, it works on any platform,
+      # and a failed copy raises instead of being silently ignored
+      shutil.copy(self.input_path, self.output_path)
+    except shutil.SameFileError:
+      # Source and destination are the same file: the output already exists as required
+      pass
+    return self
+
+
 class ImageToPdfConverter(BaseConverter):
  """Converter for image files to PDF."""
  
@@ -191,6 +200,8 @@ def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    converter = ImageToPdfConverter(filepath, output_dir=output_dir)
  elif file_type == "word":
    converter = WordToPdfConverter(filepath, output_dir=output_dir)
+  elif file_type == "pdf":
+    converter = PdfToPdfConverter(filepath, output_dir=output_dir)
  else:
    raise ValueError(f"Unsupported file type: {file_type}")
  
--- a/src/file-processor/app/utils/pdf_thumbmail.py
+++ b/src/file-processor/app/utils/pdf_thumbmail.py
@@ -0,0 +1,167 @@
+from pathlib import Path
+
+import fitz  # PyMuPDF
+
+
+class PDFThumbnailGenerator:
+  """
+  Render pages of a PDF to thumbnail images using PyMuPDF (``fitz``).
+
+  The document is opened in the constructor and stays open until ``close()``
+  is called; used as a context manager, the instance closes itself on exit.
+  """
+
+  def __init__(self, pdf_path):
+    """
+    Initialize PDF thumbnail generator
+
+    Args:
+        pdf_path: Path to the PDF file (string or Path object)
+    """
+    self.pdf_path = pdf_path
+    self.doc = fitz.open(pdf_path)
+
+  def create_thumbnail(self, output_path, page_num=0, width=200, rotation=0, zoom_factor=1.0):
+    """
+    Create a thumbnail with zoom and rotation
+
+    Args:
+        output_path: Path to save the thumbnail (string or Path)
+        page_num: Page number (0-indexed), default first page
+        width: Desired width in pixels before ``zoom_factor`` is applied, default 200
+        rotation: Rotation angle in degrees (0, 90, 180, 270), default 0.
+                  Set on the page for this render only
+        zoom_factor: Additional zoom multiplier (1.0 = normal, 2.0 = 2x), default 1.0
+
+    Returns:
+        Dict with thumbnail info (width, height, rotation, zoom)
+
+    Raises:
+        ValueError: If ``width`` or ``zoom_factor`` is not positive
+    """
+    if width <= 0:
+      raise ValueError(f"width must be positive, got {width}")
+    if zoom_factor <= 0:
+      raise ValueError(f"zoom_factor must be positive, got {zoom_factor}")
+
+    page = self.doc[page_num]
+
+    # set_rotation changes the page of the open document. Remember the current value
+    # and put it back afterwards, otherwise later calls on the same page
+    # (create_cropped_thumbnail, get_page_info) would depend on this call
+    previous_rotation = page.rotation
+    page.set_rotation(rotation)
+    try:
+      # Zoom is computed after the rotation has been set on the page
+      final_zoom = (width / page.rect.width) * zoom_factor
+      mat = fitz.Matrix(final_zoom, final_zoom)
+
+      # Render without alpha channel and write the image file
+      pix = page.get_pixmap(matrix=mat, alpha=False)
+      pix.save(output_path)
+    finally:
+      page.set_rotation(previous_rotation)
+
+    return {
+        'width': pix.width,
+        'height': pix.height,
+        'rotation': rotation,
+        'zoom': zoom_factor
+    }
+
+  def create_cropped_thumbnail(self, output_path, crop_rect=None, page_num=0, width=200):
+    """
+    Create a thumbnail of a specific region (zoom on area)
+
+    Args:
+        output_path: Path to save the thumbnail (string or Path)
+        crop_rect: Tuple (x0, y0, x1, y1) in PDF coordinates for cropping,
+                  or None for full page, default None
+        page_num: Page number (0-indexed), default first page
+        width: Desired width in pixels, default 200
+
+    Returns:
+        Tuple (width, height) of the generated thumbnail
+
+    Raises:
+        ValueError: If ``width`` is not positive or the area to render has no width
+    """
+    if width <= 0:
+      raise ValueError(f"width must be positive, got {width}")
+
+    page = self.doc[page_num]
+
+    # Any falsy crop_rect (None included) means "render the full page"
+    rect = fitz.Rect(crop_rect) if crop_rect else page.rect
+    if rect.width <= 0:
+      raise ValueError(f"area to render must have a positive width, got {rect}")
+
+    # Scale so that the selected area ends up `width` pixels wide
+    zoom = width / rect.width
+    mat = fitz.Matrix(zoom, zoom)
+
+    # Render only the specified rectangle
+    pix = page.get_pixmap(matrix=mat, clip=rect)
+    pix.save(output_path)
+
+    return pix.width, pix.height
+
+  def get_page_info(self, page_num=0):
+    """
+    Get information about a specific page
+
+    Args:
+        page_num: Page number (0-indexed), default first page
+
+    Returns:
+        Dict with page information (width, height, rotation, number, total_pages);
+        ``number`` is 1-based (``page_num + 1``)
+    """
+    page = self.doc[page_num]
+    return {
+        'width': page.rect.width,
+        'height': page.rect.height,
+        'rotation': page.rotation,
+        'number': page_num + 1,
+        'total_pages': len(self.doc)
+    }
+
+  def create_multi_resolution_thumbnails(self, output_folder, page_num=0, sizes=(150, 300, 600)):
+    """
+    Create multiple thumbnails at different resolutions
+
+    Files are written as ``thumb_<size>px.png`` inside ``output_folder``, which is
+    created (with parents) when missing.
+
+    Args:
+        output_folder: Folder path to save thumbnails (string or Path)
+        page_num: Page number (0-indexed), default first page
+        sizes: Iterable of widths in pixels, default (150, 300, 600)
+
+    Returns:
+        Dict mapping each size to thumbnail info
+    """
+    output_folder = Path(output_folder)
+    output_folder.mkdir(exist_ok=True, parents=True)
+
+    results = {}
+    for size in sizes:
+      output_path = output_folder / f"thumb_{size}px.png"
+      results[size] = self.create_thumbnail(output_path, page_num=page_num, width=size)
+
+    return results
+
+  def close(self):
+    """Close the PDF document and free resources"""
+    self.doc.close()
+
+  def __enter__(self):
+    """Enter the ``with`` block; the generator itself is the managed object"""
+    return self
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    """Close the document when the ``with`` block ends; exceptions propagate"""
+    self.close()
+
+
+# Example usage
+if __name__ == "__main__":
+  # The three plain thumbnails differ only in file name and one extra option:
+  # standard, rotated by 90 degrees, and zoomed 2x
+  variants = (
+      ("thumb_standard.png", {}),
+      ("thumb_rotated.png", {"rotation": 90}),
+      ("thumb_zoomed.png", {"zoom_factor": 2.0}),
+  )
+
+  # Context manager closes the document when the block ends
+  with PDFThumbnailGenerator("example.pdf") as gen:
+    for filename, extra in variants:
+      gen.create_thumbnail(filename, page_num=0, width=200, **extra)
+
+    # Cropped/zoomed on specific area (x0, y0, x1, y1)
+    area = (100, 100, 400, 400)
+    gen.create_cropped_thumbnail("thumb_crop.png", crop_rect=area, page_num=0, width=300)
+
+    # Same page at several widths, one file per width
+    gen.create_multi_resolution_thumbnails("thumbnails/", page_num=0, sizes=[150, 300, 600])
+
+    # Print the page metadata
+    info = gen.get_page_info(page_num=0)
+    print(f"Page info: {info}")
--- a/src/file-processor/app/utils/security.py
+++ b/src/file-processor/app/utils/security.py
@@ -4,9 +4,10 @@ Password security utilities using bcrypt for secure password hashing.
 This module provides secure password hashing and verification functions
 using the bcrypt algorithm with automatic salt generation.
 """
+import re
+import uuid

 import bcrypt
-from typing import Union


 def hash_password(password: str) -> str:
@@ -71,4 +72,33 @@ def verify_password(password: str, hashed_password: str) -> bool:
    # bcrypt raises ValueError for malformed hashes
    raise RuntimeError(f"Invalid hash format: {str(e)}")
  except Exception as e:
-    raise RuntimeError(f"Failed to verify password: {str(e)}")
+    raise RuntimeError(f"Failed to verify password: {str(e)}")
+
+
+def generate_uuid_filename() -> str:
+  """
+  Build a unique file name from a random UUID.
+
+  Returns:
+      str: Text form of a freshly generated UUID4 (no extension is added)
+  """
+  fresh_id = uuid.uuid4()
+  return str(fresh_id)
+
+
+def safe_connection_string(connection_string: str) -> str:
+  """
+  Mask the password in a MongoDB connection string.
+
+  Both ``mongodb://`` and ``mongodb+srv://`` URLs are handled. A string without a
+  ``username:password@`` part is returned unchanged.
+
+  Args:
+      connection_string (str): The complete MongoDB connection string
+
+  Returns:
+      str: The connection string with password replaced by asterisks
+
+  Example:
+      >>> safe_connection_string("mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin")
+      'mongodb://admin:*****@mongodb:27017/mydocmanager?authSource=admin'
+  """
+  # Format: mongodb[+srv]://username:password@host[:port][,host...]/database?options
+  #   group 1 - scheme and username, up to and including the ":" before the password
+  #   group 2 - the password: greedy but confined to the text before the first "/" or "?",
+  #             so it ends at the LAST "@" there (a password containing "@" is masked
+  #             entirely) and an "@" in the path or options never causes a match
+  #   group 3 - the "@" separating the credentials from the host list
+  pattern = r'(mongodb(?:\+srv)?://[^:@/?]+:)([^/?]+)(@)'
+
+  # Replace password with asterisks, keeping everything around it
+  return re.sub(pattern, r'\1*****\3', connection_string)
--- a/src/file-processor/requirements.txt
+++ b/src/file-processor/requirements.txt
@@ -10,6 +10,7 @@ pillow==11.3.0
 pydantic==2.11.9
 PyJWT==2.10.1
 pymongo==4.15.0
+PyMuPDF==1.26.4
 pypandoc==1.15
 python-multipart==0.0.20
 redis==6.4.0
--- a/src/frontend/src/components/common/Menu.jsx
+++ b/src/frontend/src/components/common/Menu.jsx
@@ -1,11 +1,13 @@
 import {FaBuffer, FaPlus} from "react-icons/fa6";
+import { Link } from "react-router-dom";

 const Menu = () => {
  return (
    <div className="p-4">
      <ul className="menu">
        <li className="menu-title">Exploration</li>
-        <li><a><FaBuffer/>To Review</a></li>
+        <li><Link to="/dashboard"><FaBuffer/>Dashboard</Link></li>
+        <li><Link to="/documents"><FaBuffer/>To Review</Link></li>
        <li className="menu-title mt-4">Catégories</li>
        <li><a><i className="fas fa-plus"></i>Item</a></li>
      </ul>
--- a/src/frontend/src/components/documents/DocumentCard.jsx
+++ b/src/frontend/src/components/documents/DocumentCard.jsx
@@ -64,8 +64,8 @@ const DocumentCard = memo(({ document, viewMode, onEdit, onDelete }) => {
  const renderThumbnail = () => (
    <figure className="relative overflow-hidden">
      <img
-        src={thumbnailUrl}
-        alt={`${name} thumbnail`}
+        src={`http://localhost:8000${thumbnailUrl}`}
+        alt={`${thumbnailUrl} thumbnail`}
        className={`w-full object-cover ${
          viewMode === 'small' ? 'h-32' : viewMode === 'large' ? 'h-48' : 'h-64'
        }`}
--- a/src/frontend/src/services/documentService.js
+++ b/src/frontend/src/services/documentService.js
@@ -5,17 +5,24 @@
 */

 import { mockDocuments, availableTags, availableCategories } from '../utils/mockData';
+import api from '../utils/api';

 // Simulate network delay
 const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

 /**
- * Fetches all documents
+ * Fetches all documents from the API
 * @returns {Promise<Array>} Array of document objects
 */
 export const getAllDocuments = async () => {
-  await delay(500); // Simulate network latency
-  return [...mockDocuments];
+  try {
+    const response = await api.get('/api/documents');
+    return response.data;
+  } catch (error) {
+    console.error('Failed to fetch documents:', error);
+    // Fallback to mock data in case of API error during development.
+    // A copy is returned so callers always receive an array (never undefined)
+    // and cannot mutate the shared mock list.
+    console.warn('Falling back to mock data');
+    return [...mockDocuments];
+  }
 };

 /**
--- a/src/worker/requirements.txt
+++ b/src/worker/requirements.txt
@@ -10,6 +10,7 @@ pillow==11.3.0
 pydantic==2.11.9
 PyJWT==2.10.1
 pymongo==4.15.0
+PyMuPDF==1.26.4
 pypandoc==1.15
 python-multipart==0.0.20
 redis==6.4.0
--- a/src/worker/tasks/document_processing.py
+++ b/src/worker/tasks/document_processing.py
@@ -12,7 +12,7 @@ from typing import Any, Dict
 from app.config import settings
 from app.database.connection import get_database
 from app.models.job import ProcessingStatus
-from app.services.document_service import DocumentService
+from app.services.document_service import DocumentService, DocumentAlreadyExists
 from app.services.job_service import JobService
 from tasks.main import celery_app

@@ -26,7 +26,7 @@ def get_services():
  return document_service, job_service


-#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
+# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
@celery_app.task(bind=True)
 def process_document(self, filepath: str) -> Dict[str, Any]:
  """
@@ -48,7 +48,7 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
      Exception: Any processing error (will trigger retry)
  """
  task_id = self.request.id
-  logger.info(f"Starting document processing task {task_id} for file: {filepath}")
+  logger.info(f'Task {task_id} : Starting document processing for file: "{filepath}"')
  
  # get services
  document_service, job_service = get_services()
@@ -60,12 +60,16 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
    document = document_service.create_document(filepath)
    job = job_service.create_job(task_id=task_id, document_id=document.id)
    job_service.mark_job_as_started(job_id=job.id)
-    logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
+    logger.info(f'Task {task_id} : Created document "{document.id}". Started job "{job.id}"')
    
    logger.info(f"Task {task_id} : Creating associated PDF")
    job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
    document_service.create_pdf(document.id)
    
+    logger.info(f"Task {task_id} : Creating thumbnail")
+    job_service.update_job_status(job_id=job.id, status=ProcessingStatus.CREATING_THUMBNAIL)
+    document_service.create_thumbnail(document.id)
+    
    # remove the file from the watch folder
    os.remove(filepath)
    
@@ -79,6 +83,19 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
        "status": "completed",
    }
  
+  except DocumentAlreadyExists as e:
+    logger.info(f"Task {task_id} completed: {str(e)}")
+    if job is not None:
+      job_service.mark_job_as_completed(job_id=job.id)
+      logger.info(f"Job {task_id} marked as COMPLETED")
+    
+    return {
+        "task_id": task_id,
+        "filepath": filepath,
+        "status": "completed",
+        "message": str(e),
+    }
+  
  except Exception as e:
    error_message = f"Document processing failed: {str(e)}"
    logger.error(f"Task {task_id} failed: {error_message}")
--- a/tests/services/test_document_service.py
+++ b/tests/services/test_document_service.py
@@ -618,7 +618,7 @@ class TestCreatePdf:
    assert updated_doc.pdf_file_hash == pdf_hash
    
    # Verify convert_to_pdf was called with correct arguments
-    doc_path = document_service._get_document_path(created_doc.file_hash)
+    doc_path = document_service.get_document_path(created_doc.file_hash)
    mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
    
    # Verify content exists on disk
@@ -694,7 +694,7 @@ class TestCreatePdf:
    )
    
    # Simulate missing content by removing file
-    file_path = document_service._get_document_path(created_doc.file_hash)
+    file_path = document_service.get_document_path(created_doc.file_hash)
    os.remove(file_path)
    
    # Execute