Compare commits

6 Commits

43 changed files with 2845 additions and 186 deletions

32
Makefile Normal file
View File

@@ -0,0 +1,32 @@
# Every target in this file is a command, not a file to build.
.PHONY: init up down restart logs clean rebuild

# Host directories that docker-compose bind-mounts into the containers.
# All of them are created up front and handed to UID/GID 1002 (the user the
# image switches to); a mount source that is missing when the stack starts
# would otherwise be created by Docker with root ownership and the services
# could not move files into it.
VOLUME_DIRS := ./volumes/watched_files ./volumes/objects ./volumes/errors ./volumes/ignored

# Create the bind-mount directories and give them to the container user.
init:
	@echo "Creating directories and setting permissions..."
	@mkdir -p $(VOLUME_DIRS)
	@chown -R 1002:1002 $(VOLUME_DIRS)
	@echo "✓ Directories initialized"

# Start the whole stack in the background (directories are prepared first).
up: init
	@echo "Starting services..."
	@docker-compose up -d
	@echo "✓ Services started"

# Stop and remove the containers.
down:
	@docker-compose down

# Restart the running containers without recreating them.
restart:
	@docker-compose restart

# Follow the logs of all services.
logs:
	@docker-compose logs -f

# Stop the stack and delete every volume directory.
# sudo is needed because the content is owned by UID 1002.
clean: down
	@echo "Cleaning volumes..."
	@sudo rm -rf ./volumes
	@echo "✓ Volumes cleaned"

# Wipe the volumes, rebuild the images from scratch and start the stack.
rebuild: clean init
	@echo "Rebuilding images..."
	@docker-compose build --no-cache
	@docker-compose up -d
	@echo "✓ Services rebuilt and started"

View File

@@ -40,6 +40,8 @@ services:
- ./src/worker/tasks:/app/tasks # <- Added: shared access to worker tasks
- ./volumes/watched_files:/watched_files
- ./volumes/objects:/objects
- ./volumes/errors:/errors
- ./volumes/ignored:/ignored
depends_on:
- redis
- mongodb
@@ -61,6 +63,9 @@ services:
- ./src/worker:/app
- ./src/file-processor/app:/app/app # <- Added: shared access file-processor app
- ./volumes/watched_files:/watched_files
- ./volumes/objects:/objects
- ./volumes/errors:/errors
- ./volumes/ignored:/ignored
depends_on:
- redis
- mongodb

View File

@@ -13,6 +13,7 @@ click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
cryptography==46.0.1
Deprecated==1.2.18
dnspython==2.8.0
ecdsa==0.19.1
email-validator==2.3.0
@@ -32,6 +33,7 @@ mongomock==4.3.0
mongomock-motor==0.0.36
motor==3.7.1
packaging==25.0
pikepdf==9.11.0
pillow==11.3.0
pipdeptree==2.28.0
pluggy==1.6.0
@@ -44,6 +46,7 @@ pydantic_core==2.33.2
Pygments==2.19.2
PyJWT==2.10.1
pymongo==4.15.1
PyMuPDF==1.26.4
pypandoc==1.15
pytest==8.4.2
pytest-asyncio==1.2.0
@@ -72,4 +75,5 @@ watchdog==6.0.0
watchfiles==1.1.0
wcwidth==0.2.13
websockets==15.0.1
wrapt==1.17.3
zipp==3.23.0

View File

@@ -12,10 +12,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
texlive-xetex \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Change the user
USER 1002:1002
# Copy application code
COPY . .
@@ -24,5 +28,6 @@ ENV PYTHONPATH=/app
# Expose port
EXPOSE 8000
# Command will be overridden by docker-compose
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -9,6 +9,7 @@ from app.database.connection import get_database
from app.models.auth import UserRole
from app.models.user import UserInDB
from app.services.auth_service import AuthService
from app.services.document_service import DocumentService
from app.services.user_service import UserService
security = HTTPBearer()
@@ -25,6 +26,12 @@ def get_user_service() -> UserService:
return UserService(database)
def get_document_service() -> DocumentService:
    """
    FastAPI dependency that provides a DocumentService.

    Returns:
        DocumentService: Service built on the database handle returned by
        get_database()
    """
    return DocumentService(get_database())
def get_current_user(
credentials: HTTPAuthorizationCredentials = Depends(security),
user_service: UserService = Depends(get_user_service)
@@ -79,7 +86,7 @@ def get_current_user(
return user
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
"""
Dependency to ensure current user has admin role.

View File

@@ -0,0 +1,241 @@
"""
Document API routes.
This module provides REST endpoints for document management operations.
"""
import logging
import os
from typing import List, Optional
import fitz # PyMuPDF
from fastapi import APIRouter, Depends, HTTPException, Query, status, Path
from starlette.responses import Response
from app.api.dependencies import get_document_service, get_current_user
from app.models.document import DocumentResponse, FileDocument
from app.services.document_service import DocumentService
logger = logging.getLogger(__name__)
router = APIRouter(tags=["Documents"])
def _count_pdf_pages(pdf_file_path: str) -> int:
    """
    Return how many pages a PDF file has, as reported by PyMuPDF.

    Args:
        pdf_file_path: Location of the PDF on disk

    Returns:
        The document's page count; 0 when the file cannot be opened or read
    """
    try:
        with fitz.open(pdf_file_path) as pdf:
            total_pages = pdf.page_count
    except Exception as e:
        # Best effort: a broken PDF must not break the document listing.
        logger.warning(f"Could not count pages for PDF {pdf_file_path}: {e}")
        total_pages = 0
    return total_pages
def _build_object_url(file_hash: Optional[str]) -> Optional[str]:
    """
    Turn an object hash into the URL under which the object is served.

    Args:
        file_hash: Hash identifying the stored object (may be None or empty)

    Returns:
        "/api/objects/<hash>", or None when no hash is given
    """
    return f"/api/objects/{file_hash}" if file_hash else None
def _extract_metadata_field(metadata: dict, field_name: str) -> List[str]:
    """
    Read a list-valued entry from a metadata dictionary as strings.

    Args:
        metadata: Document metadata dictionary
        field_name: Key to look up

    Returns:
        Every item of the entry converted with str(); an empty list when the
        key is absent or its value is not a list
    """
    raw_value = metadata.get(field_name, [])
    if not isinstance(raw_value, list):
        return []
    return list(map(str, raw_value))
def _map_file_document_to_response(
        document: FileDocument,
        document_service: DocumentService
) -> DocumentResponse:
    """
    Convert a stored FileDocument into the DocumentResponse sent to clients.

    Args:
        document: FileDocument instance from database
        document_service: Service used to locate the stored PDF on disk

    Returns:
        DocumentResponse built from the document's fields
    """
    # Pages can only be counted when a PDF rendition is actually stored.
    pdf_hash = document.pdf_file_hash
    if pdf_hash and document_service.exists(pdf_hash):
        page_count = _count_pdf_pages(document_service.get_document_path(pdf_hash))
    else:
        page_count = 0

    # The detection timestamp doubles as the creation date of the response.
    detected_at = document.detected_at

    as_dict = {
        "id": str(document.id),
        "name": document.filename,
        "original_file_type": document.file_type.value.upper(),
        "created_at": detected_at.isoformat() if detected_at else "",
        "file_size": document.file_size,
        "page_count": page_count,
        "thumbnail_url": _build_object_url(document.thumbnail_file_hash),
        "pdf_url": _build_object_url(pdf_hash),
        "tags": _extract_metadata_field(document.metadata, "tags"),
        "categories": _extract_metadata_field(document.metadata, "categories")
    }
    logger.info(f"Document: {as_dict}")
    return DocumentResponse(**as_dict)
@router.get("/documents", response_model=List[DocumentResponse])
def list_documents(
        skip: int = Query(0, ge=0, description="Number of documents to skip"),
        limit: int = Query(100, ge=1, le=1000, description="Maximum number of documents to return"),
        UserInDB=Depends(get_current_user),
        document_service: DocumentService = Depends(get_document_service)
) -> List[DocumentResponse]:
    """
    Return one page of documents in API response format.

    Args:
        skip: Number of documents to skip for pagination
        limit: Maximum number of documents to return
        UserInDB: Result of the get_current_user dependency; unused in the
            body, declared so the dependency runs for every request
        document_service: Document service instance

    Returns:
        The requested documents mapped to DocumentResponse

    Raises:
        HTTPException: 500 when fetching or mapping the documents fails
    """
    try:
        stored_documents = document_service.list_documents(skip=skip, limit=limit)

        responses = []
        for stored in stored_documents:
            responses.append(_map_file_document_to_response(stored, document_service))
        return responses
    except Exception as e:
        logger.error(f"Failed to list documents: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to retrieve documents"
        )
@router.get("/objects/{file_hash}")
async def get_object_by_hash(
        file_hash: str = Path(..., description="SHA256 hash of the object to retrieve"),
        document_service: DocumentService = Depends(get_document_service)
):
    """
    Serve object content by its hash.

    This endpoint serves files (original documents, PDFs, thumbnails) by their
    SHA256 hash. It supports all file types stored in the objects folder.

    Args:
        file_hash: SHA256 hash of the object
        document_service: Document service dependency

    Returns:
        Response carrying the object bytes, with the media type detected
        from the content and a one-hour public cache header

    Raises:
        HTTPException: 404 when the object is unknown, missing on disk or has
            no content; 500 when reading or type detection fails
    """
    try:
        # Check if object exists
        if not document_service.exists(file_hash):
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object not found"
            )

        # Get file path
        file_path = document_service.get_document_path(file_hash)

        # Verify file exists on disk (it may vanish between the two checks)
        if not os.path.exists(file_path):
            logger.error(f"Object {file_hash} registered but file not found at {file_path}")
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object file not found on disk"
            )

        try:
            file_content = document_service.get_document_content_by_hash(file_hash)
            if not file_content:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail="Object content not available"
                )

            # Detect MIME type from the bytes themselves
            import magic
            mime_type = magic.from_buffer(file_content, mime=True)

            # Return file content with appropriate headers
            return Response(
                content=file_content,
                media_type=mime_type,
                headers={
                    "Content-Length": str(len(file_content)),
                    "Cache-Control": "public, max-age=3600"  # Cache for 1 hour
                }
            )
        except HTTPException:
            # HTTPException is an Exception subclass: without this clause the
            # 404 raised just above would be caught below and become a 500.
            raise
        except Exception as e:
            logger.error(f"Error reading object content for hash {file_hash}: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to read object content"
            ) from e

    except HTTPException:
        # Re-raise HTTP exceptions as-is
        raise
    except Exception as e:
        logger.error(f"Unexpected error serving object {file_hash}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Internal server error while serving object"
        ) from e

View File

@@ -105,4 +105,14 @@ def get_watch_folder() -> str:
def get_temp_folder() -> str:
"""Directory to store temporary files"""
return os.getenv("TEMP_DIRECTORY", "/temp")
return os.getenv("TEMP_DIRECTORY", "/tmp")
def get_errors_folder() -> str:
    """
    Directory that receives files set aside in the error folder.

    Read from the ERRORS_DIRECTORY environment variable; defaults to "/errors".
    """
    return os.getenv("ERRORS_DIRECTORY", "/errors")
def get_ignored_folder() -> str:
    """
    Directory that receives files the pipeline decides to ignore.

    Read from the IGNORED_DIRECTORY environment variable; defaults to "/ignored".
    """
    return os.getenv("IGNORED_DIRECTORY", "/ignored")

View File

@@ -4,7 +4,7 @@ MongoDB database connection management.
This module handles MongoDB connection with fail-fast approach.
The application will terminate if MongoDB is not accessible at startup.
"""
import logging
import sys
from typing import Optional
@@ -13,11 +13,14 @@ from pymongo.database import Database
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from app.config.settings import get_mongodb_url, get_mongodb_database_name
from app.utils.security import safe_connection_string
# Global variables for singleton pattern
_client: Optional[MongoClient] = None
_database: Optional[Database] = None
logger = logging.getLogger(__name__)
def create_mongodb_client() -> MongoClient:
"""
@@ -43,16 +46,16 @@ def create_mongodb_client() -> MongoClient:
# Test connection by running admin command
client.admin.command('ping')
print(f"Successfully connected to MongoDB at {mongodb_url}")
logger.info(f"Successfully connected to MongoDB at {safe_connection_string(mongodb_url)}")
return client
except (ConnectionFailure, ServerSelectionTimeoutError) as e:
print(f"ERROR: Failed to connect to MongoDB at {mongodb_url}")
print(f"Connection error: {str(e)}")
print("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
logger.error(f"ERROR: Failed to connect to MongoDB at {safe_connection_string(mongodb_url)}")
logger.error(f"Connection error: {str(e)}")
logger.error("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
sys.exit(1)
except Exception as e:
print(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
logger.error(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
sys.exit(1)
@@ -74,7 +77,7 @@ def get_database() -> Database:
database_name = get_mongodb_database_name()
_database = _client[database_name]
print(f"Connected to database: {database_name}")
logger.info(f"Connected to database: {database_name}")
return _database
@@ -92,7 +95,7 @@ def close_database_connection():
_client.close()
_client = None
_database = None
print("MongoDB connection closed")
logger.info("MongoDB connection closed")
def get_mongodb_client() -> Optional[MongoClient]:

View File

@@ -130,6 +130,47 @@ class FileDocumentRepository:
except PyMongoError:
return None
def find_document_with_pdf_hash(self, file_hash: str) -> Optional[FileDocument]:
    """
    Look up a document by content hash, restricted to documents whose
    pdf_file_hash is set (not None).

    Args:
        file_hash (str): SHA256 hash of file content

    Returns:
        FileDocument or None: Matching document; None when nothing matches
        or the query raises PyMongoError
    """
    query = {"file_hash": file_hash,
             "pdf_file_hash": {"$ne": None}}
    try:
        record = self.collection.find_one(query)
        return FileDocument(**record) if record else None
    except PyMongoError:
        return None
def find_same_document(self, filename: str, file_hash: str) -> Optional[FileDocument]:
    """
    Find a document that has both the same filename and the same file hash.

    Args:
        filename (str): Filename to match exactly
        file_hash (str): SHA256 hash of file content

    Returns:
        FileDocument or None: File document if found, None otherwise.
        A PyMongoError during the query also yields None, so callers cannot
        tell "no such document" from "lookup failed".
    """
    try:
        file_doc = self.collection.find_one({"file_hash": file_hash,
                                             "filename": filename})
        if file_doc:
            return FileDocument(**file_doc)
        return None
    except PyMongoError:
        return None
def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
Find file document by exact filepath.

View File

@@ -30,7 +30,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
dispatching Celery tasks, and managing processing jobs.
"""
SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}
SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx', '.jpg', '.png', '.jpeg'}
def __init__(self, document_service: DocumentService, job_service: JobService):
"""
@@ -59,6 +59,7 @@ class DocumentFileEventHandler(FileSystemEventHandler):
if file_extension not in self.SUPPORTED_EXTENSIONS:
logger.info(f"Ignoring unsupported file type: {filepath}")
self.document_service.move_to_ignored(filepath, "unsupported file type")
return
logger.info(f"Processing new file: {filepath}")

View File

@@ -17,6 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware
from app.api.routes.auth import router as auth_router
from app.api.routes.users import router as users_router
from app.api.routes.document import router as documents_router
from app.config import settings
from app.database.connection import get_database
from app.file_watcher import create_file_watcher, FileWatcher
@@ -111,7 +112,7 @@ app.add_middleware(
# Include routers
app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
app.include_router(users_router, prefix="/users", tags=["User Management"])
# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
app.include_router(documents_router, prefix="/api", tags=["Documents"])
# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])

View File

@@ -7,10 +7,9 @@ stored in MongoDB collections.
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
from bson import ObjectId
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field, field_validator, ConfigDict
from app.models.types import PyObjectId
@@ -49,6 +48,8 @@ class FileDocument(BaseModel):
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
thumbnail_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the thumbnail")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
@@ -68,3 +69,28 @@ class FileDocument(BaseModel):
if not v.strip():
raise ValueError("Filename cannot be empty")
return v.strip()
class DocumentResponse(BaseModel):
"""
Response model for document API endpoints.
Represents a document in the format expected by the frontend application.
Field names are automatically converted from snake_case to camelCase.
Example: the field ``original_file_type`` gets the alias ``originalFileType``.
"""
# alias_generator: splits the field name on '_', keeps the first word as it
# is and capitalizes every following word before joining them again.
# populate_by_name=True: instances can still be built with the snake_case
# field names, not only with the generated camelCase aliases.
model_config = ConfigDict(alias_generator=lambda field_name: ''.join(
word.capitalize() if i > 0 else word
for i, word in enumerate(field_name.split('_'))
), populate_by_name=True)
# Identification and basic file facts
id: str = Field(..., description="Document unique identifier")
name: str = Field(..., description="Document filename")
original_file_type: str = Field(..., description="Original file type before conversion")
created_at: str = Field(..., description="ISO timestamp when document was created")
file_size: int = Field(..., description="File size in bytes")
page_count: int = Field(..., description="Number of pages in the document")
# Links to derived objects; None when the object is not available
thumbnail_url: Optional[str] = Field(default=None, description="URL to document thumbnail")
pdf_url: Optional[str] = Field(default=None, description="URL to PDF version of document")
# Classification lists; default to empty lists
tags: List[str] = Field(default_factory=list, description="Document tags")
categories: List[str] = Field(default_factory=list, description="Document categories")

View File

@@ -14,6 +14,9 @@ class ProcessingStatus(str, Enum):
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
SAVING_OBJECT = "saving_object"
SAVING_PDF = "saving_pdf"
CREATING_THUMBNAIL = "creating_thumbnail"
FAILED = "failed"

View File

@@ -6,7 +6,9 @@ while maintaining data consistency through MongoDB transactions.
"""
import hashlib
import logging
import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
@@ -14,13 +16,28 @@ from typing import List, Optional, Dict, Any
import magic
from pymongo.errors import PyMongoError
from app.config.settings import get_objects_folder
from app.config.settings import get_objects_folder, get_temp_folder, get_errors_folder, get_ignored_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.models.document import (
FileDocument,
FileType,
)
from app.models.types import PyObjectId
from app.utils.pdf_converter import convert_to_pdf
from app.utils.pdf_thumbmail import PDFThumbnailGenerator
from app.utils.security import generate_uuid_filename
logger = logging.getLogger(__name__)
class DocumentAlreadyExists(Exception):
    """
    Raised when a document with the same filename and content hash is
    already stored.

    Attributes:
        message: Human-readable description of the conflict
    """

    def __init__(self, message):
        # Chain to Exception so args/str() are set the standard way.
        super().__init__(message)
        self.message = message
class DocumentProcessingError(Exception):
    """
    Raised when a processing step (PDF or thumbnail creation) cannot be
    carried out, e.g. because the document or its content is missing.

    Attributes:
        message: Human-readable description of the failure
    """

    def __init__(self, message):
        # Chain to Exception so args/str() are set the standard way.
        super().__init__(message)
        self.message = message
class DocumentService:
@@ -31,7 +48,11 @@ class DocumentService:
and their content while ensuring data consistency through transactions.
"""
def __init__(self, database, objects_folder: str = None):
def __init__(self, database,
objects_folder: str = None,
temp_folder: str = None,
errors_folder: str = None,
ignored_folder: str = None):
"""
Initialize the document service with repository dependencies.
@@ -43,6 +64,9 @@ class DocumentService:
self.db = database
self.document_repository = FileDocumentRepository(self.db)
self.objects_folder = objects_folder or get_objects_folder()
self.temp_folder = temp_folder or get_temp_folder()
self.errors_folder = errors_folder or get_errors_folder()
self.ignored_folder = ignored_folder or get_ignored_folder()
def initialize(self):
self.document_repository.initialize()
@@ -117,7 +141,40 @@ class DocumentService:
return path.read_bytes()
def _get_document_path(self, file_hash):
@staticmethod
def _get_safe_path(file_path):
    """
    Return a path that does not exist yet, derived from file_path.

    When file_path is free it is returned unchanged. Otherwise "_1", "_2", ...
    is inserted between the file stem and its suffix until a free name in the
    same directory is found.

    :param file_path: desired target path
    :return: file_path itself, or the first free numbered variant
    """
    wanted = Path(file_path)
    if not wanted.exists():
        return file_path

    folder, base_name, extension = wanted.parent, wanted.stem, wanted.suffix

    attempt = 0
    while True:
        attempt += 1
        candidate = os.path.join(folder, f"{base_name}_{attempt}{extension}")
        if not os.path.exists(candidate):
            return candidate
def get_document_path(self, file_hash):
"""
:param file_hash:
@@ -125,8 +182,13 @@ class DocumentService:
"""
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
def exists(self, file_hash):
    """
    Tell whether content for the given hash is present in the objects folder.

    :param file_hash: hash of the stored content; may be None or empty
    :return: True when the path returned by get_document_path() exists,
        False otherwise (including for a missing or empty hash)
    """
    # An empty hash would resolve to the objects folder itself, which exists;
    # treat it like a missing hash instead of reporting a false positive.
    if not file_hash:
        return False
    return os.path.exists(self.get_document_path(file_hash))
def save_content_if_needed(self, file_hash, content: bytes):
target_path = self._get_document_path(file_hash)
target_path = self.get_document_path(file_hash)
if os.path.exists(target_path):
return
@@ -136,6 +198,19 @@ class DocumentService:
with open(target_path, "wb") as f:
f.write(content)
def move_to_errors(self, document_id, file_path):
    """
    Move a file into the errors folder.

    The target name is "<document_id>_<original basename>"; if that name is
    already taken, _get_safe_path() picks a numbered variant.

    :param document_id: id of the document the file belongs to
    :param file_path: current location of the file
    """
    logger.info(f"Moving file {file_path} to error folder")
    # shutil.move fails when the target directory is missing, so make sure
    # it exists (it may not have been provisioned outside docker-compose).
    os.makedirs(self.errors_folder, exist_ok=True)
    error_file_name = f"{document_id}_{os.path.basename(file_path)}"
    error_file_path = self._get_safe_path(os.path.join(self.errors_folder, error_file_name))
    shutil.move(file_path, error_file_path)
def move_to_ignored(self, file_path, reason="Unknown"):
    """
    Move a file into the ignored folder, recording why it was ignored.

    The target name is "<timestamp>_### <reason> ###_<original basename>";
    if that name is already taken, _get_safe_path() picks a numbered variant.

    :param file_path: current location of the file
    :param reason: short free-text explanation embedded in the file name
    """
    logger.info(f"Moving file {file_path} to ignored folder")
    # shutil.move fails when the target directory is missing.
    os.makedirs(self.ignored_folder, exist_ok=True)
    # The reason ends up inside a file name: path separators in it would
    # point the move at a different (probably missing) directory.
    safe_reason = str(reason).replace("/", "_").replace("\\", "_")
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    ignored_file_name = f"{timestamp}_### {safe_reason} ###_{os.path.basename(file_path)}"
    ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
    shutil.move(file_path, ignored_file_path)
def create_document(
self,
file_path: str,
@@ -171,7 +246,16 @@ class DocumentService:
detected_at = datetime.now()
try:
logger.info(f'Creating Document for "{file_path}"')
# Skip the document if it already exists
same_document = self.document_repository.find_same_document(filename, file_hash)
if same_document is not None:
logger.info(f" Document with same hash already exists. Skipping...")
self.move_to_ignored(file_path, f"already exists ({same_document.id})")
raise DocumentAlreadyExists(f"Document with same hash already exists ({same_document.id})")
self.save_content_if_needed(file_hash, file_bytes)
logger.info(f" Saved content to {self.get_document_path(file_hash)}")
# Create FileDocument
file_data = FileDocument(
@@ -187,14 +271,90 @@ class DocumentService:
mime_type=mime_type
)
created_file = self.document_repository.create_document(file_data)
created_document = self.document_repository.create_document(file_data)
logger.info(f" Created document with id '{created_document.id}'")
return created_file
return created_document
except DocumentAlreadyExists as e:
raise e
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to create document: {str(e)}")
def create_pdf(self, document_id: PyObjectId):
    """
    Make sure the document has a PDF rendition and record its hash.

    For all files, a controlled pdf version is created for standard
    visualization and action. If another document with the same content hash
    already has a stored PDF, that PDF is reused; otherwise the stored content
    is converted, saved to the objects folder and the document is updated
    with the new pdf_file_hash.

    :param document_id: id of the document to process
    :raises DocumentProcessingError: if the document or its content is missing
    """
    logger.info(f"Creating PDF document for {document_id}")
    document = self.get_document_by_id(document_id)
    if document is None:
        logger.error(f" Document not found")
        raise DocumentProcessingError(f"Document {document_id} not found.")

    # try to find another document that has the same hash
    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)

    # the pdf will be created only if it does not exist yet
    if document_with_same_hash and self.exists(document_with_same_hash.pdf_file_hash):
        logger.info(f'Found document with same hash. Will use pdf "{document_with_same_hash.pdf_file_hash}".')
        self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
        return

    # make sure the source content is available before converting
    logger.info(f" No document with same hash and valid pdf found. Will create new pdf content.")
    file_bytes = self.get_document_content_by_hash(document.file_hash)
    if file_bytes is None:
        logger.error(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
        raise DocumentProcessingError(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')

    # create the pdf file
    temp_pdf_file = convert_to_pdf(self.get_document_path(document.file_hash), self.temp_folder)
    try:
        # read the converted file once and reuse the bytes for hash and save
        pdf_bytes = self._read_file_bytes(temp_pdf_file)
        pdf_file_hash = self._calculate_file_hash(pdf_bytes)
        self.save_content_if_needed(pdf_file_hash, pdf_bytes)
    finally:
        # remove the temporary file even when hashing or saving failed
        if os.path.exists(temp_pdf_file):
            os.remove(temp_pdf_file)
    logger.info(f' Created new pdf file with hash "{pdf_file_hash}"')

    # update the document
    self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
def create_thumbnail(self, document_id: PyObjectId):
    """
    Make sure the document has a thumbnail and record its hash.

    If another document with the same content hash already has a stored
    thumbnail, it is reused. Otherwise a 200-wide PNG of the first page of
    the document's PDF is generated, saved to the objects folder and the
    document is updated with the new thumbnail_file_hash.

    :param document_id: id of the document to process
    :raises DocumentProcessingError: if the document or its PDF is missing
    """
    logger.info(f'Creating thumbnail document for "{document_id}"')
    document = self.get_document_by_id(document_id)
    if document is None:
        logger.error(f" Document not found !")
        raise DocumentProcessingError(f"Document {document_id} not found.")

    # try to find another document that has the same hash
    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)

    # We will use the thumbnail of the pdf if it exists
    if document_with_same_hash and self.exists(document_with_same_hash.thumbnail_file_hash):
        logger.info(f" Found document with same hash. Will use thumbnail {document_with_same_hash.thumbnail_file_hash}")
        self.update_document(document_id, {"thumbnail_file_hash": document_with_same_hash.thumbnail_file_hash})
        return

    logger.info(f" No document with same hash and valid thumbnail found. Will create new thumbnail")

    if not self.exists(document.pdf_file_hash):
        logger.error(f" PDF file not found.")
        raise DocumentProcessingError(f"PDF file for document {document_id} not found")

    tmp_thumbnail_path = os.path.join(self.temp_folder, f"{generate_uuid_filename()}.png")
    try:
        with PDFThumbnailGenerator(self.get_document_path(document.pdf_file_hash)) as gen:
            # create the thumbnail
            gen.create_thumbnail(tmp_thumbnail_path, page_num=0, width=200)

        # read the generated image once and reuse the bytes for hash and save
        thumbnail_bytes = self._read_file_bytes(tmp_thumbnail_path)
        thumbnail_file_hash = self._calculate_file_hash(thumbnail_bytes)

        # save the thumbnail to the objects folder
        self.save_content_if_needed(thumbnail_file_hash, thumbnail_bytes)
    finally:
        # remove the temporary file even when generation or saving failed
        if os.path.exists(tmp_thumbnail_path):
            os.remove(tmp_thumbnail_path)

    # update the document
    self.update_document(document_id, {"thumbnail_file_hash": thumbnail_file_hash})
    logger.info(f" Created thumbnail {thumbnail_file_hash}")
def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
"""
Retrieve a document by its ID.
@@ -219,6 +379,9 @@ class DocumentService:
"""
return self.document_repository.find_document_by_hash(file_hash)
def get_document_with_pdf_hash(self, file_hash) -> Optional[FileDocument]:
    """
    Retrieve a document by content hash that also has a PDF hash recorded.

    Args:
        file_hash: Hash of the file content to look for

    Returns:
        Whatever the repository's find_document_with_pdf_hash() returns:
        the matching FileDocument, or None
    """
    matching_document = self.document_repository.find_document_with_pdf_hash(file_hash)
    return matching_document
def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
Retrieve a document by its file path.
@@ -232,7 +395,7 @@ class DocumentService:
return self.document_repository.find_document_by_filepath(filepath)
def get_document_content_by_hash(self, file_hash):
target_path = self._get_document_path(file_hash)
target_path = self.get_document_path(file_hash)
if not os.path.exists(target_path):
return None
@@ -323,7 +486,7 @@ class DocumentService:
# If no other files reference this content, delete it
if not remaining_files:
try:
os.remove(self._get_document_path(document.file_hash))
os.remove(self.get_document_path(document.file_hash))
except Exception:
pass

View File

@@ -111,7 +111,9 @@ class JobService:
current_job = self.repository.find_job_by_id(job_id)
# Validate status transition
if current_job.status != ProcessingStatus.PROCESSING:
if current_job.status in (ProcessingStatus.PENDING,
ProcessingStatus.COMPLETED,
ProcessingStatus.FAILED):
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
# Update status
@@ -141,7 +143,7 @@ class JobService:
current_job = self.repository.find_job_by_id(job_id)
# Validate status transition
if current_job.status != ProcessingStatus.PROCESSING:
if current_job.status in (ProcessingStatus.PENDING, ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
# Update status with error message
@@ -151,6 +153,11 @@ class JobService:
error_message
)
def update_job_status(self, job_id: PyObjectId,
                      status: ProcessingStatus,
                      error_message: str = None) -> ProcessingJob:
    """
    Set a job's status directly through the repository.

    No check of the job's current status is performed here; the arguments
    are handed to the repository unchanged.

    Args:
        job_id: Identifier of the job to update
        status: Status to store
        error_message: Optional error text stored along with the status

    Returns:
        ProcessingJob: The value returned by the repository update
    """
    updated_job = self.repository.update_job_status(job_id, status, error_message)
    return updated_job
def delete_job(self, job_id: PyObjectId) -> bool:
"""
Delete a job from the database.

View File

@@ -0,0 +1,241 @@
import fitz # PyMuPDF
class PDFAnnotator:
def __init__(self, pdf_path):
self.doc = fitz.open(pdf_path)
def add_highlight(self, rect, page_num=0, color=(1, 1, 0)):
"""
Add highlight annotation
Args:
rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
page_num: Page number (0-indexed), default first page
color: RGB tuple (0-1 range), default yellow
"""
page = self.doc[page_num]
annot = page.add_highlight_annot(rect)
annot.set_colors(stroke=color)
annot.update()
return annot
def add_rectangle(self, rect, page_num=0, color=(1, 0, 0), width=2):
"""
Add rectangle annotation (border only)
Args:
rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
page_num: Page number (0-indexed), default first page
color: RGB tuple (0-1 range), default red
width: Line width in points
"""
page = self.doc[page_num]
annot = page.add_rect_annot(rect)
annot.set_colors(stroke=color)
annot.set_border(width=width)
annot.update()
return annot
def add_text_note(self, point, text, page_num=0, icon="Note"):
"""
Add sticky note annotation
Args:
point: (x, y) position tuple
text: Note content string
page_num: Page number (0-indexed), default first page
icon: "Note", "Comment", "Help", "Insert", "Key", etc.
"""
page = self.doc[page_num]
annot = page.add_text_annot(point, text, icon=icon)
annot.update()
return annot
def add_free_text(self, rect, text, page_num=0, fontsize=12,
color=(0, 0, 0)):
"""
Add free text annotation (visible text box)
Args:
rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
text: Text content string
page_num: Page number (0-indexed), default first page
fontsize: Font size in points
color: Text color RGB tuple (0-1 range)
"""
page = self.doc[page_num]
annot = page.add_freetext_annot(
rect,
text,
fontsize=fontsize,
text_color=color
)
annot.update()
return annot
def add_arrow(self, start_point, end_point, page_num=0,
color=(1, 0, 0), width=2):
"""
Add arrow annotation
Args:
start_point: (x, y) tuple for arrow start
end_point: (x, y) tuple for arrow end
page_num: Page number (0-indexed), default first page
color: Arrow color RGB tuple (0-1 range), default red
width: Line width in points
"""
page = self.doc[page_num]
annot = page.add_line_annot(start_point, end_point)
annot.set_colors(stroke=color)
annot.set_border(width=width)
# Set arrow at end - use integer constant
annot.set_line_ends(0, 1) # 1 = ClosedArrow
annot.update()
return annot
def add_stamp(self, rect, page_num=0, stamp_type=0):
"""
Add stamp annotation
Args:
rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
page_num: Page number (0-indexed), default first page
stamp_type: Integer for stamp type:
0=Approved, 1=AsIs, 2=Confidential,
3=Departmental, 4=Draft, 5=Experimental,
6=Expired, 7=Final, 8=ForComment,
9=ForPublicRelease, 10=NotApproved, etc.
"""
page = self.doc[page_num]
annot = page.add_stamp_annot(rect, stamp=stamp_type)
annot.update()
return annot
def add_redaction(self, rect, page_num=0, fill_color=(0, 0, 0)):
"""
Add redaction annotation (marks area for redaction)
Note: Use apply_redactions() to permanently remove content
Args:
rect: (x0, y0, x1, y1) area to redact, tuple or fitz.Rect
page_num: Page number (0-indexed), default first page
fill_color: RGB tuple (0-1 range) for redacted area, default black
"""
page = self.doc[page_num]
annot = page.add_redact_annot(rect, fill=fill_color)
annot.update()
return annot
def apply_redactions(self, page_num=0, images=2, graphics=2, text=2):
    """
    Apply all redaction annotations on a page (permanent removal).

    The three option values are forwarded unchanged to
    ``page.apply_redactions``. Their meaning, per the PyMuPDF
    documentation of ``Page.apply_redactions``:

    Args:
        page_num: Page number (0-indexed), default first page
        images: 0=ignore images, 1=remove overlapping images,
            2=blank out the overlapping pixels
        graphics: 0=ignore, 1=remove graphics fully covered by a
            redaction, 2=remove any overlapping graphics
        text: 0=remove overlapping text, 1=keep text

    Returns:
        True if the page had at least one redaction annotation and the
        redactions were applied, False otherwise

    NOTE(review): the default ``text=2`` is outside the 0/1 range listed
    above; confirm that it really removes the redacted text, otherwise
    the default should become 0.
    """
    # Annotation type code for redactions (PDF_ANNOT_REDACT in PyMuPDF).
    redact_type_code = 12

    page = self.doc[page_num]
    has_redactions = any(
        annot.type[0] == redact_type_code for annot in page.annots()
    )
    if not has_redactions:
        return False
    page.apply_redactions(images=images, graphics=graphics, text=text)
    return True
def get_all_annotations(self, page_num=0):
    """
    Collect a summary of every annotation on a page.

    Args:
        page_num: Page number (0-indexed), default first page

    Returns:
        List with one dict per annotation, holding the keys 'type'
        (annotation type name), 'rect', 'content', 'author', 'created'
        and 'colors'. Missing info entries default to an empty string.
    """
    def summarize(annot):
        meta = annot.info
        return {
            'type': annot.type[1],  # second element is the type name
            'rect': annot.rect,
            'content': meta.get('content', ''),
            'author': meta.get('title', ''),
            'created': meta.get('creationDate', ''),
            'colors': annot.colors
        }

    return [summarize(annot) for annot in self.doc[page_num].annots()]
def remove_all_annotations(self, page_num=0):
    """
    Delete every annotation found on a page.

    Args:
        page_num: Page number (0-indexed), default first page
    """
    target_page = self.doc[page_num]
    discard = target_page.delete_annot
    for entry in target_page.annots():
        discard(entry)
def save(self, output_path):
    """Write the annotated document to ``output_path`` via ``self.doc.save``."""
    document = self.doc
    document.save(output_path)
def close(self):
    """Close the underlying document held in ``self.doc``."""
    document = self.doc
    document.close()
def __enter__(self):
    """Enter the ``with`` block; the annotator itself is the bound value."""
    entered = self
    return entered
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the ``with`` block by calling ``close()``.

    Nothing truthy is returned, so an exception raised inside the block
    is not suppressed.
    """
    self.close()
# Example usage: annotate the first page of "input.pdf" and save a copy
if __name__ == "__main__":
    with PDFAnnotator("input.pdf") as annotator:
        first_page = 0

        # Yellow highlight
        annotator.add_highlight(
            (100, 100, 300, 120),
            page_num=first_page,
            color=(1, 1, 0),
        )

        # Red rectangle border
        annotator.add_rectangle(
            (100, 150, 300, 250),
            page_num=first_page,
            color=(1, 0, 0),
            width=3,
        )

        # Sticky note
        annotator.add_text_note(
            (400, 100),
            "This is important!",
            page_num=first_page,
            icon="Comment",
        )

        # Visible text box
        annotator.add_free_text(
            (100, 300, 400, 350),
            "DRAFT VERSION",
            page_num=first_page,
            fontsize=20,
            color=(1, 0, 0),
        )

        # Arrow pointing to something
        annotator.add_arrow(
            (450, 100),
            (500, 200),
            page_num=first_page,
            color=(0, 0, 1),
            width=2,
        )

        # "Approved" stamp
        annotator.add_stamp(
            (450, 300, 550, 350),
            page_num=first_page,
            stamp_type=0,
        )

        # Redaction: mark the area first, then apply it permanently
        annotator.add_redaction((100, 400, 300, 420), page_num=first_page)
        annotator.apply_redactions(page_num=first_page)

        # Report what is on the page now
        found = annotator.get_all_annotations(page_num=first_page)
        print(f"Found {len(found)} annotations:")
        for entry in found:
            print(f" - {entry['type']} at {entry['rect']}")

        # Write the annotated PDF
        annotator.save("output_annotated.pdf")

View File

@@ -0,0 +1,210 @@
import datetime
import hashlib
import os
import uuid
from abc import ABC
from pathlib import Path
from typing import Self
import pikepdf
import pypandoc
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from tasks.common.converter_utils import detect_file_type
class BaseConverter(ABC):
"""Abstract base class for file converters to PDF."""
def __init__(self, input_path: str, output_dir: str = ".") -> None:
self.input_path = Path(input_path)
self.output_dir = Path(output_dir)
self.output_path = self.output_dir / f"{self.generate_uuid_filename()}.pdf"
def convert(self) -> Self:
"""Convert input file to PDF and return the output path."""
pass
@staticmethod
def generate_uuid_filename() -> str:
"""Generate a unique filename using UUID4."""
return str(uuid.uuid4())
def get_deterministic_date(self) -> str:
"""
Generate a deterministic date based on file content.
This ensures the same file always produces the same PDF.
"""
# Option 1: Use a fixed date
# return "D:20000101000000"
# Option 2: Generate date from content hash (recommended)
with open(self.input_path, 'rb') as f:
content = f.read()
content_hash = hashlib.sha256(content).hexdigest()
# Use first 14 characters of hash to create a valid date
# Format: D:YYYYMMDDHHmmss
hash_int = int(content_hash[:14], 16)
# Create a date between 2000-2099 to keep it reasonable
year = 2000 + (hash_int % 100)
month = 1 + (hash_int % 12)
day = 1 + (hash_int % 28) # Stay safe with 28 days
hour = hash_int % 24
minute = hash_int % 60
second = hash_int % 60
return f"D:{year:04d}{month:02d}{day:02d}{hour:02d}{minute:02d}{second:02d}"
def get_file_creation_date(self):
# Get file creation time (or modification time)
ts = os.path.getctime(self.input_path) # getmtime(self.input_path) for last modification
dt = datetime.datetime.fromtimestamp(ts)
# PDF expects format D:YYYYMMDDHHmmss
creation_date = dt.strftime("D:%Y%m%d%H%M%S")
return creation_date
def clean_pdf(self) -> Self:
"""Remove all non-deterministic metadata from PDF."""
with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
# Remove XMP metadata if it exists
if hasattr(pdf.Root, 'Metadata'):
del pdf.Root.Metadata
# Clear all document info by deleting each key
for key in list(pdf.docinfo.keys()):
del pdf.docinfo[key]
# Set deterministic metadata
pdf.docinfo["/Producer"] = "MyConverter"
pdf.docinfo["/Creator"] = "MyConverter"
pdf.docinfo["/CreationDate"] = self.get_deterministic_date()
pdf.docinfo["/ModDate"] = self.get_deterministic_date()
pdf.docinfo["/Title"] = self.input_path.name
# Save with deterministic IDs
# compress=True ensures consistent compression
# deterministic_id=True (if available) or static_id=True
pdf.save(
self.output_path,
fix_metadata_version=True,
compress_streams=True,
stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
object_stream_mode=pikepdf.ObjectStreamMode.disable,
deterministic_id=True # Use this if pikepdf >= 8.0.0, otherwise use static_id=True
)
return self
class TextToPdfConverter(BaseConverter):
    """Renders a UTF-8 text file onto A4 pages, one input line per drawn line."""

    def convert(self) -> Self:
        """Draw each stripped input line on the canvas and return ``self``.

        Lines start 50 points below the top of the page and advance by
        15 points; a new page is started once the cursor drops below the
        50-point bottom margin.
        """
        margin = 50
        line_step = 15

        pdf = canvas.Canvas(str(self.output_path), pagesize=A4)

        # Fixed producer/title plus the input file's creation date
        doc_info = pdf._doc.info
        doc_info.producer = "MyConverter"
        doc_info.creationDate = self.get_file_creation_date()
        doc_info.title = os.path.basename(self.input_path)

        page_height = A4[1]
        top_of_page = page_height - margin

        with open(self.input_path, "r", encoding="utf-8") as source:
            cursor_y = top_of_page
            for raw_line in source:
                pdf.drawString(margin, cursor_y, raw_line.strip())
                cursor_y -= line_step
                if cursor_y < margin:
                    pdf.showPage()
                    cursor_y = top_of_page

        pdf.save()
        return self
class PdfToPdfConverter(BaseConverter):
    """Converter for inputs that are already PDF: the file is copied as-is."""

    def convert(self) -> Self:
        """Copy ``input_path`` to ``output_path`` and return ``self``.

        Raises:
            OSError: If the source cannot be read or the target cannot be
                written.
        """
        # Local import keeps this block self-contained.
        import shutil

        # A library copy replaces os.system("cp ..."): the paths are never
        # interpreted by a shell (spaces and metacharacters are safe), no
        # external "cp" binary is needed, and a failed copy raises instead
        # of being silently ignored.
        shutil.copyfile(self.input_path, self.output_path)
        return self
class ImageToPdfConverter(BaseConverter):
    """Converter for image files to PDF."""

    def convert(self) -> Self:
        """Save the input image, converted to RGB, at ``output_path``.

        The output format is not passed explicitly; the ``.pdf`` suffix of
        ``output_path`` is what selects it.

        Returns:
            ``self``, to allow chaining.
        """
        # The context manager closes the image's file handle, which was
        # previously left open after conversion.
        with Image.open(self.input_path) as image:
            rgb_image = image.convert("RGB")
            rgb_image.save(self.output_path)
        return self
class WordToPdfConverter(BaseConverter):
    """Turns Word files (.docx) into PDF by delegating to pypandoc."""

    def convert(self) -> Self:
        """Run ``pypandoc.convert_file`` with ``output_path`` as target; return ``self``."""
        source = str(self.input_path)
        target = str(self.output_path)
        pypandoc.convert_file(source, "pdf", outputfile=target)
        return self
# Placeholders for future extensions
class HtmlToPdfConverter(BaseConverter):
    """Stub reserved for a future HTML to PDF converter."""

    def convert(self) -> Self:
        """Always raises NotImplementedError; no HTML conversion exists yet."""
        message = "HTML to PDF conversion not implemented."
        raise NotImplementedError(message)
class ExcelToPdfConverter(BaseConverter):
    """Stub reserved for a future Excel to PDF converter."""

    def convert(self) -> Self:
        """Always raises NotImplementedError; no Excel conversion exists yet."""
        message = "Excel to PDF conversion not implemented."
        raise NotImplementedError(message)
class MarkdownToPdfConverter(BaseConverter):
    """Stub reserved for a future Markdown to PDF converter."""

    def convert(self) -> Self:
        """Always raises NotImplementedError; no Markdown conversion exists yet."""
        message = "Markdown to PDF conversion not implemented."
        raise NotImplementedError(message)
def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    """
    Convert any supported file to PDF.

    The file type reported by ``detect_file_type`` selects the converter;
    the converter's ``convert()`` and ``clean_pdf()`` are then run in
    that order.

    Args:
        filepath (str): Path to the input file.
        output_dir (str): Directory to save the output PDF.

    Returns:
        str: Path to the generated PDF.

    Raises:
        ValueError: If the detected file type has no converter here.
            (``detect_file_type`` may raise its own errors for files it
            cannot classify; verify against that helper.)
    """
    # Detected type name -> converter class handling it.
    converters = {
        "text": TextToPdfConverter,
        "image": ImageToPdfConverter,
        "word": WordToPdfConverter,
        "pdf": PdfToPdfConverter,
    }

    file_type = detect_file_type(filepath)
    converter_cls = converters.get(file_type)
    if converter_cls is None:
        raise ValueError(f"Unsupported file type: {file_type}")

    converter = converter_cls(filepath, output_dir=output_dir)
    converter.convert()
    converter.clean_pdf()
    return str(converter.output_path)

View File

@@ -0,0 +1,167 @@
from pathlib import Path
import fitz # PyMuPDF
class PDFThumbnailGenerator:
    """Render image thumbnails from the pages of a PDF opened with PyMuPDF.

    The document is opened in ``__init__`` and stays open until
    ``close()`` is called; the class is also a context manager that
    closes the document on exit.
    """

    def __init__(self, pdf_path):
        """
        Initialize PDF thumbnail generator

        Args:
            pdf_path: Path to the PDF file (string or Path object)
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def create_thumbnail(self, output_path, page_num=0, width=200, rotation=0, zoom_factor=1.0):
        """
        Create a thumbnail with zoom and rotation

        Args:
            output_path: Path to save the thumbnail (string or Path)
            page_num: Page number (0-indexed), default first page
            width: Desired width in pixels, default 200
            rotation: Rotation angle in degrees (0, 90, 180, 270), default 0.
                It is set on the page as an absolute value for the
                duration of the rendering.
            zoom_factor: Additional zoom multiplier (1.0 = normal, 2.0 = 2x), default 1.0

        Returns:
            Dict with thumbnail info (width, height, rotation, zoom)
        """
        page = self.doc[page_num]

        # Remember the page's own rotation: the rotation requested here is
        # only for this rendering and must not leak into later calls
        # (get_page_info, create_cropped_thumbnail) on the same document.
        original_rotation = page.rotation
        page.set_rotation(rotation)
        try:
            # Zoom so the (rotated) page is `width` pixels wide, then
            # apply the caller's extra zoom on top.
            base_zoom = width / page.rect.width
            final_zoom = base_zoom * zoom_factor
            mat = fitz.Matrix(final_zoom, final_zoom)

            pix = page.get_pixmap(matrix=mat, alpha=False)
            pix.save(output_path)
        finally:
            page.set_rotation(original_rotation)

        return {
            'width': pix.width,
            'height': pix.height,
            'rotation': rotation,
            'zoom': zoom_factor
        }

    def create_cropped_thumbnail(self, output_path, crop_rect=None, page_num=0, width=200):
        """
        Create a thumbnail of a specific region (zoom on area)

        Args:
            output_path: Path to save the thumbnail (string or Path)
            crop_rect: Tuple (x0, y0, x1, y1) in PDF coordinates for cropping,
                or None for full page, default None
            page_num: Page number (0-indexed), default first page
            width: Desired width in pixels, default 200

        Returns:
            Tuple (width, height) of the generated thumbnail
        """
        page = self.doc[page_num]

        if crop_rect:
            rect = fitz.Rect(crop_rect)
            zoom = width / rect.width
        else:
            rect = page.rect
            zoom = width / page.rect.width

        mat = fitz.Matrix(zoom, zoom)

        # Render only the selected rectangle
        pix = page.get_pixmap(matrix=mat, clip=rect)
        pix.save(output_path)
        return pix.width, pix.height

    def get_page_info(self, page_num=0):
        """
        Get information about a specific page

        Args:
            page_num: Page number (0-indexed), default first page

        Returns:
            Dict with page information (width, height, rotation, number,
            total_pages); 'number' is 1-based
        """
        page = self.doc[page_num]
        return {
            'width': page.rect.width,
            'height': page.rect.height,
            'rotation': page.rotation,
            'number': page_num + 1,
            'total_pages': len(self.doc)
        }

    def create_multi_resolution_thumbnails(self, output_folder, page_num=0, sizes=(150, 300, 600)):
        """
        Create multiple thumbnails at different resolutions

        Files are named ``thumb_<size>px.png`` inside ``output_folder``,
        which is created if missing. The name does not include the page
        number, so a second page rendered into the same folder replaces
        the files of the first.

        Args:
            output_folder: Folder path to save thumbnails (string or Path)
            page_num: Page number (0-indexed), default first page
            sizes: Iterable of widths in pixels, default (150, 300, 600)

        Returns:
            Dict mapping each size to thumbnail info
        """
        output_folder = Path(output_folder)
        output_folder.mkdir(exist_ok=True, parents=True)

        results = {}
        for size in sizes:
            output_path = output_folder / f"thumb_{size}px.png"
            results[size] = self.create_thumbnail(output_path, page_num=page_num, width=size)
        return results

    def close(self):
        """Close the PDF document and free resources"""
        self.doc.close()

    def __enter__(self):
        """Return the generator itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the document on leaving the ``with`` block; exceptions propagate."""
        self.close()
# Example usage: render several thumbnail variants of "example.pdf"
if __name__ == "__main__":
    # The context manager closes the document when the block ends
    with PDFThumbnailGenerator("example.pdf") as gen:
        first_page = 0

        # Standard thumbnail
        gen.create_thumbnail("thumb_standard.png", page_num=first_page, width=200)

        # Rotated thumbnail
        gen.create_thumbnail(
            "thumb_rotated.png",
            page_num=first_page,
            width=200,
            rotation=90,
        )

        # Zoomed thumbnail (2x zoom)
        gen.create_thumbnail(
            "thumb_zoomed.png",
            page_num=first_page,
            width=200,
            zoom_factor=2.0,
        )

        # Cropped/zoomed on a specific area (x0, y0, x1, y1)
        gen.create_cropped_thumbnail(
            "thumb_crop.png",
            crop_rect=(100, 100, 400, 400),
            page_num=first_page,
            width=300,
        )

        # Multiple resolutions
        gen.create_multi_resolution_thumbnails(
            "thumbnails/",
            page_num=first_page,
            sizes=[150, 300, 600],
        )

        # Page information
        page_details = gen.get_page_info(page_num=first_page)
        print(f"Page info: {page_details}")

View File

@@ -4,9 +4,10 @@ Password security utilities using bcrypt for secure password hashing.
This module provides secure password hashing and verification functions
using the bcrypt algorithm with automatic salt generation.
"""
import re
import uuid
import bcrypt
from typing import Union
def hash_password(password: str) -> str:
@@ -71,4 +72,33 @@ def verify_password(password: str, hashed_password: str) -> bool:
# bcrypt raises ValueError for malformed hashes
raise RuntimeError(f"Invalid hash format: {str(e)}")
except Exception as e:
raise RuntimeError(f"Failed to verify password: {str(e)}")
raise RuntimeError(f"Failed to verify password: {str(e)}")
def generate_uuid_filename() -> str:
    """Return a fresh random UUID4 in its string form, for use as a unique filename."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
def safe_connection_string(connection_string: str) -> str:
    """
    Mask the password in a MongoDB connection string.

    Both ``mongodb://`` and ``mongodb+srv://`` URLs are handled. A string
    that does not contain a ``user:password@`` part is returned unchanged.

    Args:
        connection_string (str): The complete MongoDB connection string

    Returns:
        str: The connection string with the password replaced by five
        asterisks

    Example:
        >>> safe_connection_string("mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin")
        'mongodb://admin:*****@mongodb:27017/mydocmanager?authSource=admin'
    """
    # Groups: (scheme + username + ':') (password) ('@' + rest of the URL).
    # The optional "+srv" covers DNS seed-list URLs, which were previously
    # left unmasked.
    pattern = r'(mongodb(?:\+srv)?://[^:]+:)([^@]+)(@.*)'

    # Keep groups 1 and 3, replace the password group with asterisks
    return re.sub(pattern, r'\1*****\3', connection_string)

View File

@@ -5,10 +5,12 @@ email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pikepdf==9.11.0
pillow==11.3.0
pydantic==2.11.9
PyJWT==2.10.1
pymongo==4.15.0
PyMuPDF==1.26.4
pypandoc==1.15
python-multipart==0.0.20
redis==6.4.0

View File

@@ -1,12 +1,93 @@
# React + Vite
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
# MyDocManager Frontend
Currently, two official plugins are available:
## Overview
MyDocManager Frontend is a modern web application built with React and Vite that serves as the user interface for the MyDocManager document management system. The application provides a seamless experience for users to manage, process, and organize their documents with an intuitive and responsive interface.
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) for Fast Refresh
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
## Project Structure
frontend/
├── public/ # Public assets and static files
├── src/ # Source code
│ ├── assets/ # Icons, images, and other static assets
│ ├── components/ # Reusable UI components
│ │ ├── auth/ # Authentication-related components
│ │ └── common/ # Shared components (Header, Layout, etc.)
│ ├── contexts/ # React contexts for state management
│ ├── hooks/ # Custom React hooks
│ ├── pages/ # Page components representing full views
│ ├── services/ # API service interfaces
│ └── utils/ # Utility functions and helpers
├── Dockerfile # Container configuration for deployment
├── package.json # Dependencies and scripts
├── tailwind.config.js # Tailwind CSS configuration
└── vite.config.js # Vite bundler configuration
## Expanding the ESLint configuration
If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
## Key Components
### Authentication
- **AuthContext**: Provides authentication state and methods throughout the application
- **AuthLayout**: Layout wrapper specifically for authentication screens
- **LoginForm**: Form component for user authentication
- **ProtectedRoute**: Route guard that ensures authenticated access to protected pages
### UI Components
- **Layout**: Main application layout structure with menu and content areas
- **Header**: Application header with navigation and user controls
- **Menu**: Side navigation menu with application links
- **ThemeSwitcher**: Toggle for switching between light and dark themes
### Pages
- **LoginPage**: User authentication page
- **DashboardPage**: Main dashboard view for authenticated users
### Services
- **authService**: Handles API communication for authentication operations
- **api**: Base API utility for making HTTP requests to the backend
## Getting Started
### Prerequisites
- Node.js (latest LTS version)
- npm or yarn package manager
### Installation
1. Clone the repository
2. Navigate to the frontend directory
3. Install dependencies:
```
npm install
```
### Development
Run the development server:
```
npm run dev
```
This will start the application in development mode at http://localhost:5173
### Building for Production
Create a production build:
```
npm run build
```
## Technologies
- React 19.1.1
- Vite 7.1.2
- Tailwind CSS 4.1.13
- DaisyUI 5.1.24
- React Router 7.9.3
- Axios for API requests
## Features
- Responsive design with Tailwind CSS
- Authentication and authorization
- Light/dark theme support
- Document management interface
- Secure API communication
## Project Integration
This frontend application works in conjunction with the backend services and workers defined in other parts of the MyDocManager project to provide a complete document management solution.

View File

@@ -4,6 +4,7 @@ import ProtectedRoute from './components/common/ProtectedRoute';
import Layout from './components/common/Layout';
import LoginPage from './pages/LoginPage';
import DashboardPage from './pages/DashboardPage';
import DocumentsPage from './pages/DocumentsPage';
function App() {
return (
@@ -16,7 +17,8 @@ function App() {
{/* Protected Routes */}
<Route path="/" element={<ProtectedRoute><Layout /></ProtectedRoute>}>
<Route index element={<Navigate to="/dashboard" replace />} />
<Route index element={<Navigate to="/documents" replace />} />
<Route path="documents" element={<DocumentsPage />} />
<Route path="dashboard" element={<DashboardPage />} />
<Route path="documents" element={<div>Documents Page - Coming Soon</div>} />
<Route path="users" element={<div>User Management - Coming Soon</div>} />

View File

@@ -1,11 +1,13 @@
import {FaBuffer, FaPlus} from "react-icons/fa6";
import { Link } from "react-router-dom";
const Menu = () => {
return (
<div className="p-4">
<ul className="menu">
<li className="menu-title">Exploration</li>
<li><a><FaBuffer/>To Review</a></li>
<li><Link to="/dashboard"><FaBuffer/>Dashboard</Link></li>
<li><Link to="/documents"><FaBuffer/>To Review</Link></li>
<li className="menu-title mt-4">Catégories</li>
<li><a><i className="fas fa-plus"></i>Item</a></li>
</ul>

View File

@@ -0,0 +1,68 @@
/**
* DeleteConfirmModal Component
* Modal dialog to confirm document deletion
*/
import React from 'react';
/**
* DeleteConfirmModal component
* @param {Object} props
* @param {boolean} props.isOpen - Whether the modal is open
* @param {Object|null} props.document - Document to delete
* @param {function(): void} props.onClose - Callback when modal is closed
* @param {function(): void} props.onConfirm - Callback when deletion is confirmed
* @param {boolean} props.isDeleting - Whether deletion is in progress
* @returns {JSX.Element}
*/
const DeleteConfirmModal = ({
isOpen,
document,
onClose,
onConfirm,
isDeleting = false
}) => {
if (!isOpen || !document) return null;
return (
<dialog className="modal modal-open">
<div className="modal-box">
<h3 className="font-bold text-lg">Confirm Deletion</h3>
<p className="py-4">
Are you sure you want to delete <span className="font-semibold">"{document.name}"</span>?
</p>
<p className="text-sm text-gray-500">
This action cannot be undone.
</p>
<div className="modal-action">
<button
className="btn btn-ghost"
onClick={onClose}
disabled={isDeleting}
>
Cancel
</button>
<button
className="btn btn-error"
onClick={onConfirm}
disabled={isDeleting}
>
{isDeleting ? (
<>
<span className="loading loading-spinner loading-sm"></span>
Deleting...
</>
) : (
'Delete'
)}
</button>
</div>
</div>
<form method="dialog" className="modal-backdrop" onClick={onClose}>
<button disabled={isDeleting}>close</button>
</form>
</dialog>
);
};
export default DeleteConfirmModal;

View File

@@ -0,0 +1,193 @@
/**
* DocumentCard Component
* Displays a document as a DaisyUI card with thumbnail and metadata
* Supports different view modes: small, large, and detail
*/
import React, { memo } from 'react';
/**
* Formats file size to human-readable format
* @param {number} bytes - File size in bytes
* @returns {string} Formatted file size
*/
const formatFileSize = (bytes) => {
  // Missing, non-numeric, zero or negative sizes are shown as "0 Bytes"
  // instead of producing "NaN undefined".
  let value = Number(bytes);
  if (!Number.isFinite(value) || value <= 0) return '0 Bytes';

  const k = 1024;
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB'];

  // Step up one unit at a time; stopping at the last unit keeps very large
  // sizes from indexing past the end of the list.
  let unitIndex = 0;
  while (value >= k && unitIndex < sizes.length - 1) {
    value /= k;
    unitIndex += 1;
  }

  // Round to at most two decimals
  return Math.round(value * 100) / 100 + ' ' + sizes[unitIndex];
};
/**
* Formats date to localized string
* @param {string} dateString - ISO date string
* @returns {string} Formatted date
*/
const formatDate = (dateString) => {
  // Short US form: abbreviated month, numeric day and year
  const displayOptions = { year: 'numeric', month: 'short', day: 'numeric' };
  const parsed = new Date(dateString);
  return parsed.toLocaleDateString('en-US', displayOptions);
};
/**
* DocumentCard component
* @param {Object} props
* @param {Object} props.document - Document object
* @param {'small'|'large'|'detail'} props.viewMode - Current view mode
* @param {function(): void} props.onEdit - Callback when edit is clicked
* @param {function(): void} props.onDelete - Callback when delete is clicked
* @returns {JSX.Element}
*/
const DocumentCard = memo(({ document, viewMode, onEdit, onDelete }) => {
const { name, originalFileType, thumbnailUrl, pageCount, fileSize, createdAt, tags, categories } = document;
// Determine card classes based on view mode
const getCardClasses = () => {
const baseClasses = 'card bg-base-100 shadow-xl hover:shadow-2xl transition-shadow group relative';
switch (viewMode) {
case 'small':
return `${baseClasses} w-full`;
case 'large':
return `${baseClasses} w-full`;
case 'detail':
return `${baseClasses} w-full`;
default:
return baseClasses;
}
};
// Render thumbnail with hover actions
const renderThumbnail = () => (
<figure className="relative overflow-hidden">
<img
src={`http://localhost:8000${thumbnailUrl}`}
alt={`${thumbnailUrl} thumbnail`}
className={`w-full object-cover ${
viewMode === 'small' ? 'h-32' : viewMode === 'large' ? 'h-48' : 'h-64'
}`}
loading="lazy"
/>
{/* Hover overlay with actions */}
<div className="absolute top-2 right-2 flex gap-2 opacity-0 group-hover:opacity-100 transition-opacity">
<button
className="btn btn-sm btn-circle btn-primary"
onClick={onEdit}
aria-label="Edit document"
title="Edit"
>
<svg xmlns="http://www.w3.org/2000/svg" className="h-4 w-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z" />
</svg>
</button>
<button
className="btn btn-sm btn-circle btn-error"
onClick={onDelete}
aria-label="Delete document"
title="Delete"
>
<svg xmlns="http://www.w3.org/2000/svg" className="h-4 w-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 7l-.867 12.142A2 2 0 0116.138 21H7.862a2 2 0 01-1.995-1.858L5 7m5 4v6m4-6v6m1-10V4a1 1 0 00-1-1h-4a1 1 0 00-1 1v3M4 7h16" />
</svg>
</button>
</div>
{/* File type badge */}
<div className="absolute bottom-2 left-2">
<span className="badge badge-accent badge-sm">{originalFileType}</span>
</div>
</figure>
);
// Render card body based on view mode
const renderCardBody = () => {
if (viewMode === 'small') {
return (
<div className="card-body p-3">
<h3 className="card-title text-sm truncate" title={name}>{name}</h3>
<p className="text-xs text-gray-500">{pageCount} page{pageCount > 1 ? 's' : ''}</p>
</div>
);
}
if (viewMode === 'large') {
return (
<div className="card-body p-4">
<h3 className="card-title text-base truncate" title={name}>{name}</h3>
<div className="flex flex-wrap gap-1 mb-2">
{tags.slice(0, 3).map(tag => (
<span key={tag} className="badge badge-primary badge-xs">{tag}</span>
))}
{tags.length > 3 && (
<span className="badge badge-ghost badge-xs">+{tags.length - 3}</span>
)}
</div>
<div className="text-sm space-y-1">
<p className="text-gray-500">{pageCount} page{pageCount > 1 ? 's' : ''}</p>
<p className="text-gray-500">{formatFileSize(fileSize)}</p>
</div>
</div>
);
}
// Detail mode
return (
<div className="card-body">
<h3 className="card-title text-lg" title={name}>{name}</h3>
{/* Tags */}
{tags.length > 0 && (
<div className="flex flex-wrap gap-1 mb-2">
{tags.map(tag => (
<span key={tag} className="badge badge-primary badge-sm">{tag}</span>
))}
</div>
)}
{/* Categories */}
{categories.length > 0 && (
<div className="flex flex-wrap gap-1 mb-3">
{categories.map(category => (
<span key={category} className="badge badge-secondary badge-sm">{category}</span>
))}
</div>
)}
{/* Metadata */}
<div className="grid grid-cols-2 gap-2 text-sm">
<div>
<span className="font-semibold">Pages:</span>
<span className="ml-2 text-gray-500">{pageCount}</span>
</div>
<div>
<span className="font-semibold">Size:</span>
<span className="ml-2 text-gray-500">{formatFileSize(fileSize)}</span>
</div>
<div>
<span className="font-semibold">Type:</span>
<span className="ml-2 text-gray-500">{originalFileType}</span>
</div>
<div>
<span className="font-semibold">Date:</span>
<span className="ml-2 text-gray-500">{formatDate(createdAt)}</span>
</div>
</div>
</div>
);
};
return (
<div className={getCardClasses()}>
{renderThumbnail()}
{renderCardBody()}
</div>
);
});
DocumentCard.displayName = 'DocumentCard';
export default DocumentCard;

View File

@@ -0,0 +1,164 @@
/**
* DocumentDetailView Component
* Displays a document in detail mode with all pages visible
* This is a placeholder that shows multiple page thumbnails
* When real PDF backend is ready, this can be replaced with actual PDF rendering
*/
import React from 'react';
/**
* Formats file size to human-readable format
* @param {number} bytes - File size in bytes
* @returns {string} Formatted file size
*/
const formatFileSize = (bytes) => {
  // Missing, non-numeric, zero or negative sizes are shown as "0 Bytes"
  // instead of producing "NaN undefined".
  let value = Number(bytes);
  if (!Number.isFinite(value) || value <= 0) return '0 Bytes';

  const k = 1024;
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB'];

  // Step up one unit at a time; stopping at the last unit keeps very large
  // sizes from indexing past the end of the list.
  let unitIndex = 0;
  while (value >= k && unitIndex < sizes.length - 1) {
    value /= k;
    unitIndex += 1;
  }

  // Round to at most two decimals
  return Math.round(value * 100) / 100 + ' ' + sizes[unitIndex];
};
/**
* Formats date to localized string
* @param {string} dateString - ISO date string
* @returns {string} Formatted date
*/
const formatDate = (dateString) => {
  // Long US form: full month name, day, year, plus hour and minute
  const displayOptions = {
    year: 'numeric',
    month: 'long',
    day: 'numeric',
    hour: '2-digit',
    minute: '2-digit'
  };
  const parsed = new Date(dateString);
  return parsed.toLocaleDateString('en-US', displayOptions);
};
/**
* DocumentDetailView component
* @param {Object} props
* @param {Object} props.document - Document object
* @param {function(): void} props.onEdit - Callback when edit is clicked
* @param {function(): void} props.onDelete - Callback when delete is clicked
* @returns {JSX.Element}
*/
const DocumentDetailView = ({ document, onEdit, onDelete }) => {
const {
name,
originalFileType,
thumbnailUrl,
pageCount,
fileSize,
createdAt,
tags,
categories
} = document;
// Generate placeholder pages (in real implementation, these would be actual PDF pages)
const pages = Array.from({ length: pageCount }, (_, i) => ({
pageNumber: i + 1,
thumbnailUrl: thumbnailUrl.replace('Page+1', `Page+${i + 1}`)
}));
return (
<div className="card bg-base-100 shadow-xl">
{/* Header with actions */}
<div className="card-body">
<div className="flex justify-between items-start mb-4">
<div className="flex-1">
<h2 className="card-title text-2xl mb-2">{name}</h2>
{/* Tags */}
{tags.length > 0 && (
<div className="flex flex-wrap gap-2 mb-2">
<span className="text-sm font-semibold text-gray-600">Tags:</span>
{tags.map(tag => (
<span key={tag} className="badge badge-primary">{tag}</span>
))}
</div>
)}
{/* Categories */}
{categories.length > 0 && (
<div className="flex flex-wrap gap-2 mb-3">
<span className="text-sm font-semibold text-gray-600">Categories:</span>
{categories.map(category => (
<span key={category} className="badge badge-secondary">{category}</span>
))}
</div>
)}
</div>
{/* Action buttons */}
<div className="flex gap-2">
<button
className="btn btn-primary btn-sm"
onClick={onEdit}
aria-label="Edit document"
>
<svg xmlns="http://www.w3.org/2000/svg" className="h-4 w-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z" />
</svg>
Edit
</button>
<button
className="btn btn-error btn-sm"
onClick={onDelete}
aria-label="Delete document"
>
<svg xmlns="http://www.w3.org/2000/svg" className="h-4 w-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 7l-.867 12.142A2 2 0 0116.138 21H7.862a2 2 0 01-1.995-1.858L5 7m5 4v6m4-6v6m1-10V4a1 1 0 00-1-1h-4a1 1 0 00-1 1v3M4 7h16" />
</svg>
Delete
</button>
</div>
</div>
{/* Metadata grid */}
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6 p-4 bg-base-200 rounded-lg">
<div>
<span className="text-sm font-semibold text-gray-600">Original Type</span>
<p className="text-lg">{originalFileType}</p>
</div>
<div>
<span className="text-sm font-semibold text-gray-600">Pages</span>
<p className="text-lg">{pageCount}</p>
</div>
<div>
<span className="text-sm font-semibold text-gray-600">File Size</span>
<p className="text-lg">{formatFileSize(fileSize)}</p>
</div>
<div>
<span className="text-sm font-semibold text-gray-600">Created</span>
<p className="text-lg">{formatDate(createdAt)}</p>
</div>
</div>
{/* Pages preview */}
<div>
<h3 className="text-lg font-semibold mb-4">Document Pages ({pageCount})</h3>
<div className="grid grid-cols-2 md:grid-cols-3 lg:grid-cols-4 gap-4">
{pages.map((page) => (
<div key={page.pageNumber} className="relative group">
<div className="aspect-[3/4] bg-base-200 rounded-lg overflow-hidden shadow-md hover:shadow-xl transition-shadow">
<img
src={page.thumbnailUrl}
alt={`Page ${page.pageNumber}`}
className="w-full h-full object-cover"
loading="lazy"
/>
</div>
<div className="text-center mt-2">
<span className="text-sm text-gray-600">Page {page.pageNumber}</span>
</div>
</div>
))}
</div>
</div>
</div>
</div>
);
};
export default DocumentDetailView;

View File

@@ -0,0 +1,180 @@
/**
* DocumentGallery Component
* Main container for displaying documents in different view modes
*/
import React, { useState } from 'react';
import DocumentCard from './DocumentCard';
import DocumentDetailView from './DocumentDetailView';
import ViewModeSwitcher from './ViewModeSwitcher';
import EditDocumentModal from './EditDocumentModal';
import DeleteConfirmModal from './DeleteConfirmModal';
import { useDocuments } from '../../hooks/useDocuments';
/**
* DocumentGallery component
* @returns {JSX.Element}
*/
const DocumentGallery = () => {
const { documents, loading, error, updateDocument, deleteDocument } = useDocuments();
const [viewMode, setViewMode] = useState('large');
const [editingDocument, setEditingDocument] = useState(null);
const [deletingDocument, setDeletingDocument] = useState(null);
const [isSaving, setIsSaving] = useState(false);
const [isDeleting, setIsDeleting] = useState(false);
/**
* Handles opening the edit modal
* @param {Object} document - Document to edit
*/
const handleEditClick = (document) => {
setEditingDocument(document);
};
/**
* Handles opening the delete confirmation modal
* @param {Object} document - Document to delete
*/
const handleDeleteClick = (document) => {
setDeletingDocument(document);
};
/**
* Handles saving document changes
* @param {Object} updates - Updates object with tags and categories
*/
const handleSaveEdit = async (updates) => {
if (!editingDocument) return;
setIsSaving(true);
const success = await updateDocument(editingDocument.id, updates);
setIsSaving(false);
if (success) {
setEditingDocument(null);
}
};
/**
* Handles confirming document deletion
*/
const handleConfirmDelete = async () => {
if (!deletingDocument) return;
setIsDeleting(true);
const success = await deleteDocument(deletingDocument.id);
setIsDeleting(false);
if (success) {
setDeletingDocument(null);
}
};
/**
* Gets grid classes based on view mode
* @returns {string} Tailwind CSS classes
*/
const getGridClasses = () => {
switch (viewMode) {
case 'small':
return 'grid grid-cols-2 sm:grid-cols-3 md:grid-cols-4 lg:grid-cols-5 xl:grid-cols-6 gap-4';
case 'large':
return 'grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-6';
case 'detail':
return 'flex flex-col gap-6';
default:
return 'grid grid-cols-1 gap-4';
}
};
// Loading state
if (loading) {
return (
<div className="flex justify-center items-center min-h-[400px]">
<span className="loading loading-spinner loading-lg"></span>
</div>
);
}
// Error state
if (error) {
return (
<div className="alert alert-error">
<svg xmlns="http://www.w3.org/2000/svg" className="stroke-current shrink-0 h-6 w-6" fill="none" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M10 14l2-2m0 0l2-2m-2 2l-2-2m2 2l2 2m7-2a9 9 0 11-18 0 9 9 0 0118 0z" />
</svg>
<span>Error loading documents: {error}</span>
</div>
);
}
// Empty state
if (documents.length === 0) {
return (
<div className="flex flex-col items-center justify-center min-h-[400px] text-center">
<svg xmlns="http://www.w3.org/2000/svg" className="h-24 w-24 text-gray-300 mb-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={1.5} d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z" />
</svg>
<h3 className="text-xl font-semibold mb-2">No documents yet</h3>
<p className="text-gray-500">Upload your first document to get started</p>
</div>
);
}
return (
<div>
{/* Header with view mode switcher */}
<div className="flex justify-between items-center mb-6">
<div>
<h2 className="text-2xl font-bold">Documents</h2>
<p className="text-gray-500">{documents.length} document{documents.length !== 1 ? 's' : ''}</p>
</div>
<ViewModeSwitcher
currentMode={viewMode}
onModeChange={setViewMode}
/>
</div>
{/* Document grid/list */}
<div className={getGridClasses()}>
{documents.map(document => (
viewMode === 'detail' ? (
<DocumentDetailView
key={document.id}
document={document}
onEdit={() => handleEditClick(document)}
onDelete={() => handleDeleteClick(document)}
/>
) : (
<DocumentCard
key={document.id}
document={document}
viewMode={viewMode}
onEdit={() => handleEditClick(document)}
onDelete={() => handleDeleteClick(document)}
/>
)
))}
</div>
{/* Modals */}
<EditDocumentModal
isOpen={!!editingDocument}
document={editingDocument}
onClose={() => setEditingDocument(null)}
onSave={handleSaveEdit}
isSaving={isSaving}
/>
<DeleteConfirmModal
isOpen={!!deletingDocument}
document={deletingDocument}
onClose={() => setDeletingDocument(null)}
onConfirm={handleConfirmDelete}
isDeleting={isDeleting}
/>
</div>
);
};
export default DocumentGallery;

View File

@@ -0,0 +1,225 @@
/**
* EditDocumentModal Component
* Modal dialog for editing document tags and categories
*/
import React, { useState, useEffect } from 'react';
import { getAvailableTags, getAvailableCategories } from '../../services/documentService';
/**
* EditDocumentModal component
* @param {Object} props
* @param {boolean} props.isOpen - Whether the modal is open
* @param {Object|null} props.document - Document to edit
* @param {function(): void} props.onClose - Callback when modal is closed
* @param {function(Object): void} props.onSave - Callback when changes are saved
* @param {boolean} props.isSaving - Whether save is in progress
* @returns {JSX.Element}
*/
const EditDocumentModal = ({
isOpen,
document,
onClose,
onSave,
isSaving = false
}) => {
const [selectedTags, setSelectedTags] = useState([]);
const [selectedCategories, setSelectedCategories] = useState([]);
const [availableTags, setAvailableTags] = useState([]);
const [availableCategories, setAvailableCategories] = useState([]);
const [newTag, setNewTag] = useState('');
const [newCategory, setNewCategory] = useState('');
// Load available tags and categories
useEffect(() => {
const loadOptions = async () => {
const [tags, categories] = await Promise.all([
getAvailableTags(),
getAvailableCategories()
]);
setAvailableTags(tags);
setAvailableCategories(categories);
};
loadOptions();
}, []);
// Initialize selected values when document changes
useEffect(() => {
if (document) {
setSelectedTags(document.tags || []);
setSelectedCategories(document.categories || []);
}
}, [document]);
const handleAddTag = (tag) => {
if (tag && !selectedTags.includes(tag)) {
setSelectedTags([...selectedTags, tag]);
}
setNewTag('');
};
const handleRemoveTag = (tag) => {
setSelectedTags(selectedTags.filter(t => t !== tag));
};
const handleAddCategory = (category) => {
if (category && !selectedCategories.includes(category)) {
setSelectedCategories([...selectedCategories, category]);
}
setNewCategory('');
};
const handleRemoveCategory = (category) => {
setSelectedCategories(selectedCategories.filter(c => c !== category));
};
const handleSave = () => {
onSave({
tags: selectedTags,
categories: selectedCategories
});
};
if (!isOpen || !document) return null;
return (
<dialog className="modal modal-open">
<div className="modal-box max-w-2xl">
<h3 className="font-bold text-lg mb-4">Edit Document</h3>
<div className="mb-4">
<p className="text-sm text-gray-500">
Document: <span className="font-semibold">{document.name}</span>
</p>
</div>
{/* Tags Section */}
<div className="mb-6">
<label className="label">
<span className="label-text font-semibold">Tags</span>
</label>
{/* Selected Tags */}
<div className="flex flex-wrap gap-2 mb-3">
{selectedTags.map(tag => (
<div key={tag} className="badge badge-primary gap-2">
{tag}
<button
type="button"
className="btn btn-ghost btn-xs"
onClick={() => handleRemoveTag(tag)}
disabled={isSaving}
>
</button>
</div>
))}
</div>
{/* Add Tag */}
<div className="flex gap-2">
<select
className="select select-bordered flex-1"
value={newTag}
onChange={(e) => setNewTag(e.target.value)}
disabled={isSaving}
>
<option value="">Select a tag...</option>
{availableTags
.filter(tag => !selectedTags.includes(tag))
.map(tag => (
<option key={tag} value={tag}>{tag}</option>
))
}
</select>
<button
className="btn btn-primary"
onClick={() => handleAddTag(newTag)}
disabled={!newTag || isSaving}
>
Add
</button>
</div>
</div>
{/* Categories Section */}
<div className="mb-6">
<label className="label">
<span className="label-text font-semibold">Categories</span>
</label>
{/* Selected Categories */}
<div className="flex flex-wrap gap-2 mb-3">
{selectedCategories.map(category => (
<div key={category} className="badge badge-secondary gap-2">
{category}
<button
type="button"
className="btn btn-ghost btn-xs"
onClick={() => handleRemoveCategory(category)}
disabled={isSaving}
>
</button>
</div>
))}
</div>
{/* Add Category */}
<div className="flex gap-2">
<select
className="select select-bordered flex-1"
value={newCategory}
onChange={(e) => setNewCategory(e.target.value)}
disabled={isSaving}
>
<option value="">Select a category...</option>
{availableCategories
.filter(cat => !selectedCategories.includes(cat))
.map(cat => (
<option key={cat} value={cat}>{cat}</option>
))
}
</select>
<button
className="btn btn-secondary"
onClick={() => handleAddCategory(newCategory)}
disabled={!newCategory || isSaving}
>
Add
</button>
</div>
</div>
<div className="modal-action">
<button
className="btn btn-ghost"
onClick={onClose}
disabled={isSaving}
>
Cancel
</button>
<button
className="btn btn-primary"
onClick={handleSave}
disabled={isSaving}
>
{isSaving ? (
<>
<span className="loading loading-spinner loading-sm"></span>
Saving...
</>
) : (
'Save Changes'
)}
</button>
</div>
</div>
<form method="dialog" className="modal-backdrop" onClick={onClose}>
<button disabled={isSaving}>close</button>
</form>
</dialog>
);
};
export default EditDocumentModal;

View File

@@ -0,0 +1,46 @@
/**
* ViewModeSwitcher Component
* Allows users to switch between different view modes (small, large, detail)
*/
import React from 'react';
/**
* @typedef {'small' | 'large' | 'detail'} ViewMode
*/
/**
* ViewModeSwitcher component
* @param {Object} props
* @param {ViewMode} props.currentMode - Current active view mode
* @param {function(ViewMode): void} props.onModeChange - Callback when mode changes
* @returns {JSX.Element}
*/
const ViewModeSwitcher = ({ currentMode, onModeChange }) => {
const modes = [
{ id: 'small', label: 'Small', icon: '⊞' },
{ id: 'large', label: 'Large', icon: '⊡' },
{ id: 'detail', label: 'Detail', icon: '☰' }
];
return (
<div className="flex gap-2">
{modes.map(mode => (
<button
key={mode.id}
onClick={() => onModeChange(mode.id)}
className={`btn btn-sm ${
currentMode === mode.id ? 'btn-primary' : 'btn-ghost'
}`}
aria-label={`Switch to ${mode.label} view`}
title={`${mode.label} view`}
>
<span className="text-lg">{mode.icon}</span>
<span className="hidden sm:inline ml-1">{mode.label}</span>
</button>
))}
</div>
);
};
export default ViewModeSwitcher;

View File

@@ -0,0 +1,85 @@
/**
* Custom hook for managing documents
* Handles fetching, updating, and deleting documents
*/
import { useState, useEffect, useCallback } from 'react';
import * as documentService from '../services/documentService';
/**
* Hook for managing documents state and operations
* @returns {Object} Documents state and operations
*/
export const useDocuments = () => {
  const [documents, setDocuments] = useState([]);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState(null);

  /**
   * Fetches all documents from the service
   */
  const fetchDocuments = useCallback(async () => {
    try {
      setLoading(true);
      setError(null);
      const data = await documentService.getAllDocuments();
      // Consumers read `documents.length` / `documents.map`, so never store a
      // non-array value even if the service resolves to something else.
      setDocuments(Array.isArray(data) ? data : []);
    } catch (err) {
      setError(err.message);
      console.error('Error fetching documents:', err);
    } finally {
      setLoading(false);
    }
  }, []);

  /**
   * Updates a document's tags and categories and swaps the returned
   * document into the local list.
   * @param {string} id - Document ID
   * @param {Object} updates - Updates object
   * @returns {Promise<boolean>} Success status
   */
  const updateDocument = useCallback(async (id, updates) => {
    try {
      const updatedDoc = await documentService.updateDocument(id, updates);
      setDocuments(prevDocs =>
        prevDocs.map(doc => (doc.id === id ? updatedDoc : doc))
      );
      return true;
    } catch (err) {
      setError(err.message);
      console.error('Error updating document:', err);
      return false;
    }
  }, []);

  /**
   * Deletes a document and removes it from the local list.
   * @param {string} id - Document ID
   * @returns {Promise<boolean>} Success status
   */
  const deleteDocument = useCallback(async (id) => {
    try {
      await documentService.deleteDocument(id);
      setDocuments(prevDocs => prevDocs.filter(doc => doc.id !== id));
      return true;
    } catch (err) {
      setError(err.message);
      console.error('Error deleting document:', err);
      return false;
    }
  }, []);

  // Fetch documents on mount
  useEffect(() => {
    fetchDocuments();
  }, [fetchDocuments]);

  return {
    documents,
    loading,
    error,
    fetchDocuments,
    updateDocument,
    deleteDocument
  };
};

View File

@@ -0,0 +1,21 @@
/**
* DocumentsPage Component
* Main page for displaying and managing documents
*/
import React from 'react';
import DocumentGallery from '../components/documents/DocumentGallery';
/**
* DocumentsPage component
* @returns {JSX.Element}
*/
const DocumentsPage = () => {
return (
<div className="container mx-auto px-4 py-8">
<DocumentGallery />
</div>
);
};
export default DocumentsPage;

View File

@@ -0,0 +1,97 @@
/**
* Document Service
* Handles all API calls related to documents
* Currently using mock data for development
*/
import { mockDocuments, availableTags, availableCategories } from '../utils/mockData';
import api from '../utils/api';
// Simulate network delay
// Resolves (with no value) after `ms` milliseconds.
const delay = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/**
* Fetches all documents from the API
* @returns {Promise<Array>} Array of document objects
*/
export const getAllDocuments = async () => {
  try {
    const response = await api.get('/api/documents');
    return response.data;
  } catch (error) {
    console.error('Failed to fetch documents:', error);
    // Fallback to mock data in case of API error during development.
    // Return a copy so callers always receive an array and cannot mutate the
    // mock store through the returned reference.
    console.warn('Falling back to mock data');
    return [...mockDocuments];
  }
};
/**
* Fetches a single document by ID
* @param {string} id - Document ID
* @returns {Promise<Object|null>} Document object or null if not found
*/
export const getDocumentById = async (id) => {
  // Simulated latency, then a lookup in the in-memory mock store.
  await delay(300);
  const match = mockDocuments.find((candidate) => candidate.id === id);
  return match ? match : null;
};
/**
* Updates a document's tags and categories
* @param {string} id - Document ID
* @param {Object} updates - Object containing tags and/or categories
* @param {Array<string>} updates.tags - New tags array
* @param {Array<string>} updates.categories - New categories array
* @returns {Promise<Object>} Updated document object
*/
export const updateDocument = async (id, updates) => {
  await delay(400);

  const position = mockDocuments.findIndex((candidate) => candidate.id === id);
  if (position < 0) {
    throw new Error('Document not found');
  }

  // Shallow-merge the updates over the stored document and store the result.
  const merged = Object.assign({}, mockDocuments[position], updates);
  mockDocuments[position] = merged;
  return merged;
};
/**
* Deletes a document
* @param {string} id - Document ID
* @returns {Promise<boolean>} True if deletion was successful
*/
export const deleteDocument = async (id) => {
  await delay(300);

  const position = mockDocuments.findIndex((candidate) => candidate.id === id);
  if (position < 0) {
    throw new Error('Document not found');
  }

  // Remove the entry in place from the in-memory mock store.
  mockDocuments.splice(position, 1);
  return true;
};
/**
* Gets all available tags
* @returns {Promise<Array<string>>} Array of tag strings
*/
export const getAvailableTags = async () => {
  await delay(200);
  // Hand out a copy so callers cannot mutate the shared list.
  return Array.from(availableTags);
};
/**
* Gets all available categories
* @returns {Promise<Array<string>>} Array of category strings
*/
export const getAvailableCategories = async () => {
  await delay(200);
  // Hand out a copy so callers cannot mutate the shared list.
  return Array.from(availableCategories);
};

View File

@@ -0,0 +1,155 @@
/**
* Mock data for PDF documents
* This file provides sample data for development and testing purposes
*/
/**
* Generates a placeholder thumbnail URL
* @param {number} index - Document index for unique colors
* @returns {string} Placeholder image URL
*/
const generateThumbnailUrl = (index) => {
  // Background colours (hex, no leading '#'), cycled through by index.
  const palette = ['3B82F6', '10B981', 'F59E0B', 'EF4444', '8B5CF6', 'EC4899'];
  const background = palette[index % palette.length];
  return 'https://via.placeholder.com/300x400/' + background + '/FFFFFF?text=Page+1';
};
/**
* Mock documents data
* @type {Array<Object>}
*/
export const mockDocuments = [
{
id: 'doc-001',
name: 'Contrat-2025.pdf',
originalFileType: 'DOCX',
createdAt: '2025-10-01T10:30:00Z',
fileSize: 2048576, // 2 MB
pageCount: 12,
thumbnailUrl: generateThumbnailUrl(0),
pdfUrl: '/mock/contrat-2025.pdf',
tags: ['contrat', '2025'],
categories: ['legal']
},
{
id: 'doc-002',
name: 'Facture-Janvier.pdf',
originalFileType: 'XLSX',
createdAt: '2025-09-15T14:20:00Z',
fileSize: 512000, // 512 KB
pageCount: 3,
thumbnailUrl: generateThumbnailUrl(1),
pdfUrl: '/mock/facture-janvier.pdf',
tags: ['facture', 'comptabilité'],
categories: ['finance']
},
{
id: 'doc-003',
name: 'Présentation-Projet.pdf',
originalFileType: 'PPTX',
createdAt: '2025-09-28T09:15:00Z',
fileSize: 5242880, // 5 MB
pageCount: 24,
thumbnailUrl: generateThumbnailUrl(2),
pdfUrl: '/mock/presentation-projet.pdf',
tags: ['présentation', 'projet'],
categories: ['marketing']
},
{
id: 'doc-004',
name: 'Photo-Identité.pdf',
originalFileType: 'JPG',
createdAt: '2025-10-05T16:45:00Z',
fileSize: 204800, // 200 KB
pageCount: 1,
thumbnailUrl: generateThumbnailUrl(3),
pdfUrl: '/mock/photo-identite.pdf',
tags: ['photo', 'identité'],
categories: ['personnel']
},
{
id: 'doc-005',
name: 'Manuel-Utilisateur.pdf',
originalFileType: 'PDF',
createdAt: '2025-09-20T11:00:00Z',
fileSize: 3145728, // 3 MB
pageCount: 45,
thumbnailUrl: generateThumbnailUrl(4),
pdfUrl: '/mock/manuel-utilisateur.pdf',
tags: ['manuel', 'documentation'],
categories: ['technique']
},
{
id: 'doc-006',
name: 'Rapport-Annuel.pdf',
originalFileType: 'DOCX',
createdAt: '2025-08-30T13:30:00Z',
fileSize: 4194304, // 4 MB
pageCount: 67,
thumbnailUrl: generateThumbnailUrl(5),
pdfUrl: '/mock/rapport-annuel.pdf',
tags: ['rapport', 'annuel'],
categories: ['finance', 'management']
},
{
id: 'doc-007',
name: 'CV-Candidat.pdf',
originalFileType: 'DOCX',
createdAt: '2025-10-02T08:00:00Z',
fileSize: 153600, // 150 KB
pageCount: 2,
thumbnailUrl: generateThumbnailUrl(0),
pdfUrl: '/mock/cv-candidat.pdf',
tags: ['cv', 'recrutement'],
categories: ['rh']
},
{
id: 'doc-008',
name: 'Devis-Travaux.pdf',
originalFileType: 'XLSX',
createdAt: '2025-09-25T15:20:00Z',
fileSize: 409600, // 400 KB
pageCount: 5,
thumbnailUrl: generateThumbnailUrl(1),
pdfUrl: '/mock/devis-travaux.pdf',
tags: ['devis', 'travaux'],
categories: ['finance']
}
];
/**
* Available tags for documents
* @type {Array<string>}
*/
export const availableTags = [
'contrat',
'facture',
'présentation',
'photo',
'manuel',
'rapport',
'cv',
'devis',
'comptabilité',
'projet',
'identité',
'documentation',
'annuel',
'recrutement',
'travaux',
'2025'
];
/**
* Available categories for documents
* @type {Array<string>}
*/
export const availableCategories = [
'legal',
'finance',
'marketing',
'personnel',
'technique',
'management',
'rh'
];

View File

@@ -12,12 +12,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
texlive-xetex \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Change the user
USER 1002:1002
# Copy application code
COPY . .
# Command will be overridden by docker-compose
CMD ["celery", "-A", "main", "worker", "--loglevel=info"]

View File

@@ -5,10 +5,12 @@ email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pikepdf==9.11.0
pillow==11.3.0
pydantic==2.11.9
PyJWT==2.10.1
pymongo==4.15.0
PyMuPDF==1.26.4
pypandoc==1.15
python-multipart==0.0.20
redis==6.4.0

View File

@@ -1,22 +1,14 @@
import subprocess
import uuid
from pathlib import Path
import magic # python-magic
from tasks.common.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
class UnsupportedFileTypeError(Exception):
    """Signal that a file's detected type has no supported handling."""
def generate_uuid_filename() -> str:
    """Return a fresh random UUID4 rendered as a string, for use as a filename."""
    unique_id = uuid.uuid4()
    return str(unique_id)
def detect_file_type(file_path: str) -> str:
"""
Detect the type of file using python-magic.
@@ -28,12 +20,19 @@ def detect_file_type(file_path: str) -> str:
UnsupportedFileTypeError: If file type is not supported.
"""
mime = magic.from_file(file_path, mime=True)
extension = Path(file_path).suffix
if mime.startswith("text/"):
return "text"
elif mime.startswith("image/"):
return "image"
elif mime in ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",):
return "word"
elif mime == "application/pdf":
return "pdf"
elif mime == "application/vnd.ms-powerpoint":
return "powerpoint"
elif mime == "application/octet-stream" and extension in (".jpg", ".jpeg", ".png", ".gif"):
return "image"
else:
raise UnsupportedFileTypeError(f"Unsupported file type: {mime}")
@@ -72,31 +71,3 @@ def compress_pdf(input_pdf: str, output_pdf: str, quality: str = "ebook") -> Non
result = subprocess.run(cmd)
if result.returncode != 0:
raise RuntimeError(f"Ghostscript failed with return code {result.returncode}")
def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    """
    Convert a supported file to PDF and return the path of the result.

    The file type is determined with detect_file_type() and the matching
    converter class is instantiated and run.

    Args:
        filepath (str): Path to the input file.
        output_dir (str): Directory to save the output PDF.

    Returns:
        str: Path to the generated PDF.

    Raises:
        UnsupportedFileTypeError: Propagated from detect_file_type().
        ValueError: If the detected type has no converter registered here.
    """
    # Detected type -> converter class able to handle it.
    converters = {
        "text": TextToPdfConverter,
        "image": ImageToPdfConverter,
        "word": WordToPdfConverter,
    }

    file_type = detect_file_type(filepath)
    converter_cls = converters.get(file_type)
    if converter_cls is None:
        raise ValueError(f"Unsupported file type: {file_type}")

    return converter_cls(filepath, output_dir=output_dir).convert()

View File

@@ -0,0 +1,64 @@
import hashlib
import logging
import os
from pathlib import Path
from app.config import settings
logger = logging.getLogger(__name__)
def get_file_hash(file_bytes: bytes) -> str:
    """
    Compute the SHA256 digest of raw file content.

    Args:
        file_bytes: Raw file content as bytes

    Returns:
        The digest as a lowercase hexadecimal string
    """
    digest = hashlib.sha256()
    digest.update(file_bytes)
    return digest.hexdigest()
def get_object_path(file_hash):
    """
    Build the storage path of an object from its hash.

    The object lives under the configured objects folder, in a sub-folder
    named after the first 24 characters of the hash, in a file named after
    the full hash.

    :param file_hash: hash string identifying the object
    :return: path of the object file
    """
    objects_root = settings.get_objects_folder()
    shard_folder = file_hash[:24]
    return os.path.join(objects_root, shard_folder, file_hash)
def save_as_object(file_path, remove_on_success=True) -> str:
    """
    Read the file, get the hash and save using the hash as the filename.

    If an object with the same hash already exists, nothing is written and
    the source file is left in place.

    :param file_path: path of the file to store
    :param remove_on_success: when True, delete the source file after the
        object has been written
    :return: hash of the file
    """
    logger.info(f"Saving file {file_path} as object")
    path = Path(file_path)
    as_bytes = path.read_bytes()

    file_hash = get_file_hash(as_bytes)
    logger.info(f"File hash: {file_hash}")

    object_path = get_object_path(file_hash)
    if os.path.exists(object_path):
        logger.info(f"Object already exists: {object_path}")
        return file_hash

    # exist_ok avoids a FileExistsError when another process creates the same
    # folder between an existence check and the creation.
    os.makedirs(os.path.dirname(object_path), exist_ok=True)

    logger.info(f"Saving object to: {object_path}")
    with open(object_path, "wb") as f:
        f.write(as_bytes)

    if remove_on_success:
        logger.info(f"Removing file: {file_path}")
        path.unlink()

    return file_hash

View File

@@ -1,83 +0,0 @@
from abc import ABC, abstractmethod
from pathlib import Path
import pypandoc
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from tasks.common.converter_utils import generate_uuid_filename
class BaseConverter(ABC):
    """Common base for converters that turn one input file into a PDF."""

    def __init__(self, input_path: str, output_dir: str = ".") -> None:
        """Store the input path and pick a UUID-named PDF path in output_dir."""
        self.input_path = Path(input_path)
        self.output_dir = Path(output_dir)
        pdf_name = f"{generate_uuid_filename()}.pdf"
        self.output_path = self.output_dir / pdf_name

    @abstractmethod
    def convert(self) -> str:
        """Convert the input file to PDF and return the output path."""
class TextToPdfConverter(BaseConverter):
    """Render a UTF-8 text file onto A4 pages, one line of text per row."""

    def convert(self) -> str:
        """Write the text to the output PDF and return its path."""
        pdf = canvas.Canvas(str(self.output_path), pagesize=A4)
        _, page_height = A4
        first_row_y = page_height - 50

        with open(self.input_path, "r", encoding="utf-8") as source:
            cursor_y = first_row_y
            for raw_line in source:
                pdf.drawString(50, cursor_y, raw_line.strip())
                cursor_y -= 15
                # Below the bottom margin: start a new page from the top row.
                if cursor_y < 50:
                    pdf.showPage()
                    cursor_y = first_row_y

        pdf.save()
        return str(self.output_path)
class ImageToPdfConverter(BaseConverter):
    """Converter for image files to PDF."""

    def convert(self) -> str:
        """Save the image, converted to RGB, at the output path and return it."""
        # Use the image as a context manager so the underlying file handle is
        # released even if the conversion or the save fails.
        with Image.open(self.input_path) as image:
            rgb_image = image.convert("RGB")
            rgb_image.save(self.output_path)
        return str(self.output_path)
class WordToPdfConverter(BaseConverter):
    """Convert Word (.docx) files to PDF by delegating to pypandoc."""

    def convert(self) -> str:
        """Run the pypandoc conversion and return the output PDF path."""
        source = str(self.input_path)
        destination = str(self.output_path)
        pypandoc.convert_file(source, "pdf", outputfile=destination)
        return destination
# Placeholders for future extensions
class HtmlToPdfConverter(BaseConverter):
    """Stub: HTML input is not convertible yet."""

    def convert(self) -> str:
        """Always raise NotImplementedError."""
        message = "HTML to PDF conversion not implemented."
        raise NotImplementedError(message)


class ExcelToPdfConverter(BaseConverter):
    """Stub: Excel input is not convertible yet."""

    def convert(self) -> str:
        """Always raise NotImplementedError."""
        message = "Excel to PDF conversion not implemented."
        raise NotImplementedError(message)


class MarkdownToPdfConverter(BaseConverter):
    """Stub: Markdown input is not convertible yet."""

    def convert(self) -> str:
        """Always raise NotImplementedError."""
        message = "Markdown to PDF conversion not implemented."
        raise NotImplementedError(message)

View File

@@ -6,13 +6,14 @@ and update processing job statuses throughout the task lifecycle.
"""
import logging
import os
from typing import Any, Dict
from app.config import settings
from app.database.connection import get_database
from app.services.document_service import DocumentService
from app.models.job import ProcessingStatus
from app.services.document_service import DocumentService, DocumentAlreadyExists
from app.services.job_service import JobService
from tasks.common.converter_utils import convert_to_pdf
from tasks.main import celery_app
logger = logging.getLogger(__name__)
@@ -25,7 +26,8 @@ def get_services():
return document_service, job_service
@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
@celery_app.task(bind=True)
def process_document(self, filepath: str) -> Dict[str, Any]:
"""
Process a document file and extract its content.
@@ -46,30 +48,34 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
Exception: Any processing error (will trigger retry)
"""
task_id = self.request.id
logger.info(f"Starting document processing task {task_id} for file: {filepath}")
logger.info(f'Task {task_id} : Starting document processing for file: "{filepath}"')
# get services
document_service, job_service = get_services()
job = None
document = None
try:
# Step 1: Insert the document in DB
# Step 1: Create the document and a new job record for the document
document = document_service.create_document(filepath)
logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
# Step 2: Create a new job record for the document
job = job_service.create_job(task_id=task_id, document_id=document.id)
# Step 3: Mark job as started
job_service.mark_job_as_started(job_id=job.id)
logger.info(f"Job {task_id} marked as PROCESSING")
logger.info(f'Task {task_id} : Created document "{document.id}". Started job "{job.id}"')
# Step 4: Create the pdf version of the document
pdf_file_path = convert_to_pdf(filepath, settings.get_temp_folder())
logger.info(f"Task {task_id} : Creating associated PDF")
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
document_service.create_pdf(document.id)
logger.info(f"Task {task_id} : Creating thumbnail")
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.CREATING_THUMBNAIL)
document_service.create_thumbnail(document.id)
# remove the file from the watch folder
os.remove(filepath)
# Step x: Mark job as completed
job_service.mark_job_as_completed(job_id=job.id)
logger.info(f"Job {task_id} marked as COMPLETED")
logger.info(f"Task {task_id} marked as COMPLETED")
return {
"task_id": task_id,
@@ -77,6 +83,19 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
"status": "completed",
}
except DocumentAlreadyExists as e:
logger.info(f"Task {task_id} completed: {str(e)}")
if job is not None:
job_service.mark_job_as_completed(job_id=job.id)
logger.info(f"Job {task_id} marked as COMPLETED")
return {
"task_id": task_id,
"filepath": filepath,
"status": "completed",
"message": str(e),
}
except Exception as e:
error_message = f"Document processing failed: {str(e)}"
logger.error(f"Task {task_id} failed: {error_message}")
@@ -88,6 +107,11 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
logger.info(f"Job {task_id} marked as FAILED")
else:
logger.error(f"Failed to process {filepath}. error = {str(e)}")
if document is not None:
document_service.move_to_errors(document.id, filepath)
logger.info(f"Moved file {filepath} to errors/{document.id}")
except Exception as job_error:
logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")

View File

@@ -7,10 +7,10 @@ import logging
import os
from celery import Celery
from celery.signals import worker_process_init
from app.config import settings
# Environment variables
REDIS_URL = settings.get_redis_url()
MONGODB_URL = settings.get_mongodb_url()
@@ -38,11 +38,16 @@ celery_app.conf.update(
task_soft_time_limit=240, # 4 minutes
)
def global_init(**kwargs):
    """Log the worker start-up banner; keyword arguments are accepted and ignored."""
    star_rule = "*" * 45
    dashes = "--" * 5
    logger.info(star_rule)
    logger.info(dashes + " Starting MyDocManager worker " + dashes)
    logger.info(star_rule)
global_init()
if __name__ == "__main__":
# initialize temp folder if needed
tmp_folder = settings.get_temp_folder()
if not os.path.exists(tmp_folder):
logger.info(f"Creating temporary folder: {tmp_folder}")
os.makedirs(tmp_folder)
global_init()
celery_app.start()

View File

@@ -568,3 +568,137 @@ class TestFileTypeDetection:
"""Test unsupported file type raises ValueError."""
with pytest.raises(ValueError, match="Unsupported file type"):
document_service._detect_file_type("/path/to/document.xyz")
class TestCreatePdf:
"""Tests for create_pdf method."""
@patch('app.services.document_service.convert_to_pdf')
@patch('app.services.document_service.magic.from_buffer')
def test_i_can_create_pdf_successfully(
self,
mock_magic,
mock_convert_to_pdf,
document_service,
sample_file_bytes
):
"""Test creating PDF from an existing document."""
# Setup
mock_magic.return_value = "text/plain"
# Create a document first
created_doc = document_service.create_document(
"/test/test.txt",
sample_file_bytes,
"utf-8"
)
# Mock the PDF conversion
pdf_path = os.path.join(document_service.temp_folder, "converted.pdf")
mock_convert_to_pdf.return_value = pdf_path
# Write a sample PDF file that the conversion would create
pdf_content = b"This is PDF content"
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
with open(pdf_path, "wb") as f:
f.write(pdf_content)
# Execute
result = document_service.create_pdf(created_doc.id)
# Verify
assert result is True
# Get the updated document
updated_doc = document_service.get_document_by_id(created_doc.id)
assert updated_doc.pdf_file_hash is not None
# Verify the PDF content was saved
pdf_hash = document_service._calculate_file_hash(pdf_content)
assert updated_doc.pdf_file_hash == pdf_hash
# Verify convert_to_pdf was called with correct arguments
doc_path = document_service.get_document_path(created_doc.file_hash)
mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
# Verify content exists on disk
validate_file_saved(document_service, pdf_hash, pdf_content)
# Verify PDF hash was added to document
updated_doc = document_service.get_document_by_id(created_doc.id)
pdf_hash = document_service._calculate_file_hash(pdf_content)
assert updated_doc.pdf_file_hash == pdf_hash
@patch('app.services.document_service.convert_to_pdf')
@patch('app.services.document_service.magic.from_buffer')
def test_i_can_reuse_existing_pdf(
self,
mock_magic,
mock_convert_to_pdf,
document_service,
sample_file_bytes
):
"""Test that if PDF already exists, it doesn't recreate it."""
# Setup
mock_magic.return_value = "text/plain"
# Create a document first
created_doc = document_service.create_document(
"/test/test.txt",
sample_file_bytes,
"utf-8"
)
# Create a fake PDF file and update the document
pdf_content = b"This is PDF content"
pdf_hash = document_service._calculate_file_hash(pdf_content)
document_service.save_content_if_needed(pdf_hash, pdf_content)
document_service.update_document(created_doc.id, {"pdf_file_hash": pdf_hash})
# Execute
result = document_service.create_pdf(created_doc.id)
# Verify
assert result is True
# Verify convert_to_pdf was NOT called
mock_convert_to_pdf.assert_not_called()
def test_i_cannot_create_pdf_for_nonexistent_document(
self,
document_service
):
"""Test behavior when document ID doesn't exist."""
# Execute with random ObjectId
result = document_service.create_pdf(ObjectId())
# Verify
assert result is False
@patch('app.services.document_service.magic.from_buffer')
def test_i_cannot_create_pdf_when_file_content_missing(
    self,
    mock_magic,
    document_service,
    sample_file_bytes
):
    """Check that create_pdf returns False once the document's stored file is deleted."""
    # Arrange: create a plain-text document through the service.
    mock_magic.return_value = "text/plain"
    document = document_service.create_document("/test/test.txt", sample_file_bytes, "utf-8")

    # Delete the file at the path the service reports for this document's hash,
    # so the document record remains but its content is gone.
    os.remove(document_service.get_document_path(document.file_hash))

    # Act
    outcome = document_service.create_pdf(document.id)

    # Assert
    assert outcome is False

View File

@@ -417,6 +417,25 @@ class TestUpdateStatus:
# Verify exception details
assert exc_info.value.current_status == ProcessingStatus.FAILED
assert exc_info.value.target_status == ProcessingStatus.FAILED
def test_i_can_update_job_status(
    self,
    job_service,
    sample_document_id,
    sample_task_id
):
    """Test that a started job can be updated to the SAVING_OBJECT status."""
    # Create and start a job (it is never failed in this test)
    created_job = job_service.create_job(sample_document_id, sample_task_id)
    job_service.mark_job_as_started(created_job.id)

    # Execute the status update
    result = job_service.update_job_status(created_job.id, ProcessingStatus.SAVING_OBJECT)

    # Verify the new status is applied and no error message is set
    assert result is not None
    assert result.status == ProcessingStatus.SAVING_OBJECT
    assert result.error_message is None
class TestDeleteJob:

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest
from tasks.common.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
from app.utils.pdf_converter import TextToPdfConverter, ImageToPdfConverter, WordToPdfConverter
@pytest.fixture
@@ -20,10 +20,10 @@ def test_i_can_convert_text_to_pdf(temp_dir):
input_txt.write_text("Hello World!\nThis is a test.")
converter = TextToPdfConverter(str(input_txt), output_dir=temp_dir)
output_pdf = converter.convert()
converter.convert()
assert Path(output_pdf).exists()
assert output_pdf.endswith(".pdf")
assert Path(converter.output_path).exists()
assert str(converter.output_path).endswith(".pdf")
def test_i_can_convert_image_to_pdf(temp_dir):
@@ -34,10 +34,10 @@ def test_i_can_convert_image_to_pdf(temp_dir):
image.save(input_img)
converter = ImageToPdfConverter(str(input_img), output_dir=temp_dir)
output_pdf = converter.convert()
converter.convert()
assert Path(output_pdf).exists()
assert output_pdf.endswith(".pdf")
assert Path(converter.output_path).exists()
assert str(converter.output_path).endswith(".pdf")
def test_i_can_convert_word_to_pdf(temp_dir):
@@ -49,7 +49,7 @@ def test_i_can_convert_word_to_pdf(temp_dir):
doc.save(input_docx)
converter = WordToPdfConverter(str(input_docx), output_dir=temp_dir)
output_pdf = converter.convert()
converter.convert()
assert Path(output_pdf).exists()
assert output_pdf.endswith(".pdf")
assert Path(converter.output_path).exists()
assert str(converter.output_path).endswith(".pdf")