Thumbnails generated and displayed in the front end

This commit is contained in:
2025-10-07 00:16:49 +02:00
parent 79bfae4ba8
commit 477d6bf538
19 changed files with 860 additions and 54 deletions

View File

@@ -13,6 +13,7 @@ click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
cryptography==46.0.1
Deprecated==1.2.18
dnspython==2.8.0
ecdsa==0.19.1
email-validator==2.3.0
@@ -32,6 +33,7 @@ mongomock==4.3.0
mongomock-motor==0.0.36
motor==3.7.1
packaging==25.0
pikepdf==9.11.0
pillow==11.3.0
pipdeptree==2.28.0
pluggy==1.6.0
@@ -44,6 +46,7 @@ pydantic_core==2.33.2
Pygments==2.19.2
PyJWT==2.10.1
pymongo==4.15.1
PyMuPDF==1.26.4
pypandoc==1.15
pytest==8.4.2
pytest-asyncio==1.2.0
@@ -72,4 +75,5 @@ watchdog==6.0.0
watchfiles==1.1.0
wcwidth==0.2.13
websockets==15.0.1
wrapt==1.17.3
zipp==3.23.0

View File

@@ -9,6 +9,7 @@ from app.database.connection import get_database
from app.models.auth import UserRole
from app.models.user import UserInDB
from app.services.auth_service import AuthService
from app.services.document_service import DocumentService
from app.services.user_service import UserService
security = HTTPBearer()
@@ -25,6 +26,12 @@ def get_user_service() -> UserService:
return UserService(database)
def get_document_service() -> DocumentService:
    """
    FastAPI dependency that provides a DocumentService.

    Returns:
        A DocumentService built on the database returned by get_database()
    """
    return DocumentService(get_database())
def get_current_user(
credentials: HTTPAuthorizationCredentials = Depends(security),
user_service: UserService = Depends(get_user_service)
@@ -79,7 +86,7 @@ def get_current_user(
return user
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
"""
Dependency to ensure current user has admin role.

View File

@@ -0,0 +1,241 @@
"""
Document API routes.
This module provides REST endpoints for document management operations.
"""
import logging
import os
from typing import List, Optional
import fitz # PyMuPDF
from fastapi import APIRouter, Depends, HTTPException, Query, status, Path
from starlette.responses import Response
from app.api.dependencies import get_document_service, get_current_user
from app.models.document import DocumentResponse, FileDocument
from app.services.document_service import DocumentService
logger = logging.getLogger(__name__)
router = APIRouter(tags=["Documents"])
def _count_pdf_pages(pdf_file_path: str) -> int:
    """
    Return how many pages a PDF contains, as reported by PyMuPDF.

    Any failure while opening or reading the file is logged as a warning
    and reported as a page count of 0 instead of being raised.

    Args:
        pdf_file_path: Path to the PDF file

    Returns:
        Number of pages in the PDF, or 0 if the file cannot be read
    """
    try:
        with fitz.open(pdf_file_path) as pdf:
            total_pages = pdf.page_count
    except Exception as e:
        logger.warning(f"Could not count pages for PDF {pdf_file_path}: {e}")
        return 0
    return total_pages
def _build_object_url(file_hash: Optional[str]) -> Optional[str]:
"""
Build object URL from file hash.
Args:
file_hash: SHA256 hash of the file
Returns:
URL string or None if hash is not provided
"""
if not file_hash:
return None
return f"/api/objects/{file_hash}"
def _extract_metadata_field(metadata: dict, field_name: str) -> List[str]:
"""
Extract a list field from metadata dictionary.
Args:
metadata: Document metadata dictionary
field_name: Name of the field to extract
Returns:
List of strings, empty list if field doesn't exist or is not a list
"""
field_value = metadata.get(field_name, [])
if isinstance(field_value, list):
return [str(item) for item in field_value]
return []
def _map_file_document_to_response(
        document: FileDocument,
        document_service: DocumentService
) -> DocumentResponse:
    """
    Convert a stored FileDocument into the DocumentResponse sent to clients.

    The page count is read from the PDF on disk (0 when the document has no
    PDF hash or the PDF object does not exist). Thumbnail and PDF URLs are
    derived from their hashes, and tags/categories come from the metadata.

    Args:
        document: FileDocument instance from database
        document_service: Document service used to locate the PDF object

    Returns:
        DocumentResponse instance ready for API response
    """
    # Page count requires opening the PDF, so only do it when the object exists.
    pdf_hash = document.pdf_file_hash
    if pdf_hash and document_service.exists(pdf_hash):
        page_count = _count_pdf_pages(document_service.get_document_path(pdf_hash))
    else:
        page_count = 0

    detected_at = document.detected_at
    payload = {
        "id": str(document.id),
        "name": document.filename,
        "original_file_type": document.file_type.value.upper(),
        # Empty string when the detection timestamp is missing.
        "created_at": detected_at.isoformat() if detected_at else "",
        "file_size": document.file_size,
        "page_count": page_count,
        "thumbnail_url": _build_object_url(document.thumbnail_file_hash),
        "pdf_url": _build_object_url(pdf_hash),
        "tags": _extract_metadata_field(document.metadata, "tags"),
        "categories": _extract_metadata_field(document.metadata, "categories")
    }
    logger.info(f"Document: {payload}")

    return DocumentResponse(**payload)
@router.get("/documents", response_model=List[DocumentResponse])
def list_documents(
        skip: int = Query(0, ge=0, description="Number of documents to skip"),
        limit: int = Query(100, ge=1, le=1000, description="Maximum number of documents to return"),
        UserInDB=Depends(get_current_user),
        document_service: DocumentService = Depends(get_document_service)
) -> List[DocumentResponse]:
    """
    Retrieve a paginated list of documents.

    Args:
        skip: Number of documents to skip for pagination
        limit: Maximum number of documents to return
        UserInDB: Result of the get_current_user dependency. The value is
            not used in the body; the parameter exists so the dependency
            runs for every request to this route.
        document_service: Document service instance

    Returns:
        List of documents in API response format

    Raises:
        HTTPException: 500 if listing or mapping the documents fails
    """
    try:
        documents = document_service.list_documents(skip=skip, limit=limit)

        # Map every stored document to the API response format
        return [
            _map_file_document_to_response(doc, document_service)
            for doc in documents
        ]

    except Exception as e:
        # logger.exception records the traceback, which a plain error log
        # drops; chaining keeps the root cause attached to the 500.
        logger.exception(f"Failed to list documents: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to retrieve documents"
        ) from e
@router.get("/objects/{file_hash}")
async def get_object_by_hash(
        file_hash: str = Path(..., description="SHA256 hash of the object to retrieve"),
        document_service: DocumentService = Depends(get_document_service)
):
    """
    Serve object content by its hash.

    This endpoint serves files (original documents, PDFs, thumbnails) by their
    SHA256 hash. The MIME type is detected from the content itself.

    NOTE(review): unlike the /documents route, this route declares no
    get_current_user dependency - confirm that serving objects without
    authentication is intended.

    Args:
        file_hash: SHA256 hash of the object
        document_service: Document service dependency

    Returns:
        Response carrying the object bytes, the detected media type and a
        one-hour public cache header

    Raises:
        HTTPException: 404 if the object is unknown, missing on disk or has
            no content; 500 if reading or type detection fails
    """
    try:
        # Check if object exists
        if not document_service.exists(file_hash):
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object not found"
            )

        # Get file path
        file_path = document_service.get_document_path(file_hash)

        # Verify file exists on disk
        if not os.path.exists(file_path):
            logger.error(f"Object {file_hash} registered but file not found at {file_path}")
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object file not found on disk"
            )

        try:
            file_content = document_service.get_document_content_by_hash(file_hash)
            # Falsy content covers both "not readable" (None) and empty files.
            if not file_content:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail="Object content not available"
                )

            # Detect MIME type from the bytes (imported lazily, as before)
            import magic
            mime_type = magic.from_buffer(file_content, mime=True)

            # Return file content with appropriate headers
            return Response(
                content=file_content,
                media_type=mime_type,
                headers={
                    "Content-Length": str(len(file_content)),
                    "Cache-Control": "public, max-age=3600"  # Cache for 1 hour
                }
            )
        except HTTPException:
            # HTTPException is an Exception subclass: without this clause the
            # 404 raised just above would be rewritten into a 500 below.
            raise
        except Exception as e:
            logger.error(f"Error reading object content for hash {file_hash}: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to read object content"
            ) from e

    except HTTPException:
        # Re-raise HTTP exceptions as-is
        raise
    except Exception as e:
        logger.error(f"Unexpected error serving object {file_hash}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Internal server error while serving object"
        ) from e

View File

@@ -4,7 +4,7 @@ MongoDB database connection management.
This module handles MongoDB connection with fail-fast approach.
The application will terminate if MongoDB is not accessible at startup.
"""
import logging
import sys
from typing import Optional
@@ -13,11 +13,14 @@ from pymongo.database import Database
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from app.config.settings import get_mongodb_url, get_mongodb_database_name
from app.utils.security import safe_connection_string
# Global variables for singleton pattern
_client: Optional[MongoClient] = None
_database: Optional[Database] = None
logger = logging.getLogger(__name__)
def create_mongodb_client() -> MongoClient:
"""
@@ -43,16 +46,16 @@ def create_mongodb_client() -> MongoClient:
# Test connection by running admin command
client.admin.command('ping')
print(f"Successfully connected to MongoDB at {mongodb_url}")
logger.info(f"Successfully connected to MongoDB at {safe_connection_string(mongodb_url)}")
return client
except (ConnectionFailure, ServerSelectionTimeoutError) as e:
print(f"ERROR: Failed to connect to MongoDB at {mongodb_url}")
print(f"Connection error: {str(e)}")
print("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
logger.error(f"ERROR: Failed to connect to MongoDB at {safe_connection_string(mongodb_url)}")
logger.error(f"Connection error: {str(e)}")
logger.error("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
sys.exit(1)
except Exception as e:
print(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
logger.error(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
sys.exit(1)
@@ -74,7 +77,7 @@ def get_database() -> Database:
database_name = get_mongodb_database_name()
_database = _client[database_name]
print(f"Connected to database: {database_name}")
logger.info(f"Connected to database: {database_name}")
return _database
@@ -92,7 +95,7 @@ def close_database_connection():
_client.close()
_client = None
_database = None
print("MongoDB connection closed")
logger.info("MongoDB connection closed")
def get_mongodb_client() -> Optional[MongoClient]:

View File

@@ -17,6 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware
from app.api.routes.auth import router as auth_router
from app.api.routes.users import router as users_router
from app.api.routes.document import router as documents_router
from app.config import settings
from app.database.connection import get_database
from app.file_watcher import create_file_watcher, FileWatcher
@@ -111,7 +112,7 @@ app.add_middleware(
# Include routers
app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
app.include_router(users_router, prefix="/users", tags=["User Management"])
# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
app.include_router(documents_router, prefix="/api", tags=["Documents"])
# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])

View File

@@ -7,10 +7,9 @@ stored in MongoDB collections.
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
from typing import Any, Dict, List, Optional
from bson import ObjectId
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field, field_validator, ConfigDict
from app.models.types import PyObjectId
@@ -50,6 +49,7 @@ class FileDocument(BaseModel):
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
thumbnail_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the thumbnail")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
@@ -69,3 +69,28 @@ class FileDocument(BaseModel):
if not v.strip():
raise ValueError("Filename cannot be empty")
return v.strip()
def _snake_to_camel(field_name: str) -> str:
    """Convert a snake_case field name to camelCase (first word untouched)."""
    first, *others = field_name.split('_')
    return first + ''.join(part.capitalize() for part in others)


class DocumentResponse(BaseModel):
    """
    Document representation returned by the document API endpoints.

    Every field gets a camelCase alias generated from its snake_case name,
    and the model can still be populated using the snake_case names.
    """

    model_config = ConfigDict(alias_generator=_snake_to_camel, populate_by_name=True)

    id: str = Field(..., description="Document unique identifier")
    name: str = Field(..., description="Document filename")
    original_file_type: str = Field(..., description="Original file type before conversion")
    created_at: str = Field(..., description="ISO timestamp when document was created")
    file_size: int = Field(..., description="File size in bytes")
    page_count: int = Field(..., description="Number of pages in the document")
    thumbnail_url: Optional[str] = Field(default=None, description="URL to document thumbnail")
    pdf_url: Optional[str] = Field(default=None, description="URL to PDF version of document")
    tags: List[str] = Field(default_factory=list, description="Document tags")
    categories: List[str] = Field(default_factory=list, description="Document categories")

View File

@@ -16,6 +16,7 @@ class ProcessingStatus(str, Enum):
COMPLETED = "completed"
SAVING_OBJECT = "saving_object"
SAVING_PDF = "saving_pdf"
CREATING_THUMBNAIL = "creating_thumbnail"
FAILED = "failed"

View File

@@ -24,10 +24,22 @@ from app.models.document import (
)
from app.models.types import PyObjectId
from app.utils.pdf_converter import convert_to_pdf
from app.utils.pdf_thumbmail import PDFThumbnailGenerator
from app.utils.security import generate_uuid_filename
logger = logging.getLogger(__name__)
class DocumentAlreadyExists(Exception):
    """Raised when a document being created is already stored."""

    def __init__(self, message):
        super().__init__(message)
        # Kept as an attribute so handlers can read e.message directly.
        self.message = message
class DocumentProcessingError(Exception):
    """Raised when a document processing step cannot be completed."""

    def __init__(self, message):
        super().__init__(message)
        # Kept as an attribute so handlers can read e.message directly.
        self.message = message
class DocumentService:
"""
Service for orchestrated document and content management.
@@ -162,7 +174,7 @@ class DocumentService:
# Increment counter for next attempt
counter += 1
def _get_document_path(self, file_hash):
def get_document_path(self, file_hash):
"""
:param file_hash:
@@ -171,10 +183,12 @@ class DocumentService:
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
def exists(self, file_hash):
return os.path.exists(self._get_document_path(file_hash))
if file_hash is None:
return False
return os.path.exists(self.get_document_path(file_hash))
def save_content_if_needed(self, file_hash, content: bytes):
target_path = self._get_document_path(file_hash)
target_path = self.get_document_path(file_hash)
if os.path.exists(target_path):
return
@@ -192,7 +206,8 @@ class DocumentService:
def move_to_ignored(self, file_path, reason="Unknown"):
logger.info(f"Moving file {file_path} to ignored folder")
ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path)
ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(
file_path)
ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
shutil.move(file_path, ignored_file_path)
@@ -231,15 +246,16 @@ class DocumentService:
detected_at = datetime.now()
try:
logger.info(f"Creating Document for {file_path}")
logger.info(f'Creating Document for "{file_path}"')
# Skip the document if it already exists
same_document = self.document_repository.find_same_document(filename, file_hash)
if same_document is not None:
logger.info(f" Document with same hash already exists. Skipping...")
self.move_to_ignored(file_path, f"already exists ({same_document.id})")
raise DocumentAlreadyExists(f"Document with same hash already exists ({same_document.id})")
self.save_content_if_needed(file_hash, file_bytes)
logger.info(f" Saved content to {self._get_document_path(file_hash)}")
logger.info(f" Saved content to {self.get_document_path(file_hash)}")
# Create FileDocument
file_data = FileDocument(
@@ -255,11 +271,13 @@ class DocumentService:
mime_type=mime_type
)
created_file = self.document_repository.create_document(file_data)
logger.info(f" Created document with id '{created_file.id}'")
created_document = self.document_repository.create_document(file_data)
logger.info(f" Created document with id '{created_document.id}'")
return created_file
return created_document
except DocumentAlreadyExists as e:
raise e
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to create document: {str(e)}")
@@ -273,40 +291,69 @@ class DocumentService:
document = self.get_document_by_id(document_id)
if document is None:
logger.error(f" Document not found")
raise ValueError(f"Document {document_id} not found")
raise DocumentProcessingError(f"Document {document_id} not found.")
# try to find another document that has the same hash
document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
# the pdf will be created only if it does not exist yet
if (document_with_same_hash is not None and
document_with_same_hash.pdf_file_hash and
self.exists(document_with_same_hash.pdf_file_hash)):
logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
if document_with_same_hash and self.exists(document_with_same_hash.pdf_file_hash):
logger.info(f'Found document with same hash. Will use pdf "{document_with_same_hash.pdf_file_hash}".')
self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
return True
return
# get the content of the file
logger.info(f" No document with same hash found and valid pdf found. Will create new pdf")
logger.info(f" No document with same hash and valid pdf found. Will create new pdf content.")
file_bytes = self.get_document_content_by_hash(document.file_hash)
if file_bytes is None:
logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
return False
logger.error(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
raise DocumentProcessingError(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
# create the pdf file
temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
temp_pdf_file = convert_to_pdf(self.get_document_path(document.file_hash), self.temp_folder)
pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file))
self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file))
logger.info(f" Created new pdf file with hash {pdf_file_hash}")
# remove the temporary file
os.remove(temp_pdf_file)
logger.info(f" Removed temporary pdf file {temp_pdf_file}")
os.remove(temp_pdf_file) # remove the temporary file
logger.info(f' Created new pdf file with hash "{pdf_file_hash}"')
# update the document
self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
def create_thumbnail(self, document_id: PyObjectId):
    """
    Create (or reuse) the thumbnail of a document and record its hash.

    If the document returned by get_document_with_pdf_hash for this file
    hash already has a thumbnail stored, that thumbnail hash is reused.
    Otherwise the first page of the document's PDF is rendered to a
    200 px wide PNG in the temp folder, stored in the objects folder under
    its content hash, and the temporary file is removed.

    Args:
        document_id: Identifier of the document to process

    Raises:
        DocumentProcessingError: If the document or its PDF cannot be found
    """
    logger.info(f'Creating thumbnail document for "{document_id}"')
    document = self.get_document_by_id(document_id)
    if document is None:
        logger.error(f" Document not found !")
        raise DocumentProcessingError(f"Document {document_id} not found.")

    # try to find another document that has the same hash
    document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)

    # We will use the thumbnail of the pdf if it exists
    if document_with_same_hash and self.exists(document_with_same_hash.thumbnail_file_hash):
        logger.info(f" Found document with same hash. Will use thumbnail {document_with_same_hash.thumbnail_file_hash}")
        self.update_document(document_id, {"thumbnail_file_hash": document_with_same_hash.thumbnail_file_hash})
        return

    logger.info(f" No document with same hash and valid thumbnail found. Will create new thumbnail")

    if not self.exists(document.pdf_file_hash):
        logger.error(f" PDF file not found.")
        raise DocumentProcessingError(f"PDF file for document {document_id} not found")

    tmp_thumbnail_path = os.path.join(self.temp_folder, f"{generate_uuid_filename()}.png")
    try:
        with PDFThumbnailGenerator(self.get_document_path(document.pdf_file_hash)) as gen:
            # create the thumbnail
            gen.create_thumbnail(tmp_thumbnail_path, page_num=0, width=200)

        # Read the rendered file once and reuse the bytes for hashing and saving
        thumbnail_bytes = self._read_file_bytes(tmp_thumbnail_path)
        thumbnail_file_hash = self._calculate_file_hash(thumbnail_bytes)

        # save the thumbnail to the objects folder
        self.save_content_if_needed(thumbnail_file_hash, thumbnail_bytes)
    finally:
        # Never leave the temporary PNG behind, even when a step above fails;
        # the file may not exist yet if rendering itself raised.
        if os.path.exists(tmp_thumbnail_path):
            os.remove(tmp_thumbnail_path)

    # update the document
    self.update_document(document_id, {"thumbnail_file_hash": thumbnail_file_hash})
    logger.info(f" Created thumbnail {thumbnail_file_hash}")
def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
"""
@@ -348,7 +395,7 @@ class DocumentService:
return self.document_repository.find_document_by_filepath(filepath)
def get_document_content_by_hash(self, file_hash):
target_path = self._get_document_path(file_hash)
target_path = self.get_document_path(file_hash)
if not os.path.exists(target_path):
return None
@@ -439,7 +486,7 @@ class DocumentService:
# If no other files reference this content, delete it
if not remaining_files:
try:
os.remove(self._get_document_path(document.file_hash))
os.remove(self.get_document_path(document.file_hash))
except Exception:
pass

View File

@@ -0,0 +1,241 @@
import fitz # PyMuPDF
class PDFAnnotator:
    """
    Thin wrapper around a PyMuPDF document for adding, listing, removing
    and saving page annotations. Usable as a context manager, which closes
    the document on exit.
    """

    # Integer values of PyMuPDF constants, named to avoid magic numbers.
    _LINE_END_NONE = 0           # fitz.PDF_ANNOT_LE_NONE
    _LINE_END_CLOSED_ARROW = 5   # fitz.PDF_ANNOT_LE_CLOSED_ARROW
    _ANNOT_TYPE_REDACT = 12      # fitz.PDF_ANNOT_REDACT

    def __init__(self, pdf_path):
        """
        Open the PDF to annotate.

        Args:
            pdf_path: Path of the PDF file passed to fitz.open
        """
        self.doc = fitz.open(pdf_path)

    def add_highlight(self, rect, page_num=0, color=(1, 1, 0)):
        """
        Add highlight annotation

        Args:
            rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
            page_num: Page number (0-indexed), default first page
            color: RGB tuple (0-1 range), default yellow

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_highlight_annot(rect)
        annot.set_colors(stroke=color)
        annot.update()
        return annot

    def add_rectangle(self, rect, page_num=0, color=(1, 0, 0), width=2):
        """
        Add rectangle annotation (border only)

        Args:
            rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
            page_num: Page number (0-indexed), default first page
            color: RGB tuple (0-1 range), default red
            width: Line width in points

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_rect_annot(rect)
        annot.set_colors(stroke=color)
        annot.set_border(width=width)
        annot.update()
        return annot

    def add_text_note(self, point, text, page_num=0, icon="Note"):
        """
        Add sticky note annotation

        Args:
            point: (x, y) position tuple
            text: Note content string
            page_num: Page number (0-indexed), default first page
            icon: "Note", "Comment", "Help", "Insert", "Key", etc.

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_text_annot(point, text, icon=icon)
        annot.update()
        return annot

    def add_free_text(self, rect, text, page_num=0, fontsize=12,
                      color=(0, 0, 0)):
        """
        Add free text annotation (visible text box)

        Args:
            rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
            text: Text content string
            page_num: Page number (0-indexed), default first page
            fontsize: Font size in points
            color: Text color RGB tuple (0-1 range)

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_freetext_annot(
            rect,
            text,
            fontsize=fontsize,
            text_color=color
        )
        annot.update()
        return annot

    def add_arrow(self, start_point, end_point, page_num=0,
                  color=(1, 0, 0), width=2):
        """
        Add arrow annotation (line with a closed arrow head at its end)

        Args:
            start_point: (x, y) tuple for arrow start
            end_point: (x, y) tuple for arrow end
            page_num: Page number (0-indexed), default first page
            color: Arrow color RGB tuple (0-1 range), default red
            width: Line width in points

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_line_annot(start_point, end_point)
        annot.set_colors(stroke=color)
        annot.set_border(width=width)
        # No decoration at the start, closed arrow head at the end.
        # (Line-end style 1 is a square in PyMuPDF; closed arrow is 5.)
        annot.set_line_ends(self._LINE_END_NONE, self._LINE_END_CLOSED_ARROW)
        annot.update()
        return annot

    def add_stamp(self, rect, page_num=0, stamp_type=0):
        """
        Add stamp annotation

        Args:
            rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
            page_num: Page number (0-indexed), default first page
            stamp_type: Integer stamp type as defined by PyMuPDF's STAMP_*
                constants: 0=Approved, 1=AsIs, 2=Confidential,
                3=Departmental, 4=Experimental, 5=Expired, 6=Final,
                7=ForComment, 8=ForPublicRelease, 9=NotApproved,
                10=NotForPublicRelease, 11=Sold, 12=TopSecret, 13=Draft

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_stamp_annot(rect, stamp=stamp_type)
        annot.update()
        return annot

    def add_redaction(self, rect, page_num=0, fill_color=(0, 0, 0)):
        """
        Add redaction annotation (marks area for redaction)
        Note: Use apply_redactions() to permanently remove content

        Args:
            rect: (x0, y0, x1, y1) area to redact, tuple or fitz.Rect
            page_num: Page number (0-indexed), default first page
            fill_color: RGB tuple (0-1 range) for redacted area, default black

        Returns:
            The created annotation
        """
        page = self.doc[page_num]
        annot = page.add_redact_annot(rect, fill=fill_color)
        annot.update()
        return annot

    def apply_redactions(self, page_num=0, images=2, graphics=2, text=2):
        """
        Apply all redaction annotations on a page (permanent removal)

        The three option values are forwarded unchanged to PyMuPDF's
        Page.apply_redactions.

        NOTE(review): PyMuPDF documents images as 0=ignore, 1=remove image,
        2=blank out covered pixels; graphics as 0=ignore, 1=remove if fully
        covered, 2=remove if touched; text as 0=remove, 1=keep. Confirm that
        the default text=2 used here does what is intended with the
        installed PyMuPDF version.

        Args:
            page_num: Page number (0-indexed), default first page
            images: Image handling option passed to PyMuPDF
            graphics: Vector graphics handling option passed to PyMuPDF
            text: Text handling option passed to PyMuPDF

        Returns:
            True if redactions were applied, False otherwise
        """
        page = self.doc[page_num]
        # Only call into PyMuPDF when the page carries a redaction annotation
        has_redactions = any(
            annot.type[0] == self._ANNOT_TYPE_REDACT for annot in page.annots()
        )
        if has_redactions:
            page.apply_redactions(images=images, graphics=graphics, text=text)
            return True
        return False

    def get_all_annotations(self, page_num=0):
        """
        Retrieve all annotations from a page

        Args:
            page_num: Page number (0-indexed), default first page

        Returns:
            List of dicts with keys 'type', 'rect', 'content', 'author',
            'created' and 'colors'
        """
        page = self.doc[page_num]
        annotations = []
        for annot in page.annots():
            info = {
                'type': annot.type[1],  # Annotation type name
                'rect': annot.rect,
                'content': annot.info.get('content', ''),
                'author': annot.info.get('title', ''),
                'created': annot.info.get('creationDate', ''),
                'colors': annot.colors
            }
            annotations.append(info)
        return annotations

    def remove_all_annotations(self, page_num=0):
        """
        Remove all annotations from a page

        Args:
            page_num: Page number (0-indexed), default first page
        """
        page = self.doc[page_num]
        for annot in page.annots():
            page.delete_annot(annot)

    def save(self, output_path):
        """Save the annotated PDF"""
        self.doc.save(output_path)

    def close(self):
        """Close the underlying document"""
        self.doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
# Example usage: annotate the first page of "input.pdf" and save a copy
if __name__ == "__main__":
    with PDFAnnotator("input.pdf") as annotator:
        # Yellow highlight
        annotator.add_highlight((100, 100, 300, 120), page_num=0, color=(1, 1, 0))

        # Red rectangle border
        annotator.add_rectangle((100, 150, 300, 250), page_num=0, color=(1, 0, 0), width=3)

        # Sticky note
        annotator.add_text_note((400, 100), "This is important!", page_num=0, icon="Comment")

        # Visible text box
        annotator.add_free_text(
            (100, 300, 400, 350), "DRAFT VERSION",
            page_num=0, fontsize=20, color=(1, 0, 0),
        )

        # Arrow pointing to something
        annotator.add_arrow((450, 100), (500, 200), page_num=0, color=(0, 0, 1), width=2)

        # "Approved" stamp
        annotator.add_stamp((450, 300, 550, 350), page_num=0, stamp_type=0)

        # Redaction (black box over sensitive info), applied immediately
        annotator.add_redaction((100, 400, 300, 420), page_num=0)
        annotator.apply_redactions(page_num=0)

        # Report what is now on the page
        found = annotator.get_all_annotations(page_num=0)
        print(f"Found {len(found)} annotations:")
        for entry in found:
            print(f" - {entry['type']} at {entry['rect']}")

        # Write the annotated PDF
        annotator.save("output_annotated.pdf")

View File

@@ -127,6 +127,15 @@ class TextToPdfConverter(BaseConverter):
return self
class PdfToPdfConverter(BaseConverter):
    """Converter for PDF files to PDF: the input is already a PDF, so it is copied."""

    def convert(self) -> Self:
        """
        Copy the input PDF to the output path.

        Returns:
            This converter instance

        Raises:
            OSError: If the file cannot be copied
        """
        # Local import so this block does not depend on the file's import list.
        import shutil

        # Copy through the standard library instead of a shell "cp" command:
        # paths containing spaces or shell metacharacters are handled safely,
        # it works on every platform, and a failed copy raises instead of
        # being silently ignored.
        shutil.copy(self.input_path, self.output_path)
        return self
class ImageToPdfConverter(BaseConverter):
"""Converter for image files to PDF."""
@@ -191,6 +200,8 @@ def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
converter = ImageToPdfConverter(filepath, output_dir=output_dir)
elif file_type == "word":
converter = WordToPdfConverter(filepath, output_dir=output_dir)
elif file_type == "pdf":
converter = PdfToPdfConverter(filepath, output_dir=output_dir)
else:
raise ValueError(f"Unsupported file type: {file_type}")

View File

@@ -0,0 +1,167 @@
from pathlib import Path
import fitz # PyMuPDF
class PDFThumbnailGenerator:
def __init__(self, pdf_path):
"""
Initialize PDF thumbnail generator
Args:
pdf_path: Path to the PDF file (string or Path object)
"""
self.pdf_path = pdf_path
self.doc = fitz.open(pdf_path)
def create_thumbnail(self, output_path, page_num=0, width=200, rotation=0, zoom_factor=1.0):
"""
Create a thumbnail with zoom and rotation
Args:
output_path: Path to save the thumbnail (string or Path)
page_num: Page number (0-indexed), default first page
width: Desired width in pixels, default 200
rotation: Rotation angle in degrees (0, 90, 180, 270), default 0
zoom_factor: Additional zoom multiplier (1.0 = normal, 2.0 = 2x), default 1.0
Returns:
Dict with thumbnail info (width, height, rotation, zoom)
"""
page = self.doc[page_num]
# Apply rotation to page
page.set_rotation(rotation)
# Calculate zoom to achieve desired width
base_zoom = width / page.rect.width
final_zoom = base_zoom * zoom_factor
# Create transformation matrix
mat = fitz.Matrix(final_zoom, final_zoom)
# Render page to pixmap
pix = page.get_pixmap(matrix=mat, alpha=False)
# Save thumbnail
pix.save(output_path)
return {
'width': pix.width,
'height': pix.height,
'rotation': rotation,
'zoom': zoom_factor
}
def create_cropped_thumbnail(self, output_path, crop_rect=None, page_num=0, width=200):
"""
Create a thumbnail of a specific region (zoom on area)
Args:
output_path: Path to save the thumbnail (string or Path)
crop_rect: Tuple (x0, y0, x1, y1) in PDF coordinates for cropping,
or None for full page, default None
page_num: Page number (0-indexed), default first page
width: Desired width in pixels, default 200
Returns:
Tuple (width, height) of the generated thumbnail
"""
page = self.doc[page_num]
if crop_rect:
# Create rectangle for cropping
rect = fitz.Rect(crop_rect)
zoom = width / rect.width
else:
rect = page.rect
zoom = width / page.rect.width
mat = fitz.Matrix(zoom, zoom)
# Render only the specified rectangle
pix = page.get_pixmap(matrix=mat, clip=rect)
pix.save(output_path)
return pix.width, pix.height
def get_page_info(self, page_num=0):
"""
Get information about a specific page
Args:
page_num: Page number (0-indexed), default first page
Returns:
Dict with page information (width, height, rotation, number, total_pages)
"""
page = self.doc[page_num]
return {
'width': page.rect.width,
'height': page.rect.height,
'rotation': page.rotation,
'number': page_num + 1,
'total_pages': len(self.doc)
}
def create_multi_resolution_thumbnails(self, output_folder, page_num=0, sizes=(150, 300, 600)):
"""
Create multiple thumbnails at different resolutions
Args:
output_folder: Folder path to save thumbnails (string or Path)
page_num: Page number (0-indexed), default first page
sizes: List of widths in pixels, default [150, 300, 600]
Returns:
Dict mapping each size to thumbnail info
"""
output_folder = Path(output_folder)
output_folder.mkdir(exist_ok=True, parents=True)
results = {}
for size in sizes:
output_path = output_folder / f"thumb_{size}px.png"
info = self.create_thumbnail(output_path, page_num=page_num, width=size)
results[size] = info
return results
def close(self):
"""Close the PDF document and free resources"""
self.doc.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the document when the `with` block ends.

    Returns None, so an exception raised inside the block is not suppressed.
    """
    self.close()
# Example usage
if __name__ == "__main__":
    # The context manager closes the document when the block ends
    with PDFThumbnailGenerator("example.pdf") as generator:
        # Plain thumbnail of the first page, 200 px wide
        generator.create_thumbnail("thumb_standard.png", page_num=0, width=200)

        # Same page rendered with a 90 degree rotation
        generator.create_thumbnail(
            "thumb_rotated.png", page_num=0, width=200, rotation=90
        )

        # Same page rendered with a 2x zoom factor
        generator.create_thumbnail(
            "thumb_zoomed.png", page_num=0, width=200, zoom_factor=2.0
        )

        # Only the region (x0, y0, x1, y1) of the page, 300 px wide
        crop_area = (100, 100, 400, 400)
        generator.create_cropped_thumbnail(
            "thumb_crop.png", crop_rect=crop_area, page_num=0, width=300
        )

        # One thumbnail per requested width, all in the same folder
        widths = [150, 300, 600]
        generator.create_multi_resolution_thumbnails(
            "thumbnails/", page_num=0, sizes=widths
        )

        # Print the page geometry and page count
        page_info = generator.get_page_info(page_num=0)
        print(f"Page info: {page_info}")

View File

@@ -4,9 +4,10 @@ Password security utilities using bcrypt for secure password hashing.
This module provides secure password hashing and verification functions
using the bcrypt algorithm with automatic salt generation.
"""
import re
import uuid
import bcrypt
from typing import Union
def hash_password(password: str) -> str:
@@ -71,4 +72,33 @@ def verify_password(password: str, hashed_password: str) -> bool:
# bcrypt raises ValueError for malformed hashes
raise RuntimeError(f"Invalid hash format: {str(e)}")
except Exception as e:
raise RuntimeError(f"Failed to verify password: {str(e)}")
raise RuntimeError(f"Failed to verify password: {str(e)}")
def generate_uuid_filename() -> str:
    """Return a freshly generated random UUID (version 4) as a string, for use as a unique filename."""
    unique_id = uuid.uuid4()
    return str(unique_id)
def safe_connection_string(connection_string: str) -> str:
    """
    Mask the password in a MongoDB connection string.

    Handles both the `mongodb://` and `mongodb+srv://` schemes. Strings
    without credentials are returned unchanged.

    Args:
        connection_string (str): The complete MongoDB connection string

    Returns:
        str: The connection string with password replaced by asterisks

    Example:
        >>> safe_connection_string("mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin")
        'mongodb://admin:*****@mongodb:27017/mydocmanager?authSource=admin'
    """
    # Format: mongodb[+srv]://username:password@host[:port]/database
    #  group 1: scheme and username, up to and including the ':' separator
    #  group 2: the password. It is matched greedily but never across '/',
    #           so it ends at the LAST '@' of the authority part and a
    #           password containing an unescaped '@' is masked in full.
    #  group 3: the '@' that separates the credentials from the host list
    pattern = r'(mongodb(?:\+srv)?://[^:@/]+:)([^/]+)(@)'

    # Replace the password with a fixed number of asterisks so the output
    # does not reveal the password length either.
    return re.sub(pattern, r'\1*****\3', connection_string)

View File

@@ -10,6 +10,7 @@ pillow==11.3.0
pydantic==2.11.9
PyJWT==2.10.1
pymongo==4.15.0
PyMuPDF==1.26.4
pypandoc==1.15
python-multipart==0.0.20
redis==6.4.0

View File

@@ -1,11 +1,13 @@
import {FaBuffer, FaPlus} from "react-icons/fa6";
import { Link } from "react-router-dom";
const Menu = () => {
return (
<div className="p-4">
<ul className="menu">
<li className="menu-title">Exploration</li>
<li><a><FaBuffer/>To Review</a></li>
<li><Link to="/dashboard"><FaBuffer/>Dashboard</Link></li>
<li><Link to="/documents"><FaBuffer/>To Review</Link></li>
<li className="menu-title mt-4">Catégories</li>
<li><a><i className="fas fa-plus"></i>Item</a></li>
</ul>

View File

@@ -64,8 +64,8 @@ const DocumentCard = memo(({ document, viewMode, onEdit, onDelete }) => {
const renderThumbnail = () => (
<figure className="relative overflow-hidden">
<img
src={thumbnailUrl}
alt={`${name} thumbnail`}
src={`http://localhost:8000${thumbnailUrl}`}
alt={`${thumbnailUrl} thumbnail`}
className={`w-full object-cover ${
viewMode === 'small' ? 'h-32' : viewMode === 'large' ? 'h-48' : 'h-64'
}`}

View File

@@ -5,17 +5,24 @@
*/
import { mockDocuments, availableTags, availableCategories } from '../utils/mockData';
import api from '../utils/api';
// Simulate network delay
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
/**
 * Fetches all documents from the API.
 *
 * If the request fails, the error is logged and a copy of the bundled mock
 * documents is returned instead, so callers always receive an array.
 *
 * @returns {Promise<Array>} Array of document objects
 */
export const getAllDocuments = async () => {
  try {
    const response = await api.get('/api/documents');
    return response.data;
  } catch (error) {
    console.error('Failed to fetch documents:', error);
    // Fallback to mock data in case of API error during development
    console.warn('Falling back to mock data');
    // Return a copy so callers cannot mutate the shared mock array
    return [...mockDocuments];
  }
};
/**

View File

@@ -10,6 +10,7 @@ pillow==11.3.0
pydantic==2.11.9
PyJWT==2.10.1
pymongo==4.15.0
PyMuPDF==1.26.4
pypandoc==1.15
python-multipart==0.0.20
redis==6.4.0

View File

@@ -12,7 +12,7 @@ from typing import Any, Dict
from app.config import settings
from app.database.connection import get_database
from app.models.job import ProcessingStatus
from app.services.document_service import DocumentService
from app.services.document_service import DocumentService, DocumentAlreadyExists
from app.services.job_service import JobService
from tasks.main import celery_app
@@ -26,7 +26,7 @@ def get_services():
return document_service, job_service
#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
@celery_app.task(bind=True)
def process_document(self, filepath: str) -> Dict[str, Any]:
"""
@@ -48,7 +48,7 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
Exception: Any processing error (will trigger retry)
"""
task_id = self.request.id
logger.info(f"Starting document processing task {task_id} for file: {filepath}")
logger.info(f'Task {task_id} : Starting document processing for file: "{filepath}"')
# get services
document_service, job_service = get_services()
@@ -60,12 +60,16 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
document = document_service.create_document(filepath)
job = job_service.create_job(task_id=task_id, document_id=document.id)
job_service.mark_job_as_started(job_id=job.id)
logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
logger.info(f'Task {task_id} : Created document "{document.id}". Started job "{job.id}"')
logger.info(f"Task {task_id} : Creating associated PDF")
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
document_service.create_pdf(document.id)
logger.info(f"Task {task_id} : Creating thumbnail")
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.CREATING_THUMBNAIL)
document_service.create_thumbnail(document.id)
# remove the file from the watch folder
os.remove(filepath)
@@ -79,6 +83,19 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
"status": "completed",
}
except DocumentAlreadyExists as e:
logger.info(f"Task {task_id} completed: {str(e)}")
if job is not None:
job_service.mark_job_as_completed(job_id=job.id)
logger.info(f"Job {task_id} marked as COMPLETED")
return {
"task_id": task_id,
"filepath": filepath,
"status": "completed",
"message": str(e),
}
except Exception as e:
error_message = f"Document processing failed: {str(e)}"
logger.error(f"Task {task_id} failed: {error_message}")

View File

@@ -618,7 +618,7 @@ class TestCreatePdf:
assert updated_doc.pdf_file_hash == pdf_hash
# Verify convert_to_pdf was called with correct arguments
doc_path = document_service._get_document_path(created_doc.file_hash)
doc_path = document_service.get_document_path(created_doc.file_hash)
mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
# Verify content exists on disk
@@ -694,7 +694,7 @@ class TestCreatePdf:
)
# Simulate missing content by removing file
file_path = document_service._get_document_path(created_doc.file_hash)
file_path = document_service.get_document_path(created_doc.file_hash)
os.remove(file_path)
# Execute