diff --git a/requirements.txt b/requirements.txt index de1f41e..3e007ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ click-didyoumean==0.3.1 click-plugins==1.1.1.2 click-repl==0.3.0 cryptography==46.0.1 +Deprecated==1.2.18 dnspython==2.8.0 ecdsa==0.19.1 email-validator==2.3.0 @@ -32,6 +33,7 @@ mongomock==4.3.0 mongomock-motor==0.0.36 motor==3.7.1 packaging==25.0 +pikepdf==9.11.0 pillow==11.3.0 pipdeptree==2.28.0 pluggy==1.6.0 @@ -44,6 +46,7 @@ pydantic_core==2.33.2 Pygments==2.19.2 PyJWT==2.10.1 pymongo==4.15.1 +PyMuPDF==1.26.4 pypandoc==1.15 pytest==8.4.2 pytest-asyncio==1.2.0 @@ -72,4 +75,5 @@ watchdog==6.0.0 watchfiles==1.1.0 wcwidth==0.2.13 websockets==15.0.1 +wrapt==1.17.3 zipp==3.23.0 diff --git a/src/file-processor/app/api/dependencies.py b/src/file-processor/app/api/dependencies.py index e52a3de..e36fe1d 100644 --- a/src/file-processor/app/api/dependencies.py +++ b/src/file-processor/app/api/dependencies.py @@ -9,6 +9,7 @@ from app.database.connection import get_database from app.models.auth import UserRole from app.models.user import UserInDB from app.services.auth_service import AuthService +from app.services.document_service import DocumentService from app.services.user_service import UserService security = HTTPBearer() @@ -25,6 +26,12 @@ def get_user_service() -> UserService: return UserService(database) +def get_document_service() -> DocumentService: + """Dependency to get DocumentService instance.""" + database = get_database() + return DocumentService(database) + + def get_current_user( credentials: HTTPAuthorizationCredentials = Depends(security), user_service: UserService = Depends(get_user_service) @@ -79,7 +86,7 @@ def get_current_user( return user -def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB: +def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB: """ Dependency to ensure current user has admin role. diff --git a/src/file-processor/app/api/routes/document.py b/src/file-processor/app/api/routes/document.py new file mode 100644 index 0000000..f4df7d6 --- /dev/null +++ b/src/file-processor/app/api/routes/document.py @@ -0,0 +1,241 @@ +""" +Document API routes. + +This module provides REST endpoints for document management operations. +""" + +import logging +import os +from typing import List, Optional + +import fitz # PyMuPDF +from fastapi import APIRouter, Depends, HTTPException, Query, status, Path +from starlette.responses import Response + +from app.api.dependencies import get_document_service, get_current_user +from app.models.document import DocumentResponse, FileDocument +from app.services.document_service import DocumentService + +logger = logging.getLogger(__name__) + +router = APIRouter(tags=["Documents"]) + + +def _count_pdf_pages(pdf_file_path: str) -> int: + """ + Count the number of pages in a PDF file using PyMuPDF. + + Args: + pdf_file_path: Path to the PDF file + + Returns: + Number of pages in the PDF, or 0 if file cannot be read + """ + try: + with fitz.open(pdf_file_path) as doc: + return doc.page_count + except Exception as e: + logger.warning(f"Could not count pages for PDF {pdf_file_path}: {e}") + return 0 + + +def _build_object_url(file_hash: Optional[str]) -> Optional[str]: + """ + Build object URL from file hash. 
+ + Args: + file_hash: SHA256 hash of the file + + Returns: + URL string or None if hash is not provided + """ + if not file_hash: + return None + return f"/api/objects/{file_hash}" + + +def _extract_metadata_field(metadata: dict, field_name: str) -> List[str]: + """ + Extract a list field from metadata dictionary. + + Args: + metadata: Document metadata dictionary + field_name: Name of the field to extract + + Returns: + List of strings, empty list if field doesn't exist or is not a list + """ + field_value = metadata.get(field_name, []) + if isinstance(field_value, list): + return [str(item) for item in field_value] + return [] + + +def _map_file_document_to_response( + document: FileDocument, + document_service: DocumentService +) -> DocumentResponse: + """ + Map FileDocument to DocumentResponse format. + + Args: + document: FileDocument instance from database + document_service: Document service for file operations + + Returns: + DocumentResponse instance ready for API response + """ + # Calculate page count for PDF files + page_count = 0 + if document.pdf_file_hash and document_service.exists(document.pdf_file_hash): + pdf_path = document_service.get_document_path(document.pdf_file_hash) + page_count = _count_pdf_pages(pdf_path) + + # Build URLs + thumbnail_url = _build_object_url(document.thumbnail_file_hash) + pdf_url = _build_object_url(document.pdf_file_hash) + + # Extract tags and categories from metadata + tags = _extract_metadata_field(document.metadata, "tags") + categories = _extract_metadata_field(document.metadata, "categories") + + # Format created_at timestamp + created_at = document.detected_at.isoformat() if document.detected_at else "" + + as_dict = { + "id": str(document.id), + "name": document.filename, + "original_file_type": document.file_type.value.upper(), + "created_at": created_at, + "file_size": document.file_size, + "page_count": page_count, + "thumbnail_url": thumbnail_url, + "pdf_url": pdf_url, + "tags": tags, + "categories": categories + } + logger.info(f"Document: {as_dict}") + + return DocumentResponse(**as_dict) + + +@router.get("/documents", response_model=List[DocumentResponse]) +def list_documents( + skip: int = Query(0, ge=0, description="Number of documents to skip"), + limit: int = Query(100, ge=1, le=1000, description="Maximum number of documents to return"), + UserInDB=Depends(get_current_user), + document_service: DocumentService = Depends(get_document_service) +) -> List[DocumentResponse]: + """ + Retrieve a paginated list of documents. + + Args: + skip: Number of documents to skip for pagination + limit: Maximum number of documents to return + document_service: Document service instance + + Returns: + List of documents in API response format + + Raises: + HTTPException: If database operation fails + """ + try: + # Get documents from service + documents = document_service.list_documents(skip=skip, limit=limit) + + # Map to response format + document_responses = [ + _map_file_document_to_response(doc, document_service) + for doc in documents + ] + + return document_responses + + except Exception as e: + logger.error(f"Failed to list documents: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to retrieve documents" + ) + + +@router.get("/objects/{file_hash}") +async def get_object_by_hash( + file_hash: str = Path(..., description="SHA256 hash of the object to retrieve"), + document_service: DocumentService = Depends(get_document_service) +): + """ + Serve object content by its hash. 
+ + This endpoint serves files (original documents, PDFs, thumbnails) by their + SHA256 hash. It supports all file types stored in the objects folder. + + Args: + file_hash: SHA256 hash of the object + document_service: Document service dependency + + Returns: + FileResponse with the requested object content + + Raises: + HTTPException: If object not found (404) or server error (500) + """ + try: + # Check if object exists + if not document_service.exists(file_hash): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Object not found" + ) + + # Get file path + file_path = document_service.get_document_path(file_hash) + + # Verify file exists on disk + if not os.path.exists(file_path): + logger.error(f"Object {file_hash} registered but file not found at {file_path}") + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Object file not found on disk" + ) + + # Determine media type based on file content + try: + file_content = document_service.get_document_content_by_hash(file_hash) + if not file_content: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Object content not available" + ) + + # Detect MIME type + import magic + mime_type = magic.from_buffer(file_content, mime=True) + + # Return file content with appropriate headers + return Response( + content=file_content, + media_type=mime_type, + headers={ + "Content-Length": str(len(file_content)), + "Cache-Control": "public, max-age=3600" # Cache for 1 hour + } + ) + + except Exception as e: + logger.error(f"Error reading object content for hash {file_hash}: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to read object content" + ) + + except HTTPException: + # Re-raise HTTP exceptions as-is + raise + except Exception as e: + logger.error(f"Unexpected error serving object {file_hash}: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Internal server error while serving object" + ) diff --git a/src/file-processor/app/database/connection.py b/src/file-processor/app/database/connection.py index 48295cb..17ac617 100644 --- a/src/file-processor/app/database/connection.py +++ b/src/file-processor/app/database/connection.py @@ -4,7 +4,7 @@ MongoDB database connection management. This module handles MongoDB connection with fail-fast approach. The application will terminate if MongoDB is not accessible at startup. """ - +import logging import sys from typing import Optional @@ -13,11 +13,14 @@ from pymongo.database import Database from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError from app.config.settings import get_mongodb_url, get_mongodb_database_name +from app.utils.security import safe_connection_string # Global variables for singleton pattern _client: Optional[MongoClient] = None _database: Optional[Database] = None +logger = logging.getLogger(__name__) + def create_mongodb_client() -> MongoClient: """ @@ -43,16 +46,16 @@ def create_mongodb_client() -> MongoClient: # Test connection by running admin command client.admin.command('ping') - print(f"Successfully connected to MongoDB at {mongodb_url}") + logger.info(f"Successfully connected to MongoDB at {safe_connection_string(mongodb_url)}") return client except (ConnectionFailure, ServerSelectionTimeoutError) as e: - print(f"ERROR: Failed to connect to MongoDB at {mongodb_url}") - print(f"Connection error: {str(e)}") - print("MongoDB is required for this application. 
Please ensure MongoDB is running and accessible.") + logger.error(f"ERROR: Failed to connect to MongoDB at {safe_connection_string(mongodb_url)}") + logger.error(f"Connection error: {str(e)}") + logger.error("MongoDB is required for this application. Please ensure MongoDB is running and accessible.") sys.exit(1) except Exception as e: - print(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}") + logger.error(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}") sys.exit(1) @@ -74,7 +77,7 @@ def get_database() -> Database: database_name = get_mongodb_database_name() _database = _client[database_name] - print(f"Connected to database: {database_name}") + logger.info(f"Connected to database: {database_name}") return _database @@ -92,7 +95,7 @@ def close_database_connection(): _client.close() _client = None _database = None - print("MongoDB connection closed") + logger.info("MongoDB connection closed") def get_mongodb_client() -> Optional[MongoClient]: diff --git a/src/file-processor/app/main.py b/src/file-processor/app/main.py index b35825a..c348af1 100644 --- a/src/file-processor/app/main.py +++ b/src/file-processor/app/main.py @@ -17,6 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware from app.api.routes.auth import router as auth_router from app.api.routes.users import router as users_router +from app.api.routes.document import router as documents_router from app.config import settings from app.database.connection import get_database from app.file_watcher import create_file_watcher, FileWatcher @@ -111,7 +112,7 @@ app.add_middleware( # Include routers app.include_router(auth_router, prefix="/auth", tags=["Authentication"]) app.include_router(users_router, prefix="/users", tags=["User Management"]) -# app.include_router(documents_router, prefix="/documents", tags=["Documents"]) +app.include_router(documents_router, prefix="/api", tags=["Documents"]) # app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"]) diff --git a/src/file-processor/app/models/document.py b/src/file-processor/app/models/document.py index 105bffd..f49d779 100644 --- a/src/file-processor/app/models/document.py +++ b/src/file-processor/app/models/document.py @@ -7,10 +7,9 @@ stored in MongoDB collections. from datetime import datetime from enum import Enum -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional -from bson import ObjectId -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, ConfigDict from app.models.types import PyObjectId @@ -50,6 +49,7 @@ class FileDocument(BaseModel): detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected") file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content") pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content") + thumbnail_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the thumbnail") encoding: str = Field(default="utf-8", description="Character encoding for text files") file_size: int = Field(..., ge=0, description="File size in bytes") mime_type: str = Field(..., description="MIME type detected") @@ -69,3 +69,28 @@ class FileDocument(BaseModel): if not v.strip(): raise ValueError("Filename cannot be empty") return v.strip() + + +class DocumentResponse(BaseModel): + """ + Response model for document API endpoints. 
+ + Represents a document in the format expected by the frontend application. + Field names are automatically converted from snake_case to camelCase. + """ + + model_config = ConfigDict(alias_generator=lambda field_name: ''.join( + word.capitalize() if i > 0 else word + for i, word in enumerate(field_name.split('_')) + ), populate_by_name=True) + + id: str = Field(..., description="Document unique identifier") + name: str = Field(..., description="Document filename") + original_file_type: str = Field(..., description="Original file type before conversion") + created_at: str = Field(..., description="ISO timestamp when document was created") + file_size: int = Field(..., description="File size in bytes") + page_count: int = Field(..., description="Number of pages in the document") + thumbnail_url: Optional[str] = Field(default=None, description="URL to document thumbnail") + pdf_url: Optional[str] = Field(default=None, description="URL to PDF version of document") + tags: List[str] = Field(default_factory=list, description="Document tags") + categories: List[str] = Field(default_factory=list, description="Document categories") diff --git a/src/file-processor/app/models/job.py b/src/file-processor/app/models/job.py index 3e1be5e..af10f06 100644 --- a/src/file-processor/app/models/job.py +++ b/src/file-processor/app/models/job.py @@ -16,6 +16,7 @@ class ProcessingStatus(str, Enum): COMPLETED = "completed" SAVING_OBJECT = "saving_object" SAVING_PDF = "saving_pdf" + CREATING_THUMBNAIL = "creating_thumbnail" FAILED = "failed" diff --git a/src/file-processor/app/services/document_service.py b/src/file-processor/app/services/document_service.py index f20dc90..a3b5364 100644 --- a/src/file-processor/app/services/document_service.py +++ b/src/file-processor/app/services/document_service.py @@ -24,10 +24,22 @@ from app.models.document import ( ) from app.models.types import PyObjectId from app.utils.pdf_converter import convert_to_pdf +from app.utils.pdf_thumbmail import PDFThumbnailGenerator +from app.utils.security import generate_uuid_filename logger = logging.getLogger(__name__) +class DocumentAlreadyExists(Exception): + def __init__(self, message): + self.message = message + + +class DocumentProcessingError(Exception): + def __init__(self, message): + self.message = message + + class DocumentService: """ Service for orchestrated document and content management. 
@@ -162,7 +174,7 @@ class DocumentService: # Increment counter for next attempt counter += 1 - def _get_document_path(self, file_hash): + def get_document_path(self, file_hash): """ :param file_hash: @@ -171,10 +183,12 @@ class DocumentService: return os.path.join(self.objects_folder, file_hash[:24], file_hash) def exists(self, file_hash): - return os.path.exists(self._get_document_path(file_hash)) + if file_hash is None: + return False + return os.path.exists(self.get_document_path(file_hash)) def save_content_if_needed(self, file_hash, content: bytes): - target_path = self._get_document_path(file_hash) + target_path = self.get_document_path(file_hash) if os.path.exists(target_path): return @@ -192,7 +206,8 @@ class DocumentService: def move_to_ignored(self, file_path, reason="Unknown"): logger.info(f"Moving file {file_path} to ignored folder") - ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path) + ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename( + file_path) ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name)) shutil.move(file_path, ignored_file_path) @@ -231,15 +246,16 @@ class DocumentService: detected_at = datetime.now() try: - logger.info(f"Creating Document for {file_path}") + logger.info(f'Creating Document for "{file_path}"') # Skip the document if it already exists same_document = self.document_repository.find_same_document(filename, file_hash) if same_document is not None: logger.info(f" Document with same hash already exists. Skipping...") self.move_to_ignored(file_path, f"already exists ({same_document.id})") + raise DocumentAlreadyExists(f"Document with same hash already exists ({same_document.id})") self.save_content_if_needed(file_hash, file_bytes) - logger.info(f" Saved content to {self._get_document_path(file_hash)}") + logger.info(f" Saved content to {self.get_document_path(file_hash)}") # Create FileDocument file_data = FileDocument( @@ -255,11 +271,13 @@ class DocumentService: mime_type=mime_type ) - created_file = self.document_repository.create_document(file_data) - logger.info(f" Created document with id '{created_file.id}'") + created_document = self.document_repository.create_document(file_data) + logger.info(f" Created document with id '{created_document.id}'") - return created_file + return created_document + except DocumentAlreadyExists as e: + raise e except Exception as e: # Transaction will automatically rollback if supported raise PyMongoError(f"Failed to create document: {str(e)}") @@ -273,40 +291,69 @@ class DocumentService: document = self.get_document_by_id(document_id) if document is None: logger.error(f" Document not found") - raise ValueError(f"Document {document_id} not found") + raise DocumentProcessingError(f"Document {document_id} not found.") # try to find another document that has the same hash document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash) # the pdf will be created only if it does not exist yet - if (document_with_same_hash is not None and - document_with_same_hash.pdf_file_hash and - self.exists(document_with_same_hash.pdf_file_hash)): - logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}") + if document_with_same_hash and self.exists(document_with_same_hash.pdf_file_hash): + logger.info(f'Found document with same hash. 
Will use pdf "{document_with_same_hash.pdf_file_hash}".') self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash}) - return True + return # get the content of the file - logger.info(f" No document with same hash found and valid pdf found. Will create new pdf") + logger.info(f" No document with same hash and valid pdf found. Will create new pdf content.") file_bytes = self.get_document_content_by_hash(document.file_hash) if file_bytes is None: - logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.") - return False + logger.error(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".') + raise DocumentProcessingError(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".') # create the pdf file - temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder) + temp_pdf_file = convert_to_pdf(self.get_document_path(document.file_hash), self.temp_folder) pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file)) self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file)) - logger.info(f" Created new pdf file with hash {pdf_file_hash}") - - # remove the temporary file - os.remove(temp_pdf_file) - logger.info(f" Removed temporary pdf file {temp_pdf_file}") + os.remove(temp_pdf_file) # remove the temporary file + logger.info(f' Created new pdf file with hash "{pdf_file_hash}"') # update the document self.update_document(document_id, {"pdf_file_hash": pdf_file_hash}) + + def create_thumbnail(self, document_id: PyObjectId): + logger.info(f'Creating thumbnail document for "{document_id}"') + document = self.get_document_by_id(document_id) + if document is None: + logger.error(f" Document not found !") + raise DocumentProcessingError(f"Document {document_id} not found.") - return True + # try to find another document that has the same hash + document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash) + + # We will use the thumbnail of the pdf if it exists + if document_with_same_hash and self.exists(document_with_same_hash.thumbnail_file_hash): + logger.info(f" Found document with same hash. Will use thumbnail {document_with_same_hash.thumbnail_file_hash}") + self.update_document(document_id, {"thumbnail_file_hash": document_with_same_hash.thumbnail_file_hash}) + return + + logger.info(f" No document with same hash and valid thumbnail found. 
Will create new thumbnail") + + if not self.exists(document.pdf_file_hash): + logger.error(f" PDF file not found.") + raise DocumentProcessingError(f"PDF file for document {document_id} not found") + + tmp_thumbnail_path = os.path.join(self.temp_folder, f"{generate_uuid_filename()}.png") + with PDFThumbnailGenerator(self.get_document_path(document.pdf_file_hash)) as gen: + # create the thumbnail + gen.create_thumbnail(tmp_thumbnail_path, page_num=0, width=200) + thumbnail_file_hash = self._calculate_file_hash(self._read_file_bytes(tmp_thumbnail_path)) + + # save the thumbnail to the objects folder + self.save_content_if_needed(thumbnail_file_hash, self._read_file_bytes(tmp_thumbnail_path)) + os.remove(tmp_thumbnail_path) + + # update the document + self.update_document(document_id, {"thumbnail_file_hash": thumbnail_file_hash}) + logger.info(f" Created thumbnail {thumbnail_file_hash}") def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]: """ @@ -348,7 +395,7 @@ class DocumentService: return self.document_repository.find_document_by_filepath(filepath) def get_document_content_by_hash(self, file_hash): - target_path = self._get_document_path(file_hash) + target_path = self.get_document_path(file_hash) if not os.path.exists(target_path): return None @@ -439,7 +486,7 @@ class DocumentService: # If no other files reference this content, delete it if not remaining_files: try: - os.remove(self._get_document_path(document.file_hash)) + os.remove(self.get_document_path(document.file_hash)) except Exception: pass diff --git a/src/file-processor/app/utils/pdf_annotation.py b/src/file-processor/app/utils/pdf_annotation.py new file mode 100644 index 0000000..f5a83a5 --- /dev/null +++ b/src/file-processor/app/utils/pdf_annotation.py @@ -0,0 +1,241 @@ +import fitz # PyMuPDF + + +class PDFAnnotator: + def __init__(self, pdf_path): + self.doc = fitz.open(pdf_path) + + def add_highlight(self, rect, page_num=0, color=(1, 1, 0)): + """ + Add highlight annotation + + Args: + rect: (x0, y0, x1, y1) coordinates or fitz.Rect object + page_num: Page number (0-indexed), default first page + color: RGB tuple (0-1 range), default yellow + """ + page = self.doc[page_num] + annot = page.add_highlight_annot(rect) + annot.set_colors(stroke=color) + annot.update() + return annot + + def add_rectangle(self, rect, page_num=0, color=(1, 0, 0), width=2): + """ + Add rectangle annotation (border only) + + Args: + rect: (x0, y0, x1, y1) coordinates or fitz.Rect object + page_num: Page number (0-indexed), default first page + color: RGB tuple (0-1 range), default red + width: Line width in points + """ + page = self.doc[page_num] + annot = page.add_rect_annot(rect) + annot.set_colors(stroke=color) + annot.set_border(width=width) + annot.update() + return annot + + def add_text_note(self, point, text, page_num=0, icon="Note"): + """ + Add sticky note annotation + + Args: + point: (x, y) position tuple + text: Note content string + page_num: Page number (0-indexed), default first page + icon: "Note", "Comment", "Help", "Insert", "Key", etc. 
+ """ + page = self.doc[page_num] + annot = page.add_text_annot(point, text, icon=icon) + annot.update() + return annot + + def add_free_text(self, rect, text, page_num=0, fontsize=12, + color=(0, 0, 0)): + """ + Add free text annotation (visible text box) + + Args: + rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect + text: Text content string + page_num: Page number (0-indexed), default first page + fontsize: Font size in points + color: Text color RGB tuple (0-1 range) + """ + page = self.doc[page_num] + annot = page.add_freetext_annot( + rect, + text, + fontsize=fontsize, + text_color=color + ) + annot.update() + return annot + + def add_arrow(self, start_point, end_point, page_num=0, + color=(1, 0, 0), width=2): + """ + Add arrow annotation + + Args: + start_point: (x, y) tuple for arrow start + end_point: (x, y) tuple for arrow end + page_num: Page number (0-indexed), default first page + color: Arrow color RGB tuple (0-1 range), default red + width: Line width in points + """ + page = self.doc[page_num] + annot = page.add_line_annot(start_point, end_point) + annot.set_colors(stroke=color) + annot.set_border(width=width) + # Set arrow at end - use integer constant + annot.set_line_ends(0, 1) # 1 = ClosedArrow + annot.update() + return annot + + def add_stamp(self, rect, page_num=0, stamp_type=0): + """ + Add stamp annotation + + Args: + rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect + page_num: Page number (0-indexed), default first page + stamp_type: Integer for stamp type: + 0=Approved, 1=AsIs, 2=Confidential, + 3=Departmental, 4=Draft, 5=Experimental, + 6=Expired, 7=Final, 8=ForComment, + 9=ForPublicRelease, 10=NotApproved, etc. + """ + page = self.doc[page_num] + annot = page.add_stamp_annot(rect, stamp=stamp_type) + annot.update() + return annot + + def add_redaction(self, rect, page_num=0, fill_color=(0, 0, 0)): + """ + Add redaction annotation (marks area for redaction) + Note: Use apply_redactions() to permanently remove content + + Args: + rect: (x0, y0, x1, y1) area to redact, tuple or fitz.Rect + page_num: Page number (0-indexed), default first page + fill_color: RGB tuple (0-1 range) for redacted area, default black + """ + page = self.doc[page_num] + annot = page.add_redact_annot(rect, fill=fill_color) + annot.update() + return annot + + def apply_redactions(self, page_num=0, images=2, graphics=2, text=2): + """ + Apply all redaction annotations on a page (permanent removal) + + Args: + page_num: Page number (0-indexed), default first page + images: 2=remove, 1=blank, 0=ignore + graphics: 2=remove, 1=blank, 0=ignore + text: 2=remove, 1=blank, 0=ignore + + Returns: + True if redactions were applied, False otherwise + """ + page = self.doc[page_num] + # Check if page has redaction annotations + has_redactions = any(annot.type[0] == 12 for annot in page.annots()) + + if has_redactions: + page.apply_redactions(images=images, graphics=graphics, text=text) + return True + return False + + def get_all_annotations(self, page_num=0): + """ + Retrieve all annotations from a page + + Args: + page_num: Page number (0-indexed), default first page + + Returns: + List of dicts with annotation information + """ + page = self.doc[page_num] + annotations = [] + + for annot in page.annots(): + info = { + 'type': annot.type[1], # Annotation type name + 'rect': annot.rect, + 'content': annot.info.get('content', ''), + 'author': annot.info.get('title', ''), + 'created': annot.info.get('creationDate', ''), + 'colors': annot.colors + } + annotations.append(info) + + return 
annotations + + def remove_all_annotations(self, page_num=0): + """ + Remove all annotations from a page + + Args: + page_num: Page number (0-indexed), default first page + """ + page = self.doc[page_num] + for annot in page.annots(): + page.delete_annot(annot) + + def save(self, output_path): + """Save the annotated PDF""" + self.doc.save(output_path) + + def close(self): + self.doc.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +# Example usage +if __name__ == "__main__": + with PDFAnnotator("input.pdf") as annotator: + # Add yellow highlight + annotator.add_highlight((100, 100, 300, 120), page_num=0, + color=(1, 1, 0)) + + # Add red rectangle border + annotator.add_rectangle((100, 150, 300, 250), page_num=0, + color=(1, 0, 0), width=3) + + # Add sticky note + annotator.add_text_note((400, 100), "This is important!", + page_num=0, icon="Comment") + + # Add visible text box + annotator.add_free_text((100, 300, 400, 350), "DRAFT VERSION", + page_num=0, fontsize=20, color=(1, 0, 0)) + + # Add arrow pointing to something + annotator.add_arrow((450, 100), (500, 200), page_num=0, + color=(0, 0, 1), width=2) + + # Add "Approved" stamp + annotator.add_stamp((450, 300, 550, 350), page_num=0, stamp_type=0) + + # Add redaction (black box over sensitive info) + annotator.add_redaction((100, 400, 300, 420), page_num=0) + annotator.apply_redactions(page_num=0) + + # List all annotations + annots = annotator.get_all_annotations(page_num=0) + print(f"Found {len(annots)} annotations:") + for a in annots: + print(f" - {a['type']} at {a['rect']}") + + # Save annotated PDF + annotator.save("output_annotated.pdf") \ No newline at end of file diff --git a/src/file-processor/app/utils/pdf_converter.py b/src/file-processor/app/utils/pdf_converter.py index 2012f56..7232fa0 100644 --- a/src/file-processor/app/utils/pdf_converter.py +++ b/src/file-processor/app/utils/pdf_converter.py @@ -127,6 +127,15 @@ class TextToPdfConverter(BaseConverter): return self +class PdfToPdfConverter(BaseConverter): + """Converter for PDF files to PDF.""" + + def convert(self) -> Self: + # copy self.input_path to self.output_path + os.system(f"cp {self.input_path} {self.output_path}") + return self + + class ImageToPdfConverter(BaseConverter): """Converter for image files to PDF.""" @@ -191,6 +200,8 @@ def convert_to_pdf(filepath: str, output_dir: str = ".") -> str: converter = ImageToPdfConverter(filepath, output_dir=output_dir) elif file_type == "word": converter = WordToPdfConverter(filepath, output_dir=output_dir) + elif file_type == "pdf": + converter = PdfToPdfConverter(filepath, output_dir=output_dir) else: raise ValueError(f"Unsupported file type: {file_type}") diff --git a/src/file-processor/app/utils/pdf_thumbmail.py b/src/file-processor/app/utils/pdf_thumbmail.py new file mode 100644 index 0000000..e3a2b0e --- /dev/null +++ b/src/file-processor/app/utils/pdf_thumbmail.py @@ -0,0 +1,167 @@ +from pathlib import Path + +import fitz # PyMuPDF + + +class PDFThumbnailGenerator: + def __init__(self, pdf_path): + """ + Initialize PDF thumbnail generator + + Args: + pdf_path: Path to the PDF file (string or Path object) + """ + self.pdf_path = pdf_path + self.doc = fitz.open(pdf_path) + + def create_thumbnail(self, output_path, page_num=0, width=200, rotation=0, zoom_factor=1.0): + """ + Create a thumbnail with zoom and rotation + + Args: + output_path: Path to save the thumbnail (string or Path) + page_num: Page number (0-indexed), default first page + width: 
Desired width in pixels, default 200 + rotation: Rotation angle in degrees (0, 90, 180, 270), default 0 + zoom_factor: Additional zoom multiplier (1.0 = normal, 2.0 = 2x), default 1.0 + + Returns: + Dict with thumbnail info (width, height, rotation, zoom) + """ + page = self.doc[page_num] + + # Apply rotation to page + page.set_rotation(rotation) + + # Calculate zoom to achieve desired width + base_zoom = width / page.rect.width + final_zoom = base_zoom * zoom_factor + + # Create transformation matrix + mat = fitz.Matrix(final_zoom, final_zoom) + + # Render page to pixmap + pix = page.get_pixmap(matrix=mat, alpha=False) + + # Save thumbnail + pix.save(output_path) + + return { + 'width': pix.width, + 'height': pix.height, + 'rotation': rotation, + 'zoom': zoom_factor + } + + def create_cropped_thumbnail(self, output_path, crop_rect=None, page_num=0, width=200): + """ + Create a thumbnail of a specific region (zoom on area) + + Args: + output_path: Path to save the thumbnail (string or Path) + crop_rect: Tuple (x0, y0, x1, y1) in PDF coordinates for cropping, + or None for full page, default None + page_num: Page number (0-indexed), default first page + width: Desired width in pixels, default 200 + + Returns: + Tuple (width, height) of the generated thumbnail + """ + page = self.doc[page_num] + + if crop_rect: + # Create rectangle for cropping + rect = fitz.Rect(crop_rect) + zoom = width / rect.width + else: + rect = page.rect + zoom = width / page.rect.width + + mat = fitz.Matrix(zoom, zoom) + + # Render only the specified rectangle + pix = page.get_pixmap(matrix=mat, clip=rect) + pix.save(output_path) + + return pix.width, pix.height + + def get_page_info(self, page_num=0): + """ + Get information about a specific page + + Args: + page_num: Page number (0-indexed), default first page + + Returns: + Dict with page information (width, height, rotation, number, total_pages) + """ + page = self.doc[page_num] + return { + 'width': page.rect.width, + 'height': page.rect.height, + 'rotation': page.rotation, + 'number': page_num + 1, + 'total_pages': len(self.doc) + } + + def create_multi_resolution_thumbnails(self, output_folder, page_num=0, sizes=(150, 300, 600)): + """ + Create multiple thumbnails at different resolutions + + Args: + output_folder: Folder path to save thumbnails (string or Path) + page_num: Page number (0-indexed), default first page + sizes: List of widths in pixels, default [150, 300, 600] + + Returns: + Dict mapping each size to thumbnail info + """ + output_folder = Path(output_folder) + output_folder.mkdir(exist_ok=True, parents=True) + + results = {} + for size in sizes: + output_path = output_folder / f"thumb_{size}px.png" + info = self.create_thumbnail(output_path, page_num=page_num, width=size) + results[size] = info + + return results + + def close(self): + """Close the PDF document and free resources""" + self.doc.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +# Example usage +if __name__ == "__main__": + # Basic usage with context manager + with PDFThumbnailGenerator("example.pdf") as gen: + # Standard thumbnail + gen.create_thumbnail("thumb_standard.png", page_num=0, width=200) + + # Rotated thumbnail + gen.create_thumbnail("thumb_rotated.png", page_num=0, + width=200, rotation=90) + + # Zoomed thumbnail (2x zoom) + gen.create_thumbnail("thumb_zoomed.png", page_num=0, + width=200, zoom_factor=2.0) + + # Cropped/zoomed on specific area (x0, y0, x1, y1) + 
gen.create_cropped_thumbnail("thumb_crop.png", + crop_rect=(100, 100, 400, 400), + page_num=0, width=300) + + # Multiple resolutions + gen.create_multi_resolution_thumbnails("thumbnails/", page_num=0, + sizes=[150, 300, 600]) + + # Get page information + info = gen.get_page_info(page_num=0) + print(f"Page info: {info}") diff --git a/src/file-processor/app/utils/security.py b/src/file-processor/app/utils/security.py index deda634..6a7d70f 100644 --- a/src/file-processor/app/utils/security.py +++ b/src/file-processor/app/utils/security.py @@ -4,9 +4,10 @@ Password security utilities using bcrypt for secure password hashing. This module provides secure password hashing and verification functions using the bcrypt algorithm with automatic salt generation. """ +import re +import uuid import bcrypt -from typing import Union def hash_password(password: str) -> str: @@ -71,4 +72,33 @@ def verify_password(password: str, hashed_password: str) -> bool: # bcrypt raises ValueError for malformed hashes raise RuntimeError(f"Invalid hash format: {str(e)}") except Exception as e: - raise RuntimeError(f"Failed to verify password: {str(e)}") \ No newline at end of file + raise RuntimeError(f"Failed to verify password: {str(e)}") + + +def generate_uuid_filename() -> str: + """Generate a unique filename using UUID4.""" + return str(uuid.uuid4()) + + +def safe_connection_string(connection_string: str) -> str: + """ + Mask the password in a MongoDB connection string. + + Args: + connection_string (str): The complete MongoDB connection string + + Returns: + str: The connection string with password replaced by asterisks + + Example: + >>> mask_mongodb_password("mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin") + "mongodb://admin:***@mongodb:27017/mydocmanager?authSource=admin" + """ + # Pattern to detect password in MongoDB URL + # Format: mongodb://username:password@host:port/database + pattern = r'(mongodb://[^:]+:)([^@]+)(@.*)' + + # Replace password with asterisks + masked_string = re.sub(pattern, r'\1*****\3', connection_string) + + return masked_string diff --git a/src/file-processor/requirements.txt b/src/file-processor/requirements.txt index 8a69627..2d1c86c 100644 --- a/src/file-processor/requirements.txt +++ b/src/file-processor/requirements.txt @@ -10,6 +10,7 @@ pillow==11.3.0 pydantic==2.11.9 PyJWT==2.10.1 pymongo==4.15.0 +PyMuPDF==1.26.4 pypandoc==1.15 python-multipart==0.0.20 redis==6.4.0 diff --git a/src/frontend/src/components/common/Menu.jsx b/src/frontend/src/components/common/Menu.jsx index fa45f33..3f67f42 100644 --- a/src/frontend/src/components/common/Menu.jsx +++ b/src/frontend/src/components/common/Menu.jsx @@ -1,11 +1,13 @@ import {FaBuffer, FaPlus} from "react-icons/fa6"; +import { Link } from "react-router-dom"; const Menu = () => { return (
diff --git a/src/frontend/src/components/documents/DocumentCard.jsx b/src/frontend/src/components/documents/DocumentCard.jsx index 426fafe..cd97e09 100644 --- a/src/frontend/src/components/documents/DocumentCard.jsx +++ b/src/frontend/src/components/documents/DocumentCard.jsx @@ -64,8 +64,8 @@ const DocumentCard = memo(({ document, viewMode, onEdit, onDelete }) => { const renderThumbnail = () => (
{`${name} new Promise(resolve => setTimeout(resolve, ms)); /** - * Fetches all documents + * Fetches all documents from the API * @returns {Promise} Array of document objects */ export const getAllDocuments = async () => { - await delay(500); // Simulate network latency - return [...mockDocuments]; + try { + const response = await api.get('/api/documents'); + return response.data; + } catch (error) { + console.error('Failed to fetch documents:', error); + // Fallback to mock data in case of API error during development + console.warn('Falling back to mock data'); + } }; /** diff --git a/src/worker/requirements.txt b/src/worker/requirements.txt index 8a69627..2d1c86c 100644 --- a/src/worker/requirements.txt +++ b/src/worker/requirements.txt @@ -10,6 +10,7 @@ pillow==11.3.0 pydantic==2.11.9 PyJWT==2.10.1 pymongo==4.15.0 +PyMuPDF==1.26.4 pypandoc==1.15 python-multipart==0.0.20 redis==6.4.0 diff --git a/src/worker/tasks/document_processing.py b/src/worker/tasks/document_processing.py index b19d5fd..1bffd4c 100644 --- a/src/worker/tasks/document_processing.py +++ b/src/worker/tasks/document_processing.py @@ -12,7 +12,7 @@ from typing import Any, Dict from app.config import settings from app.database.connection import get_database from app.models.job import ProcessingStatus -from app.services.document_service import DocumentService +from app.services.document_service import DocumentService, DocumentAlreadyExists from app.services.job_service import JobService from tasks.main import celery_app @@ -26,7 +26,7 @@ def get_services(): return document_service, job_service -#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) +# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60}) @celery_app.task(bind=True) def process_document(self, filepath: str) -> Dict[str, Any]: """ @@ -48,7 +48,7 @@ def process_document(self, filepath: str) -> Dict[str, Any]: Exception: Any processing error (will trigger retry) """ task_id = self.request.id - logger.info(f"Starting document processing task {task_id} for file: {filepath}") + logger.info(f'Task {task_id} : Starting document processing for file: "{filepath}"') # get services document_service, job_service = get_services() @@ -60,12 +60,16 @@ def process_document(self, filepath: str) -> Dict[str, Any]: document = document_service.create_document(filepath) job = job_service.create_job(task_id=task_id, document_id=document.id) job_service.mark_job_as_started(job_id=job.id) - logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}") + logger.info(f'Task {task_id} : Created document "{document.id}". 
Started job "{job.id}"') logger.info(f"Task {task_id} : Creating associated PDF") job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF) document_service.create_pdf(document.id) + logger.info(f"Task {task_id} : Creating thumbnail") + job_service.update_job_status(job_id=job.id, status=ProcessingStatus.CREATING_THUMBNAIL) + document_service.create_thumbnail(document.id) + # remove the file from the watch folder os.remove(filepath) @@ -79,6 +83,19 @@ def process_document(self, filepath: str) -> Dict[str, Any]: "status": "completed", } + except DocumentAlreadyExists as e: + logger.info(f"Task {task_id} completed: {str(e)}") + if job is not None: + job_service.mark_job_as_completed(job_id=job.id) + logger.info(f"Job {task_id} marked as COMPLETED") + + return { + "task_id": task_id, + "filepath": filepath, + "status": "completed", + "message": str(e), + } + except Exception as e: error_message = f"Document processing failed: {str(e)}" logger.error(f"Task {task_id} failed: {error_message}") diff --git a/tests/services/test_document_service.py b/tests/services/test_document_service.py index 80fb157..0873b3e 100644 --- a/tests/services/test_document_service.py +++ b/tests/services/test_document_service.py @@ -618,7 +618,7 @@ class TestCreatePdf: assert updated_doc.pdf_file_hash == pdf_hash # Verify convert_to_pdf was called with correct arguments - doc_path = document_service._get_document_path(created_doc.file_hash) + doc_path = document_service.get_document_path(created_doc.file_hash) mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder) # Verify content exists on disk @@ -694,7 +694,7 @@ class TestCreatePdf: ) # Simulate missing content by removing file - file_path = document_service._get_document_path(created_doc.file_hash) + file_path = document_service.get_document_path(created_doc.file_hash) os.remove(file_path) # Execute