Thumbnails generated and displayed in the front end
This commit is contained in:
@@ -13,6 +13,7 @@ click-didyoumean==0.3.1
|
||||
click-plugins==1.1.1.2
|
||||
click-repl==0.3.0
|
||||
cryptography==46.0.1
|
||||
Deprecated==1.2.18
|
||||
dnspython==2.8.0
|
||||
ecdsa==0.19.1
|
||||
email-validator==2.3.0
|
||||
@@ -32,6 +33,7 @@ mongomock==4.3.0
|
||||
mongomock-motor==0.0.36
|
||||
motor==3.7.1
|
||||
packaging==25.0
|
||||
pikepdf==9.11.0
|
||||
pillow==11.3.0
|
||||
pipdeptree==2.28.0
|
||||
pluggy==1.6.0
|
||||
@@ -44,6 +46,7 @@ pydantic_core==2.33.2
|
||||
Pygments==2.19.2
|
||||
PyJWT==2.10.1
|
||||
pymongo==4.15.1
|
||||
PyMuPDF==1.26.4
|
||||
pypandoc==1.15
|
||||
pytest==8.4.2
|
||||
pytest-asyncio==1.2.0
|
||||
@@ -72,4 +75,5 @@ watchdog==6.0.0
|
||||
watchfiles==1.1.0
|
||||
wcwidth==0.2.13
|
||||
websockets==15.0.1
|
||||
wrapt==1.17.3
|
||||
zipp==3.23.0
|
||||
|
||||
@@ -9,6 +9,7 @@ from app.database.connection import get_database
|
||||
from app.models.auth import UserRole
|
||||
from app.models.user import UserInDB
|
||||
from app.services.auth_service import AuthService
|
||||
from app.services.document_service import DocumentService
|
||||
from app.services.user_service import UserService
|
||||
|
||||
security = HTTPBearer()
|
||||
@@ -25,6 +26,12 @@ def get_user_service() -> UserService:
|
||||
return UserService(database)
|
||||
|
||||
|
||||
def get_document_service() -> DocumentService:
    """Dependency that builds a DocumentService bound to the active database."""
    return DocumentService(get_database())
|
||||
|
||||
|
||||
def get_current_user(
|
||||
credentials: HTTPAuthorizationCredentials = Depends(security),
|
||||
user_service: UserService = Depends(get_user_service)
|
||||
@@ -79,7 +86,7 @@ def get_current_user(
|
||||
return user
|
||||
|
||||
|
||||
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
|
||||
def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
|
||||
"""
|
||||
Dependency to ensure current user has admin role.
|
||||
|
||||
|
||||
241
src/file-processor/app/api/routes/document.py
Normal file
241
src/file-processor/app/api/routes/document.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
Document API routes.
|
||||
|
||||
This module provides REST endpoints for document management operations.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, status, Path
|
||||
from starlette.responses import Response
|
||||
|
||||
from app.api.dependencies import get_document_service, get_current_user
|
||||
from app.models.document import DocumentResponse, FileDocument
|
||||
from app.services.document_service import DocumentService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(tags=["Documents"])
|
||||
|
||||
|
||||
def _count_pdf_pages(pdf_file_path: str) -> int:
    """
    Return the number of pages in a PDF file.

    The file is opened with PyMuPDF; any failure (missing file, corrupt
    PDF) is logged as a warning and reported as zero pages instead of
    being raised to the caller.

    Args:
        pdf_file_path: Filesystem path of the PDF to inspect.

    Returns:
        The page count, or 0 when the file cannot be opened or read.
    """
    try:
        with fitz.open(pdf_file_path) as pdf_document:
            return pdf_document.page_count
    except Exception as e:
        logger.warning(f"Could not count pages for PDF {pdf_file_path}: {e}")
        return 0
|
||||
|
||||
|
||||
def _build_object_url(file_hash: Optional[str]) -> Optional[str]:
|
||||
"""
|
||||
Build object URL from file hash.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of the file
|
||||
|
||||
Returns:
|
||||
URL string or None if hash is not provided
|
||||
"""
|
||||
if not file_hash:
|
||||
return None
|
||||
return f"/api/objects/{file_hash}"
|
||||
|
||||
|
||||
def _extract_metadata_field(metadata: dict, field_name: str) -> List[str]:
|
||||
"""
|
||||
Extract a list field from metadata dictionary.
|
||||
|
||||
Args:
|
||||
metadata: Document metadata dictionary
|
||||
field_name: Name of the field to extract
|
||||
|
||||
Returns:
|
||||
List of strings, empty list if field doesn't exist or is not a list
|
||||
"""
|
||||
field_value = metadata.get(field_name, [])
|
||||
if isinstance(field_value, list):
|
||||
return [str(item) for item in field_value]
|
||||
return []
|
||||
|
||||
|
||||
def _map_file_document_to_response(
    document: FileDocument,
    document_service: DocumentService
) -> DocumentResponse:
    """
    Map a FileDocument database record to the API response model.

    Computes the PDF page count (when a converted PDF exists on disk),
    builds object URLs for the thumbnail and PDF renditions, and extracts
    list fields from the document metadata.

    Args:
        document: FileDocument instance loaded from the database.
        document_service: Service used to locate stored file content.

    Returns:
        DocumentResponse ready to be serialized for the frontend.
    """
    # Page count is only meaningful when a converted PDF is stored on disk.
    page_count = 0
    if document.pdf_file_hash and document_service.exists(document.pdf_file_hash):
        pdf_path = document_service.get_document_path(document.pdf_file_hash)
        page_count = _count_pdf_pages(pdf_path)

    # Build URLs for the stored renditions (None when the hash is missing).
    thumbnail_url = _build_object_url(document.thumbnail_file_hash)
    pdf_url = _build_object_url(document.pdf_file_hash)

    # Extract tags and categories from metadata.
    tags = _extract_metadata_field(document.metadata, "tags")
    categories = _extract_metadata_field(document.metadata, "categories")

    # detected_at may be unset for legacy records; fall back to empty string.
    created_at = document.detected_at.isoformat() if document.detected_at else ""

    as_dict = {
        "id": str(document.id),
        "name": document.filename,
        "original_file_type": document.file_type.value.upper(),
        "created_at": created_at,
        "file_size": document.file_size,
        "page_count": page_count,
        "thumbnail_url": thumbnail_url,
        "pdf_url": pdf_url,
        "tags": tags,
        "categories": categories
    }
    # FIX: previously logged the full payload at INFO via an eager f-string,
    # flooding logs (one entry per document per listing). Use DEBUG with lazy
    # %-formatting so the dict is only rendered when the level is enabled.
    logger.debug("Document: %s", as_dict)

    return DocumentResponse(**as_dict)
|
||||
|
||||
|
||||
@router.get("/documents", response_model=List[DocumentResponse])
def list_documents(
    skip: int = Query(0, ge=0, description="Number of documents to skip"),
    limit: int = Query(100, ge=1, le=1000, description="Maximum number of documents to return"),
    # FIX: this parameter was (mis)named "UserInDB" after its model type
    # (which is not even imported in this module). It exists solely so the
    # route requires an authenticated caller; renaming an injected dependency
    # parameter does not change the HTTP interface.
    current_user=Depends(get_current_user),
    document_service: DocumentService = Depends(get_document_service)
) -> List[DocumentResponse]:
    """
    Retrieve a paginated list of documents.

    Args:
        skip: Number of documents to skip for pagination.
        limit: Maximum number of documents to return.
        current_user: Authenticated user (dependency; enforces auth only).
        document_service: Document service instance.

    Returns:
        List of documents in API response format.

    Raises:
        HTTPException: 500 if the underlying database operation fails.
    """
    try:
        documents = document_service.list_documents(skip=skip, limit=limit)
        return [
            _map_file_document_to_response(doc, document_service)
            for doc in documents
        ]
    except Exception as e:
        logger.error(f"Failed to list documents: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to retrieve documents"
        ) from e
|
||||
|
||||
|
||||
@router.get("/objects/{file_hash}")
async def get_object_by_hash(
    file_hash: str = Path(..., description="SHA256 hash of the object to retrieve"),
    document_service: DocumentService = Depends(get_document_service)
):
    """
    Serve object content by its hash.

    This endpoint serves files (original documents, PDFs, thumbnails) by their
    SHA256 hash. It supports all file types stored in the objects folder.

    Args:
        file_hash: SHA256 hash of the object.
        document_service: Document service dependency.

    Returns:
        Response with the requested object content and detected media type.

    Raises:
        HTTPException: If object not found (404) or server error (500).
    """
    try:
        # Check if object exists in the store.
        if not document_service.exists(file_hash):
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object not found"
            )

        # Get file path and verify the registered object is really on disk.
        file_path = document_service.get_document_path(file_hash)
        if not os.path.exists(file_path):
            logger.error(f"Object {file_hash} registered but file not found at {file_path}")
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Object file not found on disk"
            )

        try:
            file_content = document_service.get_document_content_by_hash(file_hash)
            if not file_content:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail="Object content not available"
                )

            # Detect MIME type from the content itself (lazy third-party
            # import so the module loads even where python-magic is absent).
            import magic
            mime_type = magic.from_buffer(file_content, mime=True)

            # Return file content with appropriate headers.
            return Response(
                content=file_content,
                media_type=mime_type,
                headers={
                    "Content-Length": str(len(file_content)),
                    "Cache-Control": "public, max-age=3600"  # Cache for 1 hour
                }
            )

        except HTTPException:
            # FIX: the broad handler below used to catch the 404 raised just
            # above and convert it into a 500; propagate HTTP errors as-is.
            raise
        except Exception as e:
            logger.error(f"Error reading object content for hash {file_hash}: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to read object content"
            ) from e

    except HTTPException:
        # Re-raise HTTP exceptions as-is.
        raise
    except Exception as e:
        logger.error(f"Unexpected error serving object {file_hash}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Internal server error while serving object"
        ) from e
|
||||
@@ -4,7 +4,7 @@ MongoDB database connection management.
|
||||
This module handles MongoDB connection with fail-fast approach.
|
||||
The application will terminate if MongoDB is not accessible at startup.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
@@ -13,11 +13,14 @@ from pymongo.database import Database
|
||||
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
||||
|
||||
from app.config.settings import get_mongodb_url, get_mongodb_database_name
|
||||
from app.utils.security import safe_connection_string
|
||||
|
||||
# Global variables for singleton pattern
|
||||
_client: Optional[MongoClient] = None
|
||||
_database: Optional[Database] = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_mongodb_client() -> MongoClient:
|
||||
"""
|
||||
@@ -43,16 +46,16 @@ def create_mongodb_client() -> MongoClient:
|
||||
# Test connection by running admin command
|
||||
client.admin.command('ping')
|
||||
|
||||
print(f"Successfully connected to MongoDB at {mongodb_url}")
|
||||
logger.info(f"Successfully connected to MongoDB at {safe_connection_string(mongodb_url)}")
|
||||
return client
|
||||
|
||||
except (ConnectionFailure, ServerSelectionTimeoutError) as e:
|
||||
print(f"ERROR: Failed to connect to MongoDB at {mongodb_url}")
|
||||
print(f"Connection error: {str(e)}")
|
||||
print("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
|
||||
logger.error(f"ERROR: Failed to connect to MongoDB at {safe_connection_string(mongodb_url)}")
|
||||
logger.error(f"Connection error: {str(e)}")
|
||||
logger.error("MongoDB is required for this application. Please ensure MongoDB is running and accessible.")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
|
||||
logger.error(f"ERROR: Unexpected error connecting to MongoDB: {str(e)}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -74,7 +77,7 @@ def get_database() -> Database:
|
||||
|
||||
database_name = get_mongodb_database_name()
|
||||
_database = _client[database_name]
|
||||
print(f"Connected to database: {database_name}")
|
||||
logger.info(f"Connected to database: {database_name}")
|
||||
|
||||
return _database
|
||||
|
||||
@@ -92,7 +95,7 @@ def close_database_connection():
|
||||
_client.close()
|
||||
_client = None
|
||||
_database = None
|
||||
print("MongoDB connection closed")
|
||||
logger.info("MongoDB connection closed")
|
||||
|
||||
|
||||
def get_mongodb_client() -> Optional[MongoClient]:
|
||||
|
||||
@@ -17,6 +17,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from app.api.routes.auth import router as auth_router
|
||||
from app.api.routes.users import router as users_router
|
||||
from app.api.routes.document import router as documents_router
|
||||
from app.config import settings
|
||||
from app.database.connection import get_database
|
||||
from app.file_watcher import create_file_watcher, FileWatcher
|
||||
@@ -111,7 +112,7 @@ app.add_middleware(
|
||||
# Include routers
|
||||
app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
|
||||
app.include_router(users_router, prefix="/users", tags=["User Management"])
|
||||
# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
|
||||
app.include_router(documents_router, prefix="/api", tags=["Documents"])
|
||||
# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])
|
||||
|
||||
|
||||
|
||||
@@ -7,10 +7,9 @@ stored in MongoDB collections.
|
||||
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from bson import ObjectId
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
from pydantic import BaseModel, Field, field_validator, ConfigDict
|
||||
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
@@ -50,6 +49,7 @@ class FileDocument(BaseModel):
|
||||
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
pdf_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the associated pdf file content")
|
||||
thumbnail_file_hash: Optional[str] = Field(default=None, description="SHA256 hash of the thumbnail")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
@@ -69,3 +69,28 @@ class FileDocument(BaseModel):
|
||||
if not v.strip():
|
||||
raise ValueError("Filename cannot be empty")
|
||||
return v.strip()
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    """
    Response model for document API endpoints.

    Represents a document in the format expected by the frontend application.
    Field names are automatically converted from snake_case to camelCase.
    """

    # alias_generator turns each snake_case field name into its camelCase
    # alias (e.g. "file_size" -> "fileSize") for JSON serialization;
    # populate_by_name=True lets server code still construct the model with
    # the original snake_case names (as the route handlers do).
    model_config = ConfigDict(alias_generator=lambda field_name: ''.join(
        word.capitalize() if i > 0 else word
        for i, word in enumerate(field_name.split('_'))
    ), populate_by_name=True)

    # Stringified database ObjectId, not the raw BSON value.
    id: str = Field(..., description="Document unique identifier")
    name: str = Field(..., description="Document filename")
    original_file_type: str = Field(..., description="Original file type before conversion")
    # ISO-8601 string (empty when the source record has no timestamp).
    created_at: str = Field(..., description="ISO timestamp when document was created")
    file_size: int = Field(..., description="File size in bytes")
    page_count: int = Field(..., description="Number of pages in the document")
    # URLs point at the /api/objects/{hash} endpoint; None when no rendition exists.
    thumbnail_url: Optional[str] = Field(default=None, description="URL to document thumbnail")
    pdf_url: Optional[str] = Field(default=None, description="URL to PDF version of document")
    tags: List[str] = Field(default_factory=list, description="Document tags")
    categories: List[str] = Field(default_factory=list, description="Document categories")
|
||||
|
||||
@@ -16,6 +16,7 @@ class ProcessingStatus(str, Enum):
|
||||
COMPLETED = "completed"
|
||||
SAVING_OBJECT = "saving_object"
|
||||
SAVING_PDF = "saving_pdf"
|
||||
CREATING_THUMBNAIL = "creating_thumbnail"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
|
||||
@@ -24,10 +24,22 @@ from app.models.document import (
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
from app.utils.pdf_converter import convert_to_pdf
|
||||
from app.utils.pdf_thumbmail import PDFThumbnailGenerator
|
||||
from app.utils.security import generate_uuid_filename
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentAlreadyExists(Exception):
|
||||
def __init__(self, message):
|
||||
self.message = message
|
||||
|
||||
|
||||
class DocumentProcessingError(Exception):
|
||||
def __init__(self, message):
|
||||
self.message = message
|
||||
|
||||
|
||||
class DocumentService:
|
||||
"""
|
||||
Service for orchestrated document and content management.
|
||||
@@ -162,7 +174,7 @@ class DocumentService:
|
||||
# Increment counter for next attempt
|
||||
counter += 1
|
||||
|
||||
def _get_document_path(self, file_hash):
|
||||
def get_document_path(self, file_hash):
|
||||
"""
|
||||
|
||||
:param file_hash:
|
||||
@@ -171,10 +183,12 @@ class DocumentService:
|
||||
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
|
||||
|
||||
def exists(self, file_hash):
|
||||
return os.path.exists(self._get_document_path(file_hash))
|
||||
if file_hash is None:
|
||||
return False
|
||||
return os.path.exists(self.get_document_path(file_hash))
|
||||
|
||||
def save_content_if_needed(self, file_hash, content: bytes):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
target_path = self.get_document_path(file_hash)
|
||||
if os.path.exists(target_path):
|
||||
return
|
||||
|
||||
@@ -192,7 +206,8 @@ class DocumentService:
|
||||
|
||||
def move_to_ignored(self, file_path, reason="Unknown"):
|
||||
logger.info(f"Moving file {file_path} to ignored folder")
|
||||
ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(file_path)
|
||||
ignored_file_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + f"_### {reason} ###_" + os.path.basename(
|
||||
file_path)
|
||||
ignored_file_path = self._get_safe_path(os.path.join(self.ignored_folder, ignored_file_name))
|
||||
shutil.move(file_path, ignored_file_path)
|
||||
|
||||
@@ -231,15 +246,16 @@ class DocumentService:
|
||||
detected_at = datetime.now()
|
||||
|
||||
try:
|
||||
logger.info(f"Creating Document for {file_path}")
|
||||
logger.info(f'Creating Document for "{file_path}"')
|
||||
# Skip the document if it already exists
|
||||
same_document = self.document_repository.find_same_document(filename, file_hash)
|
||||
if same_document is not None:
|
||||
logger.info(f" Document with same hash already exists. Skipping...")
|
||||
self.move_to_ignored(file_path, f"already exists ({same_document.id})")
|
||||
raise DocumentAlreadyExists(f"Document with same hash already exists ({same_document.id})")
|
||||
|
||||
self.save_content_if_needed(file_hash, file_bytes)
|
||||
logger.info(f" Saved content to {self._get_document_path(file_hash)}")
|
||||
logger.info(f" Saved content to {self.get_document_path(file_hash)}")
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
@@ -255,11 +271,13 @@ class DocumentService:
|
||||
mime_type=mime_type
|
||||
)
|
||||
|
||||
created_file = self.document_repository.create_document(file_data)
|
||||
logger.info(f" Created document with id '{created_file.id}'")
|
||||
created_document = self.document_repository.create_document(file_data)
|
||||
logger.info(f" Created document with id '{created_document.id}'")
|
||||
|
||||
return created_file
|
||||
return created_document
|
||||
|
||||
except DocumentAlreadyExists as e:
|
||||
raise e
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
@@ -273,40 +291,69 @@ class DocumentService:
|
||||
document = self.get_document_by_id(document_id)
|
||||
if document is None:
|
||||
logger.error(f" Document not found")
|
||||
raise ValueError(f"Document {document_id} not found")
|
||||
raise DocumentProcessingError(f"Document {document_id} not found.")
|
||||
|
||||
# try to find another document that has the same hash
|
||||
document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
|
||||
|
||||
# the pdf will be created only if it does not exist yet
|
||||
if (document_with_same_hash is not None and
|
||||
document_with_same_hash.pdf_file_hash and
|
||||
self.exists(document_with_same_hash.pdf_file_hash)):
|
||||
logger.info(f"Found document with same hash. Will use pdf {document_with_same_hash.pdf_file_hash}")
|
||||
if document_with_same_hash and self.exists(document_with_same_hash.pdf_file_hash):
|
||||
logger.info(f'Found document with same hash. Will use pdf "{document_with_same_hash.pdf_file_hash}".')
|
||||
self.update_document(document_id, {"pdf_file_hash": document_with_same_hash.pdf_file_hash})
|
||||
return True
|
||||
return
|
||||
|
||||
# get the content of the file
|
||||
logger.info(f" No document with same hash found and valid pdf found. Will create new pdf")
|
||||
logger.info(f" No document with same hash and valid pdf found. Will create new pdf content.")
|
||||
file_bytes = self.get_document_content_by_hash(document.file_hash)
|
||||
if file_bytes is None:
|
||||
logger.error(f"Content for document {document_id} not found. hash = {document.file_hash}.")
|
||||
return False
|
||||
logger.error(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
|
||||
raise DocumentProcessingError(f'Content for document "{document_id}" not found. hash = "{document.file_hash}".')
|
||||
|
||||
# create the pdf file
|
||||
temp_pdf_file = convert_to_pdf(self._get_document_path(document.file_hash), self.temp_folder)
|
||||
temp_pdf_file = convert_to_pdf(self.get_document_path(document.file_hash), self.temp_folder)
|
||||
pdf_file_hash = self._calculate_file_hash(self._read_file_bytes(temp_pdf_file))
|
||||
self.save_content_if_needed(pdf_file_hash, self._read_file_bytes(temp_pdf_file))
|
||||
logger.info(f" Created new pdf file with hash {pdf_file_hash}")
|
||||
|
||||
# remove the temporary file
|
||||
os.remove(temp_pdf_file)
|
||||
logger.info(f" Removed temporary pdf file {temp_pdf_file}")
|
||||
os.remove(temp_pdf_file) # remove the temporary file
|
||||
logger.info(f' Created new pdf file with hash "{pdf_file_hash}"')
|
||||
|
||||
# update the document
|
||||
self.update_document(document_id, {"pdf_file_hash": pdf_file_hash})
|
||||
|
||||
def create_thumbnail(self, document_id: PyObjectId):
|
||||
logger.info(f'Creating thumbnail document for "{document_id}"')
|
||||
document = self.get_document_by_id(document_id)
|
||||
if document is None:
|
||||
logger.error(f" Document not found !")
|
||||
raise DocumentProcessingError(f"Document {document_id} not found.")
|
||||
|
||||
return True
|
||||
# try to find another document that has the same hash
|
||||
document_with_same_hash = self.get_document_with_pdf_hash(document.file_hash)
|
||||
|
||||
# We will use the thumbnail of the pdf if it exists
|
||||
if document_with_same_hash and self.exists(document_with_same_hash.thumbnail_file_hash):
|
||||
logger.info(f" Found document with same hash. Will use thumbnail {document_with_same_hash.thumbnail_file_hash}")
|
||||
self.update_document(document_id, {"thumbnail_file_hash": document_with_same_hash.thumbnail_file_hash})
|
||||
return
|
||||
|
||||
logger.info(f" No document with same hash and valid thumbnail found. Will create new thumbnail")
|
||||
|
||||
if not self.exists(document.pdf_file_hash):
|
||||
logger.error(f" PDF file not found.")
|
||||
raise DocumentProcessingError(f"PDF file for document {document_id} not found")
|
||||
|
||||
tmp_thumbnail_path = os.path.join(self.temp_folder, f"{generate_uuid_filename()}.png")
|
||||
with PDFThumbnailGenerator(self.get_document_path(document.pdf_file_hash)) as gen:
|
||||
# create the thumbnail
|
||||
gen.create_thumbnail(tmp_thumbnail_path, page_num=0, width=200)
|
||||
thumbnail_file_hash = self._calculate_file_hash(self._read_file_bytes(tmp_thumbnail_path))
|
||||
|
||||
# save the thumbnail to the objects folder
|
||||
self.save_content_if_needed(thumbnail_file_hash, self._read_file_bytes(tmp_thumbnail_path))
|
||||
os.remove(tmp_thumbnail_path)
|
||||
|
||||
# update the document
|
||||
self.update_document(document_id, {"thumbnail_file_hash": thumbnail_file_hash})
|
||||
logger.info(f" Created thumbnail {thumbnail_file_hash}")
|
||||
|
||||
def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -348,7 +395,7 @@ class DocumentService:
|
||||
return self.document_repository.find_document_by_filepath(filepath)
|
||||
|
||||
def get_document_content_by_hash(self, file_hash):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
target_path = self.get_document_path(file_hash)
|
||||
if not os.path.exists(target_path):
|
||||
return None
|
||||
|
||||
@@ -439,7 +486,7 @@ class DocumentService:
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
try:
|
||||
os.remove(self._get_document_path(document.file_hash))
|
||||
os.remove(self.get_document_path(document.file_hash))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
241
src/file-processor/app/utils/pdf_annotation.py
Normal file
241
src/file-processor/app/utils/pdf_annotation.py
Normal file
@@ -0,0 +1,241 @@
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
class PDFAnnotator:
|
||||
def __init__(self, pdf_path):
|
||||
self.doc = fitz.open(pdf_path)
|
||||
|
||||
def add_highlight(self, rect, page_num=0, color=(1, 1, 0)):
|
||||
"""
|
||||
Add highlight annotation
|
||||
|
||||
Args:
|
||||
rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
|
||||
page_num: Page number (0-indexed), default first page
|
||||
color: RGB tuple (0-1 range), default yellow
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_highlight_annot(rect)
|
||||
annot.set_colors(stroke=color)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_rectangle(self, rect, page_num=0, color=(1, 0, 0), width=2):
|
||||
"""
|
||||
Add rectangle annotation (border only)
|
||||
|
||||
Args:
|
||||
rect: (x0, y0, x1, y1) coordinates or fitz.Rect object
|
||||
page_num: Page number (0-indexed), default first page
|
||||
color: RGB tuple (0-1 range), default red
|
||||
width: Line width in points
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_rect_annot(rect)
|
||||
annot.set_colors(stroke=color)
|
||||
annot.set_border(width=width)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_text_note(self, point, text, page_num=0, icon="Note"):
|
||||
"""
|
||||
Add sticky note annotation
|
||||
|
||||
Args:
|
||||
point: (x, y) position tuple
|
||||
text: Note content string
|
||||
page_num: Page number (0-indexed), default first page
|
||||
icon: "Note", "Comment", "Help", "Insert", "Key", etc.
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_text_annot(point, text, icon=icon)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_free_text(self, rect, text, page_num=0, fontsize=12,
|
||||
color=(0, 0, 0)):
|
||||
"""
|
||||
Add free text annotation (visible text box)
|
||||
|
||||
Args:
|
||||
rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
|
||||
text: Text content string
|
||||
page_num: Page number (0-indexed), default first page
|
||||
fontsize: Font size in points
|
||||
color: Text color RGB tuple (0-1 range)
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_freetext_annot(
|
||||
rect,
|
||||
text,
|
||||
fontsize=fontsize,
|
||||
text_color=color
|
||||
)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_arrow(self, start_point, end_point, page_num=0,
|
||||
color=(1, 0, 0), width=2):
|
||||
"""
|
||||
Add arrow annotation
|
||||
|
||||
Args:
|
||||
start_point: (x, y) tuple for arrow start
|
||||
end_point: (x, y) tuple for arrow end
|
||||
page_num: Page number (0-indexed), default first page
|
||||
color: Arrow color RGB tuple (0-1 range), default red
|
||||
width: Line width in points
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_line_annot(start_point, end_point)
|
||||
annot.set_colors(stroke=color)
|
||||
annot.set_border(width=width)
|
||||
# Set arrow at end - use integer constant
|
||||
annot.set_line_ends(0, 1) # 1 = ClosedArrow
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_stamp(self, rect, page_num=0, stamp_type=0):
|
||||
"""
|
||||
Add stamp annotation
|
||||
|
||||
Args:
|
||||
rect: (x0, y0, x1, y1) bounding box tuple or fitz.Rect
|
||||
page_num: Page number (0-indexed), default first page
|
||||
stamp_type: Integer for stamp type:
|
||||
0=Approved, 1=AsIs, 2=Confidential,
|
||||
3=Departmental, 4=Draft, 5=Experimental,
|
||||
6=Expired, 7=Final, 8=ForComment,
|
||||
9=ForPublicRelease, 10=NotApproved, etc.
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_stamp_annot(rect, stamp=stamp_type)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def add_redaction(self, rect, page_num=0, fill_color=(0, 0, 0)):
|
||||
"""
|
||||
Add redaction annotation (marks area for redaction)
|
||||
Note: Use apply_redactions() to permanently remove content
|
||||
|
||||
Args:
|
||||
rect: (x0, y0, x1, y1) area to redact, tuple or fitz.Rect
|
||||
page_num: Page number (0-indexed), default first page
|
||||
fill_color: RGB tuple (0-1 range) for redacted area, default black
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annot = page.add_redact_annot(rect, fill=fill_color)
|
||||
annot.update()
|
||||
return annot
|
||||
|
||||
def apply_redactions(self, page_num=0, images=2, graphics=2, text=2):
|
||||
"""
|
||||
Apply all redaction annotations on a page (permanent removal)
|
||||
|
||||
Args:
|
||||
page_num: Page number (0-indexed), default first page
|
||||
images: 2=remove, 1=blank, 0=ignore
|
||||
graphics: 2=remove, 1=blank, 0=ignore
|
||||
text: 2=remove, 1=blank, 0=ignore
|
||||
|
||||
Returns:
|
||||
True if redactions were applied, False otherwise
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
# Check if page has redaction annotations
|
||||
has_redactions = any(annot.type[0] == 12 for annot in page.annots())
|
||||
|
||||
if has_redactions:
|
||||
page.apply_redactions(images=images, graphics=graphics, text=text)
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_all_annotations(self, page_num=0):
|
||||
"""
|
||||
Retrieve all annotations from a page
|
||||
|
||||
Args:
|
||||
page_num: Page number (0-indexed), default first page
|
||||
|
||||
Returns:
|
||||
List of dicts with annotation information
|
||||
"""
|
||||
page = self.doc[page_num]
|
||||
annotations = []
|
||||
|
||||
for annot in page.annots():
|
||||
info = {
|
||||
'type': annot.type[1], # Annotation type name
|
||||
'rect': annot.rect,
|
||||
'content': annot.info.get('content', ''),
|
||||
'author': annot.info.get('title', ''),
|
||||
'created': annot.info.get('creationDate', ''),
|
||||
'colors': annot.colors
|
||||
}
|
||||
annotations.append(info)
|
||||
|
||||
return annotations
|
||||
|
||||
def remove_all_annotations(self, page_num=0):
    """
    Remove all annotations from a page.

    Args:
        page_num: Page number (0-indexed), default first page
    """
    page = self.doc[page_num]
    # Deleting while iterating page.annots() can skip entries, because the
    # generator walks a linked list that delete_annot() mutates under it.
    # delete_annot() returns the next annotation, so chain until exhausted.
    annot = page.first_annot
    while annot:
        annot = page.delete_annot(annot)
||||
|
||||
def save(self, output_path):
    """Save the annotated PDF to *output_path* (string or Path)."""
    self.doc.save(output_path)
||||
|
||||
def close(self):
    """Close the underlying PDF document and release its resources."""
    self.doc.close()
||||
|
||||
def __enter__(self):
    """Enter the context manager; the annotator itself is the context value."""
    return self
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the document on context exit; exceptions are not suppressed."""
    self.close()
||||
|
||||
|
||||
# Example usage: exercises each PDFAnnotator feature on "input.pdf" and
# writes the result to "output_annotated.pdf".
if __name__ == "__main__":
    with PDFAnnotator("input.pdf") as annotator:
        # Add yellow highlight
        annotator.add_highlight((100, 100, 300, 120), page_num=0,
                                color=(1, 1, 0))

        # Add red rectangle border
        annotator.add_rectangle((100, 150, 300, 250), page_num=0,
                                color=(1, 0, 0), width=3)

        # Add sticky note
        annotator.add_text_note((400, 100), "This is important!",
                                page_num=0, icon="Comment")

        # Add visible text box
        annotator.add_free_text((100, 300, 400, 350), "DRAFT VERSION",
                                page_num=0, fontsize=20, color=(1, 0, 0))

        # Add arrow pointing to something
        annotator.add_arrow((450, 100), (500, 200), page_num=0,
                            color=(0, 0, 1), width=2)

        # Add "Approved" stamp
        annotator.add_stamp((450, 300, 550, 350), page_num=0, stamp_type=0)

        # Add redaction (black box over sensitive info), then make it
        # permanent — add_redaction alone only marks the area.
        annotator.add_redaction((100, 400, 300, 420), page_num=0)
        annotator.apply_redactions(page_num=0)

        # List all annotations
        annots = annotator.get_all_annotations(page_num=0)
        print(f"Found {len(annots)} annotations:")
        for a in annots:
            print(f"  - {a['type']} at {a['rect']}")

        # Save annotated PDF
        annotator.save("output_annotated.pdf")
||||
@@ -127,6 +127,15 @@ class TextToPdfConverter(BaseConverter):
|
||||
return self
|
||||
|
||||
|
||||
class PdfToPdfConverter(BaseConverter):
    """Converter for PDF files to PDF (pass-through copy)."""

    def convert(self) -> Self:
        """Copy the input PDF unchanged to the output location.

        Returns:
            Self, so calls chain like the other converters.
        """
        # Local import keeps this edit self-contained; the file-level
        # import block is outside this class.
        import shutil

        # shutil.copy is portable (no external `cp`), raises on failure,
        # and is safe with spaces or shell metacharacters in paths —
        # unlike the previous os.system(f"cp {...}") shell invocation,
        # which was also a command-injection risk for untrusted paths.
        shutil.copy(self.input_path, self.output_path)
        return self
||||
|
||||
|
||||
class ImageToPdfConverter(BaseConverter):
|
||||
"""Converter for image files to PDF."""
|
||||
|
||||
@@ -191,6 +200,8 @@ def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
|
||||
converter = ImageToPdfConverter(filepath, output_dir=output_dir)
|
||||
elif file_type == "word":
|
||||
converter = WordToPdfConverter(filepath, output_dir=output_dir)
|
||||
elif file_type == "pdf":
|
||||
converter = PdfToPdfConverter(filepath, output_dir=output_dir)
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {file_type}")
|
||||
|
||||
|
||||
167
src/file-processor/app/utils/pdf_thumbmail.py
Normal file
167
src/file-processor/app/utils/pdf_thumbmail.py
Normal file
@@ -0,0 +1,167 @@
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
class PDFThumbnailGenerator:
    """Generate PNG thumbnails of PDF pages using PyMuPDF (fitz)."""

    def __init__(self, pdf_path):
        """
        Initialize PDF thumbnail generator.

        Args:
            pdf_path: Path to the PDF file (string or Path object)
        """
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)

    def create_thumbnail(self, output_path, page_num=0, width=200, rotation=0, zoom_factor=1.0):
        """
        Create a thumbnail with zoom and rotation.

        Args:
            output_path: Path to save the thumbnail (string or Path)
            page_num: Page number (0-indexed), default first page
            width: Desired width in pixels, default 200
            rotation: Rotation angle in degrees (0, 90, 180, 270), default 0
            zoom_factor: Additional zoom multiplier (1.0 = normal, 2.0 = 2x), default 1.0

        Returns:
            Dict with thumbnail info (width, height, rotation, zoom)
        """
        page = self.doc[page_num]

        # set_rotation() mutates the in-memory page object, which would
        # silently affect every later render of the same page (e.g. the
        # other sizes produced by create_multi_resolution_thumbnails).
        # Remember the original rotation and restore it afterwards.
        original_rotation = page.rotation
        page.set_rotation(rotation)
        try:
            # Scale so the rendered bitmap matches the requested pixel
            # width; page.rect reflects the rotation applied above.
            base_zoom = width / page.rect.width
            final_zoom = base_zoom * zoom_factor

            mat = fitz.Matrix(final_zoom, final_zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # str() so both plain strings and pathlib.Path are accepted.
            pix.save(str(output_path))
        finally:
            page.set_rotation(original_rotation)

        return {
            'width': pix.width,
            'height': pix.height,
            'rotation': rotation,
            'zoom': zoom_factor
        }

    def create_cropped_thumbnail(self, output_path, crop_rect=None, page_num=0, width=200):
        """
        Create a thumbnail of a specific region (zoom on area).

        Args:
            output_path: Path to save the thumbnail (string or Path)
            crop_rect: Tuple (x0, y0, x1, y1) in PDF coordinates for cropping,
                       or None for full page, default None
            page_num: Page number (0-indexed), default first page
            width: Desired width in pixels, default 200

        Returns:
            Tuple (width, height) of the generated thumbnail
        """
        page = self.doc[page_num]

        if crop_rect:
            # Zoom relative to the cropped region so it fills `width` px.
            rect = fitz.Rect(crop_rect)
            zoom = width / rect.width
        else:
            rect = page.rect
            zoom = width / page.rect.width

        mat = fitz.Matrix(zoom, zoom)

        # clip= renders only the selected rectangle.
        pix = page.get_pixmap(matrix=mat, clip=rect)
        pix.save(str(output_path))

        return pix.width, pix.height

    def get_page_info(self, page_num=0):
        """
        Get information about a specific page.

        Args:
            page_num: Page number (0-indexed), default first page

        Returns:
            Dict with page information (width, height, rotation, number, total_pages)
        """
        page = self.doc[page_num]
        return {
            'width': page.rect.width,
            'height': page.rect.height,
            'rotation': page.rotation,
            'number': page_num + 1,  # 1-based for display
            'total_pages': len(self.doc)
        }

    def create_multi_resolution_thumbnails(self, output_folder, page_num=0, sizes=(150, 300, 600)):
        """
        Create multiple thumbnails at different resolutions.

        Args:
            output_folder: Folder path to save thumbnails (string or Path)
            page_num: Page number (0-indexed), default first page
            sizes: Iterable of widths in pixels, default (150, 300, 600)

        Returns:
            Dict mapping each size to thumbnail info
        """
        output_folder = Path(output_folder)
        output_folder.mkdir(exist_ok=True, parents=True)

        results = {}
        for size in sizes:
            output_path = output_folder / f"thumb_{size}px.png"
            info = self.create_thumbnail(output_path, page_num=page_num, width=size)
            results[size] = info

        return results

    def close(self):
        """Close the PDF document and free resources."""
        self.doc.close()

    def __enter__(self):
        """Enter the context manager; the generator itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the document on context exit; exceptions are not suppressed."""
        self.close()
||||
|
||||
|
||||
# Example usage: demonstrates each PDFThumbnailGenerator feature against
# "example.pdf", writing PNG files into the working directory.
if __name__ == "__main__":
    # Basic usage with context manager
    with PDFThumbnailGenerator("example.pdf") as gen:
        # Standard thumbnail
        gen.create_thumbnail("thumb_standard.png", page_num=0, width=200)

        # Rotated thumbnail
        gen.create_thumbnail("thumb_rotated.png", page_num=0,
                             width=200, rotation=90)

        # Zoomed thumbnail (2x zoom)
        gen.create_thumbnail("thumb_zoomed.png", page_num=0,
                             width=200, zoom_factor=2.0)

        # Cropped/zoomed on specific area (x0, y0, x1, y1)
        gen.create_cropped_thumbnail("thumb_crop.png",
                                     crop_rect=(100, 100, 400, 400),
                                     page_num=0, width=300)

        # Multiple resolutions
        gen.create_multi_resolution_thumbnails("thumbnails/", page_num=0,
                                               sizes=[150, 300, 600])

        # Get page information
        info = gen.get_page_info(page_num=0)
        print(f"Page info: {info}")
||||
@@ -4,9 +4,10 @@ Password security utilities using bcrypt for secure password hashing.
|
||||
This module provides secure password hashing and verification functions
|
||||
using the bcrypt algorithm with automatic salt generation.
|
||||
"""
|
||||
import re
|
||||
import uuid
|
||||
|
||||
import bcrypt
|
||||
from typing import Union
|
||||
|
||||
|
||||
def hash_password(password: str) -> str:
|
||||
@@ -71,4 +72,33 @@ def verify_password(password: str, hashed_password: str) -> bool:
|
||||
# bcrypt raises ValueError for malformed hashes
|
||||
raise RuntimeError(f"Invalid hash format: {str(e)}")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to verify password: {str(e)}")
|
||||
raise RuntimeError(f"Failed to verify password: {str(e)}")
|
||||
|
||||
|
||||
def generate_uuid_filename() -> str:
    """Return a unique, collision-resistant filename derived from UUID4."""
    unique_id = uuid.uuid4()
    return f"{unique_id}"
||||
|
||||
|
||||
def safe_connection_string(connection_string: str) -> str:
    """
    Mask the password in a MongoDB connection string.

    Args:
        connection_string (str): The complete MongoDB connection string

    Returns:
        str: The connection string with password replaced by asterisks

    Example:
        >>> safe_connection_string("mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin")
        'mongodb://admin:*****@mongodb:27017/mydocmanager?authSource=admin'
    """
    # mongodb://<user>:<password>@<host>:<port>/<database>
    # Capture the text on both sides so only the secret itself is replaced;
    # strings without credentials fail to match and pass through unchanged.
    credential_pattern = re.compile(r'(mongodb://[^:]+:)([^@]+)(@.*)')
    return credential_pattern.sub(r'\1*****\3', connection_string)
||||
|
||||
@@ -10,6 +10,7 @@ pillow==11.3.0
|
||||
pydantic==2.11.9
|
||||
PyJWT==2.10.1
|
||||
pymongo==4.15.0
|
||||
PyMuPDF==1.26.4
|
||||
pypandoc==1.15
|
||||
python-multipart==0.0.20
|
||||
redis==6.4.0
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import {FaBuffer, FaPlus} from "react-icons/fa6";
|
||||
import { Link } from "react-router-dom";
|
||||
|
||||
const Menu = () => {
|
||||
return (
|
||||
<div className="p-4">
|
||||
<ul className="menu">
|
||||
<li className="menu-title">Exploration</li>
|
||||
<li><a><FaBuffer/>To Review</a></li>
|
||||
<li><Link to="/dashboard"><FaBuffer/>Dashboard</Link></li>
|
||||
<li><Link to="/documents"><FaBuffer/>To Review</Link></li>
|
||||
<li className="menu-title mt-4">Catégories</li>
|
||||
<li><a><i className="fas fa-plus"></i>Item</a></li>
|
||||
</ul>
|
||||
|
||||
@@ -64,8 +64,8 @@ const DocumentCard = memo(({ document, viewMode, onEdit, onDelete }) => {
|
||||
const renderThumbnail = () => (
|
||||
<figure className="relative overflow-hidden">
|
||||
<img
|
||||
src={thumbnailUrl}
|
||||
alt={`${name} thumbnail`}
|
||||
src={`http://localhost:8000${thumbnailUrl}`}
|
||||
alt={`${thumbnailUrl} thumbnail`}
|
||||
className={`w-full object-cover ${
|
||||
viewMode === 'small' ? 'h-32' : viewMode === 'large' ? 'h-48' : 'h-64'
|
||||
}`}
|
||||
|
||||
@@ -5,17 +5,24 @@
|
||||
*/
|
||||
|
||||
import { mockDocuments, availableTags, availableCategories } from '../utils/mockData';
|
||||
import api from '../utils/api';
|
||||
|
||||
// Simulate network delay
|
||||
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
|
||||
|
||||
/**
 * Fetches all documents from the API.
 * @returns {Promise<Array>} Array of document objects
 */
export const getAllDocuments = async () => {
  try {
    const response = await api.get('/api/documents');
    return response.data;
  } catch (error) {
    console.error('Failed to fetch documents:', error);
    // Fallback to mock data in case of API error during development.
    // Previously this branch fell through and resolved to `undefined`,
    // despite logging that it was falling back — actually return the mocks.
    console.warn('Falling back to mock data');
    return [...mockDocuments];
  }
};
||||
|
||||
/**
|
||||
|
||||
@@ -10,6 +10,7 @@ pillow==11.3.0
|
||||
pydantic==2.11.9
|
||||
PyJWT==2.10.1
|
||||
pymongo==4.15.0
|
||||
PyMuPDF==1.26.4
|
||||
pypandoc==1.15
|
||||
python-multipart==0.0.20
|
||||
redis==6.4.0
|
||||
|
||||
@@ -12,7 +12,7 @@ from typing import Any, Dict
|
||||
from app.config import settings
|
||||
from app.database.connection import get_database
|
||||
from app.models.job import ProcessingStatus
|
||||
from app.services.document_service import DocumentService
|
||||
from app.services.document_service import DocumentService, DocumentAlreadyExists
|
||||
from app.services.job_service import JobService
|
||||
from tasks.main import celery_app
|
||||
|
||||
@@ -26,7 +26,7 @@ def get_services():
|
||||
return document_service, job_service
|
||||
|
||||
|
||||
#@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
|
||||
# @celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
|
||||
@celery_app.task(bind=True)
|
||||
def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -48,7 +48,7 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
Exception: Any processing error (will trigger retry)
|
||||
"""
|
||||
task_id = self.request.id
|
||||
logger.info(f"Starting document processing task {task_id} for file: {filepath}")
|
||||
logger.info(f'Task {task_id} : Starting document processing for file: "{filepath}"')
|
||||
|
||||
# get services
|
||||
document_service, job_service = get_services()
|
||||
@@ -60,12 +60,16 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
document = document_service.create_document(filepath)
|
||||
job = job_service.create_job(task_id=task_id, document_id=document.id)
|
||||
job_service.mark_job_as_started(job_id=job.id)
|
||||
logger.info(f"Task {task_id} created for document {document.id} from file path: {filepath} and job id: {job.id}")
|
||||
logger.info(f'Task {task_id} : Created document "{document.id}". Started job "{job.id}"')
|
||||
|
||||
logger.info(f"Task {task_id} : Creating associated PDF")
|
||||
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.SAVING_PDF)
|
||||
document_service.create_pdf(document.id)
|
||||
|
||||
logger.info(f"Task {task_id} : Creating thumbnail")
|
||||
job_service.update_job_status(job_id=job.id, status=ProcessingStatus.CREATING_THUMBNAIL)
|
||||
document_service.create_thumbnail(document.id)
|
||||
|
||||
# remove the file from the watch folder
|
||||
os.remove(filepath)
|
||||
|
||||
@@ -79,6 +83,19 @@ def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
"status": "completed",
|
||||
}
|
||||
|
||||
except DocumentAlreadyExists as e:
|
||||
logger.info(f"Task {task_id} completed: {str(e)}")
|
||||
if job is not None:
|
||||
job_service.mark_job_as_completed(job_id=job.id)
|
||||
logger.info(f"Job {task_id} marked as COMPLETED")
|
||||
|
||||
return {
|
||||
"task_id": task_id,
|
||||
"filepath": filepath,
|
||||
"status": "completed",
|
||||
"message": str(e),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Document processing failed: {str(e)}"
|
||||
logger.error(f"Task {task_id} failed: {error_message}")
|
||||
|
||||
@@ -618,7 +618,7 @@ class TestCreatePdf:
|
||||
assert updated_doc.pdf_file_hash == pdf_hash
|
||||
|
||||
# Verify convert_to_pdf was called with correct arguments
|
||||
doc_path = document_service._get_document_path(created_doc.file_hash)
|
||||
doc_path = document_service.get_document_path(created_doc.file_hash)
|
||||
mock_convert_to_pdf.assert_called_once_with(doc_path, document_service.temp_folder)
|
||||
|
||||
# Verify content exists on disk
|
||||
@@ -694,7 +694,7 @@ class TestCreatePdf:
|
||||
)
|
||||
|
||||
# Simulate missing content by removing file
|
||||
file_path = document_service._get_document_path(created_doc.file_hash)
|
||||
file_path = document_service.get_document_path(created_doc.file_hash)
|
||||
os.remove(file_path)
|
||||
|
||||
# Execute
|
||||
|
||||
Reference in New Issue
Block a user