Implemented default pipeline
@@ -3,6 +3,12 @@ FROM python:3.12-slim
# Set working directory
WORKDIR /app

+# Install libmagic
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libmagic1 \
+    file \
+    && rm -rf /var/lib/apt/lists/*
+
# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
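libmagic is the native library behind the python-magic package, presumably installed here to back content-type detection. A minimal sketch of the kind of call that depends on it (assuming python-magic is listed in requirements.txt, which the diff does not show):

import magic  # python-magic binds the libmagic1 shared library installed above

with open("sample.pdf", "rb") as f:
    mime = magic.from_buffer(f.read(2048), mime=True)
print(mime)  # e.g. "application/pdf"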
src/file-processor/app/api/__init__.py (new file, 0 lines)
src/file-processor/app/api/dependencies.py (new file, 100 lines)
@@ -0,0 +1,100 @@
import jwt
from fastapi import Depends, HTTPException
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from jwt import InvalidTokenError
from starlette import status

from app.config import settings
from app.database.connection import get_database
from app.models.auth import UserRole
from app.models.user import UserInDB
from app.services.auth_service import AuthService
from app.services.user_service import UserService

security = HTTPBearer()


def get_auth_service() -> AuthService:
    """Dependency to get AuthService instance."""
    return AuthService()


def get_user_service() -> UserService:
    """Dependency to get UserService instance."""
    database = get_database()
    return UserService(database)


def get_current_user(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    user_service: UserService = Depends(get_user_service)
) -> UserInDB:
    """
    Dependency to get current authenticated user from JWT token.

    Args:
        credentials: HTTP Bearer credentials
        user_service: User service instance

    Returns:
        User: Current authenticated user

    Raises:
        HTTPException: If token is invalid or user not found
    """
    try:
        payload = jwt.decode(
            credentials.credentials,
            settings.get_jwt_secret_key(),
            algorithms=[settings.get_jwt_algorithm()]
        )
        username: str = payload.get("sub")
        if username is None:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Could not validate credentials",
                headers={"WWW-Authenticate": "Bearer"},
            )
    except InvalidTokenError:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
            headers={"WWW-Authenticate": "Bearer"},
        )

    user = user_service.get_user_by_username(username)
    if user is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
            headers={"WWW-Authenticate": "Bearer"},
        )

    if not user.is_active:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Inactive user"
        )

    return user


def get_admin_user(current_user: UserInDB = Depends(get_current_user)) -> UserInDB:
    """
    Dependency to ensure current user has admin role.

    Args:
        current_user: Current authenticated user

    Returns:
        User: Current user if admin

    Raises:
        HTTPException: If user is not admin
    """
    if current_user.role != UserRole.ADMIN:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Not enough permissions"
        )
    return current_user
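For orientation, a minimal sketch of how these dependencies attach to a route; the router and paths below are illustrative, not part of this commit:

from fastapi import APIRouter, Depends

from app.api.dependencies import get_admin_user, get_current_user
from app.models.user import UserInDB

example_router = APIRouter(tags=["example"])  # hypothetical router, for illustration only


@example_router.get("/whoami")
def whoami(current_user: UserInDB = Depends(get_current_user)):
    # get_current_user decodes the bearer token and loads the active user
    return {"username": current_user.username}


@example_router.get("/admin-check")
def admin_check(admin: UserInDB = Depends(get_admin_user)):
    # get_admin_user chains get_current_user and adds the role check
    return {"role": str(admin.role)}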
src/file-processor/app/api/routes/__init__.py (new file, 0 lines)
src/file-processor/app/api/routes/auth.py (new file, 80 lines)
@@ -0,0 +1,80 @@
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm

from app.api.dependencies import get_auth_service, get_current_user, get_user_service
from app.models.auth import LoginResponse, UserResponse
from app.models.user import UserInDB
from app.services.auth_service import AuthService
from app.services.user_service import UserService

router = APIRouter(tags=["authentication"])


@router.post("/login", response_model=LoginResponse)
def login(
    form_data: OAuth2PasswordRequestForm = Depends(),
    auth_service: AuthService = Depends(get_auth_service),
    user_service: UserService = Depends(get_user_service)
):
    """
    Authenticate user and return JWT token.

    Args:
        form_data: OAuth2 password form data
        auth_service: Auth service instance
        user_service: User service instance

    Returns:
        LoginResponse: JWT token and user info

    Raises:
        HTTPException: If authentication fails
    """
    incorrect_username_or_pwd = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Incorrect username or password",
        headers={"WWW-Authenticate": "Bearer"},
    )

    user = user_service.get_user_by_username(form_data.username)
    if (not user or
            not user.is_active or
            not auth_service.verify_user_password(form_data.password, user.hashed_password)):
        raise incorrect_username_or_pwd

    access_token = auth_service.create_access_token(data={"sub": user.username})

    return LoginResponse(
        access_token=access_token,
        user=UserResponse(
            _id=user.id,
            username=user.username,
            email=user.email,
            role=user.role,
            is_active=user.is_active,
            created_at=user.created_at,
            updated_at=user.updated_at
        )
    )


@router.get("/me", response_model=UserResponse)
def get_current_user_profile(current_user: UserInDB = Depends(get_current_user)):
    """
    Get current user profile.

    Args:
        current_user: Current authenticated user

    Returns:
        UserResponse: Current user profile without sensitive data
    """
    return UserResponse(
        _id=current_user.id,
        username=current_user.username,
        email=current_user.email,
        role=current_user.role,
        is_active=current_user.is_active,
        created_at=current_user.created_at,
        updated_at=current_user.updated_at
    )
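A client-side sketch of the login flow, assuming these routes are mounted so that they resolve to /auth/login and /auth/me on localhost:8000 (the mount points and host are assumptions, not shown in this commit):

import requests

# OAuth2PasswordRequestForm expects form-encoded fields, not JSON
resp = requests.post(
    "http://localhost:8000/auth/login",
    data={"username": "admin", "password": "secret"},
)
token = resp.json()["access_token"]

me = requests.get(
    "http://localhost:8000/auth/me",
    headers={"Authorization": f"Bearer {token}"},
)
print(me.json()["username"])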
src/file-processor/app/api/routes/users.py (new file, 172 lines)
@@ -0,0 +1,172 @@
from fastapi import APIRouter, Depends, HTTPException
from starlette import status

from app.api.dependencies import get_admin_user, get_user_service
from app.models.auth import UserResponse, MessageResponse
from app.models.types import PyObjectId
from app.models.user import UserInDB, UserCreate, UserUpdate
from app.services.user_service import UserService

router = APIRouter(tags=["users"])


@router.get("", response_model=list[UserInDB])
def list_users(
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    List all users (admin only).

    Args:
        admin_user: Current admin user
        user_service: User service instance

    Returns:
        list[UserInDB]: List of all users
    """
    return user_service.list_users()


@router.get("/{user_id}", response_model=UserResponse)
def get_user_by_id(
    user_id: PyObjectId,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    Get specific user by ID (admin only).

    Args:
        user_id: User ID to retrieve
        admin_user: Current admin user
        user_service: User service instance

    Returns:
        UserResponse: User information without sensitive data

    Raises:
        HTTPException: If user not found
    """
    user = user_service.get_user_by_id(str(user_id))
    if not user:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    return user


@router.post("", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
def create_user(
    user_data: UserCreate,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    Create new user (admin only).

    Args:
        user_data: User creation data
        admin_user: Current admin user
        user_service: User service instance

    Returns:
        UserResponse: Created user information without sensitive data

    Raises:
        HTTPException: If user creation fails
    """
    try:
        user = user_service.create_user(user_data)
        return UserResponse(
            _id=user.id,
            username=user.username,
            email=user.email,
            role=user.role,
            is_active=user.is_active,
            created_at=user.created_at,
            updated_at=user.updated_at
        )
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        )


@router.put("/{user_id}", response_model=UserResponse)
def update_user(
    user_id: PyObjectId,
    user_data: UserUpdate,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    Update existing user (admin only).

    Args:
        user_id: User ID to update
        user_data: User update data
        admin_user: Current admin user
        user_service: User service instance

    Returns:
        UserResponse: Updated user information without sensitive data

    Raises:
        HTTPException: If user not found or update fails
    """
    try:
        user = user_service.update_user(str(user_id), user_data)
        if not user:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="User not found"
            )

        return UserResponse(
            _id=user.id,
            username=user.username,
            email=user.email,
            role=user.role,
            is_active=user.is_active,
            created_at=user.created_at,
            updated_at=user.updated_at
        )
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e)
        )


@router.delete("/{user_id}", response_model=MessageResponse)
def delete_user(
    user_id: PyObjectId,
    admin_user: UserInDB = Depends(get_admin_user),
    user_service: UserService = Depends(get_user_service)
):
    """
    Delete user by ID (admin only).

    Args:
        user_id: User ID to delete
        admin_user: Current admin user
        user_service: User service instance

    Returns:
        MessageResponse: Success message

    Raises:
        HTTPException: If user not found or deletion fails
    """
    success = user_service.delete_user(str(user_id))
    if not success:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    return MessageResponse(message="User successfully deleted")
@@ -6,7 +6,6 @@ using simple os.getenv() approach without external validation libraries.
"""

import os
from typing import Optional


def get_mongodb_url() -> str:
@@ -31,6 +30,26 @@ def get_mongodb_database_name() -> str:
    return os.getenv("MONGODB_DATABASE", "mydocmanager")


+def get_redis_url() -> str:
+    return os.getenv("REDIS_URL", "redis://localhost:6379/0")
+
+
+# def get_redis_host() -> str:
+#     redis_url = get_redis_url()
+#     if redis_url.startswith("redis://"):
+#         return redis_url.split("redis://")[1].split("/")[0]
+#     else:
+#         return redis_url
+#
+#
+# def get_redis_port() -> int:
+#     redis_url = get_redis_url()
+#     if redis_url.startswith("redis://"):
+#         return int(redis_url.split("redis://")[1].split("/")[0].split(":")[1])
+#     else:
+#         return int(redis_url.split(":")[1])


def get_jwt_secret_key() -> str:
    """
    Get JWT secret key from environment variables.
@@ -82,4 +101,19 @@ def is_development_environment() -> bool:
    Returns:
        bool: True if development environment
    """
    return os.getenv("ENVIRONMENT", "development").lower() == "development"
+
+
+def get_objects_folder() -> str:
+    """
+    Get Vault path from environment variables.
+
+    Returns:
+        str: Vault path
+    """
+    return os.getenv("OBJECTS_FOLDER", "/objects")
+
+
+def watch_directory() -> str:
+    """Directory to monitor for new files"""
+    return os.getenv("WATCH_DIRECTORY", "/watched_files")
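The commented-out helpers above derive host and port by string splitting, which breaks on URLs with credentials or a missing port. If they are ever revived, a sketch along these lines (an assumption, not part of the commit) would be more robust:

from urllib.parse import urlsplit


def get_redis_host_port(redis_url: str) -> tuple[str, int]:
    # urlsplit handles scheme, optional credentials, and optional port uniformly
    parts = urlsplit(redis_url)
    return parts.hostname or "localhost", parts.port or 6379


# get_redis_host_port("redis://localhost:6379/0") -> ("localhost", 6379)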
@@ -7,6 +7,7 @@ The application will terminate if MongoDB is not accessible at startup.

import sys
from typing import Optional

from pymongo import MongoClient
+from pymongo.database import Database
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
@@ -107,6 +108,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
    return _client


+def get_extra_args(session):
+    # Build kwargs only if session is provided
+    kwargs = {}
+    if session is not None:
+        kwargs["session"] = session
+
+    return kwargs


def test_database_connection() -> bool:
    """
    Test if database connection is working.
@@ -122,4 +132,4 @@ def test_database_connection() -> bool:
        db.command('ping')
        return True
    except Exception:
        return False
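get_extra_args lets repository methods forward an optional session to PyMongo only when one is provided, so call sites stay identical inside and outside transactions. An illustrative sketch, with client and db assumed to exist:

# Inside a transaction every write passes the session through get_extra_args;
# outside one, the same call sites receive no extra kwargs at all.
with client.start_session() as session:
    with session.start_transaction():
        db.files.insert_one({"x": 1}, **get_extra_args(session))
        db.processing_jobs.insert_one({"y": 2}, **get_extra_args(session))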
@@ -1,214 +0,0 @@
from typing import List, Optional
from datetime import datetime
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
from pymongo.errors import DuplicateKeyError, PyMongoError
from bson import ObjectId

from app.models.document import DocumentContent


class DocumentContentRepository:
    """
    Repository class for document content CRUD operations in MongoDB.

    This class handles all database operations related to document content,
    following the repository pattern with dependency injection and async/await.
    """

    def __init__(self, database: AsyncIOMotorDatabase):
        """
        Initialize repository with database dependency.

        Args:
            database (AsyncIOMotorDatabase): MongoDB database instance
        """
        self.db = database
        self.collection: AsyncIOMotorCollection = database.document_contents
        self._ensure_indexes()

    async def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
        await self._ensure_indexes()

    async def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on file_hash field to prevent duplicates.
        """
        try:
            await self.collection.create_index("file_hash", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

    async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
        """
        Create a new document content in the database.

        Args:
            document_content (DocumentContent): Document content data

        Returns:
            DocumentContent: Created document content with database ID

        Raises:
            DuplicateKeyError: If file_hash already exists
            ValueError: If document content creation fails due to validation
        """
        document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)

        # Remove _id if it's None to let MongoDB generate it
        if document_dict.get("_id") is None:
            document_dict.pop("_id", None)

        try:
            result = await self.collection.insert_one(document_dict)
            document_dict["_id"] = result.inserted_id
            return DocumentContent(**document_dict)
        except DuplicateKeyError as e:
            raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
        except PyMongoError as e:
            raise ValueError(f"Failed to create document content: {e}")

    async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
        """
        Find document content by ID.

        Args:
            document_id (str): Document content ID to search for

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Find document content by file hash.

        Args:
            file_hash (str): File hash to search for

        Returns:
            DocumentContent or None: Document content if found, None otherwise
        """
        try:
            document_doc = await self.collection.find_one({"file_hash": file_hash})
            if document_doc:
                return DocumentContent(**document_doc)
            return None
        except PyMongoError:
            return None

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if document content exists by file hash.

        Args:
            file_hash (str): File hash to check

        Returns:
            bool: True if document content exists, False otherwise
        """
        try:
            count = await self.collection.count_documents({"file_hash": file_hash})
            return count > 0
        except PyMongoError:
            return False

    async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
        """
        Update document content information.

        Args:
            document_id (str): Document content ID to update
            update_data (dict): Updated document content data

        Returns:
            DocumentContent or None: Updated document content if found, None otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return None

            # Remove None values and _id from update data
            clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}

            if not clean_update_data:
                return await self.find_document_content_by_id(document_id)

            result = await self.collection.find_one_and_update(
                {"_id": ObjectId(document_id)},
                {"$set": clean_update_data},
                return_document=True
            )

            if result:
                return DocumentContent(**result)
            return None

        except PyMongoError:
            return None

    async def delete_document_content(self, document_id: str) -> bool:
        """
        Delete document content from database.

        Args:
            document_id (str): Document content ID to delete

        Returns:
            bool: True if document content was deleted, False otherwise
        """
        try:
            if not ObjectId.is_valid(document_id):
                return False

            result = await self.collection.delete_one({"_id": ObjectId(document_id)})
            return result.deleted_count > 0
        except PyMongoError:
            return False

    async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
        """
        List document contents with pagination.

        Args:
            skip (int): Number of document contents to skip (default: 0)
            limit (int): Maximum number of document contents to return (default: 100)

        Returns:
            List[DocumentContent]: List of document contents
        """
        try:
            cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
            document_docs = await cursor.to_list(length=limit)
            return [DocumentContent(**document_doc) for document_doc in document_docs]
        except PyMongoError:
            return []

    async def count_document_contents(self) -> int:
        """
        Count total number of document contents.

        Returns:
            int: Total number of document contents in database
        """
        try:
            return await self.collection.count_documents({})
        except PyMongoError:
            return 0
@@ -6,9 +6,13 @@ in MongoDB with proper error handling and type safety.
"""

from typing import Optional, List

from bson import ObjectId
+from pymongo.collection import Collection
+from pymongo.database import Database
from pymongo.errors import DuplicateKeyError, PyMongoError
-from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase

+from app.database.connection import get_extra_args
from app.models.document import FileDocument
from app.utils.document_matching import fuzzy_matching, subsequence_matching

@@ -34,52 +38,49 @@ class FileDocumentRepository:
    with proper error handling and data validation.
    """

-    def __init__(self, database: AsyncIOMotorDatabase):
+    def __init__(self, database: Database):
        """Initialize file repository with database connection."""
        self.db = database
-        self.collection: AsyncIOMotorCollection = self.db.files
-        self._ensure_indexes()
+        self.collection: Collection = self.db.documents

-    async def initialize(self):
+    def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
-        await self._ensure_indexes()
+        self._ensure_indexes()
+        return self

-    async def _ensure_indexes(self):
+    def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on filepath field to prevent duplicates.
        """
        try:
-            await self.collection.create_index("filepath", unique=True)
+            self.collection.create_index("filepath", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

-    async def create_document(self, file_data: FileDocument) -> FileDocument:
+    def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
        """
        Create a new file document in database.

        Args:
            file_data (FileDocument): File document data to create
+            session (ClientSession, optional): MongoDB session

        Returns:
-            FileDocument: Created file document with database ID
+            FileDocument: Created document with database ID

        Raises:
            ValueError: If file creation fails due to validation
-            DuplicateKeyError: If file with same hash already exists
+            DuplicateKeyError: If a document with same hash already exists
        """
        try:
            file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
            if "_id" in file_dict and file_dict["_id"] is None:
                del file_dict["_id"]

-            result = await self.collection.insert_one(file_dict)
+            result = self.collection.insert_one(file_dict, **get_extra_args(session))
            file_data.id = result.inserted_id
            return file_data

@@ -88,7 +89,7 @@ class FileDocumentRepository:
        except PyMongoError as e:
            raise ValueError(f"Failed to create file document: {e}")

-    async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
+    def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
        """
        Find file document by ID.

@@ -102,7 +103,7 @@ class FileDocumentRepository:
            if not ObjectId.is_valid(file_id):
                return None

-            file_doc = await self.collection.find_one({"_id": ObjectId(file_id)})
+            file_doc = self.collection.find_one({"_id": ObjectId(file_id)})
            if file_doc:
                return FileDocument(**file_doc)
            return None
@@ -110,7 +111,7 @@ class FileDocumentRepository:
        except PyMongoError:
            return None

-    async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
+    def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Find file document by file hash to detect duplicates.

@@ -121,7 +122,7 @@ class FileDocumentRepository:
            FileDocument or None: File document if found, None otherwise
        """
        try:
-            file_doc = await self.collection.find_one({"file_hash": file_hash})
+            file_doc = self.collection.find_one({"file_hash": file_hash})
            if file_doc:
                return FileDocument(**file_doc)
            return None
@@ -129,7 +130,7 @@ class FileDocumentRepository:
        except PyMongoError:
            return None

-    async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
+    def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Find file document by exact filepath.

@@ -140,7 +141,7 @@ class FileDocumentRepository:
            FileDocument or None: File document if found, None otherwise
        """
        try:
-            file_doc = await self.collection.find_one({"filepath": filepath})
+            file_doc = self.collection.find_one({"filepath": filepath})
            if file_doc:
                return FileDocument(**file_doc)
            return None
@@ -148,7 +149,7 @@ class FileDocumentRepository:
        except PyMongoError:
            return None

-    async def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
+    def find_document_by_name(self, filename: str, matching_method: MatchMethodBase = None) -> List[FileDocument]:
        """
        Find file documents by filename using fuzzy matching.

@@ -162,8 +163,7 @@ class FileDocumentRepository:
        try:
            # Get all files from database
            cursor = self.collection.find({})
-            all_files = await cursor.to_list(length=None)
-            all_documents = [FileDocument(**file_doc) for file_doc in all_files]
+            all_documents = [FileDocument(**file_doc) for file_doc in cursor]

            if isinstance(matching_method, FuzzyMatching):
                return fuzzy_matching(filename, all_documents, matching_method.threshold)
@@ -173,7 +173,7 @@ class FileDocumentRepository:
        except PyMongoError:
            return []

-    async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
+    def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
        """
        List file documents with pagination.

@@ -186,13 +186,12 @@ class FileDocumentRepository:
        """
        try:
            cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1)
-            file_docs = await cursor.to_list(length=limit)
-            return [FileDocument(**doc) for doc in file_docs]
+            return [FileDocument(**doc) for doc in cursor]

        except PyMongoError:
            return []

-    async def count_documents(self) -> int:
+    def count_documents(self) -> int:
        """
        Count total number of file documents.

@@ -200,17 +199,18 @@ class FileDocumentRepository:
            int: Total number of file documents in collection
        """
        try:
-            return await self.collection.count_documents({})
+            return self.collection.count_documents({})
        except PyMongoError:
            return 0

-    async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
+    def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
        """
        Update file document with new data.

        Args:
            file_id (str): File document ID to update
            update_data (dict): Fields to update
+            session (ClientSession, optional): MongoDB session

        Returns:
            FileDocument or None: Updated file document if successful, None otherwise
@@ -223,12 +223,13 @@ class FileDocumentRepository:
            clean_update_data = {k: v for k, v in update_data.items() if v is not None}

            if not clean_update_data:
-                return await self.find_document_by_id(file_id)
+                return self.find_document_by_id(file_id)

-            result = await self.collection.find_one_and_update(
+            result = self.collection.find_one_and_update(
                {"_id": ObjectId(file_id)},
                {"$set": clean_update_data},
-                return_document=True
+                return_document=True,
+                **get_extra_args(session)
            )

            if result:
@@ -238,12 +239,13 @@ class FileDocumentRepository:
        except PyMongoError:
            return None

-    async def delete_document(self, file_id: str) -> bool:
+    def delete_document(self, file_id: str, session=None) -> bool:
        """
        Delete file document from database.

        Args:
            file_id (str): File document ID to delete
+            session (ClientSession, optional): MongoDB session

        Returns:
            bool: True if file was deleted, False otherwise
@@ -252,7 +254,7 @@ class FileDocumentRepository:
            if not ObjectId.is_valid(file_id):
                return False

-            result = await self.collection.delete_one({"_id": ObjectId(file_id)})
+            result = self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
            return result.deleted_count > 0

        except PyMongoError:
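A hedged sketch of using the new session parameter to make two repository writes atomic; transactions require a MongoDB replica set, and the client, repo, doc objects and the "status" field here are assumed for illustration:

# Illustrative only: atomic create + update through the session kwargs.
with client.start_session() as session:
    with session.start_transaction():
        created = repo.create_document(doc, session=session)
        repo.update_document(str(created.id), {"status": "queued"}, session=session)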
src/file-processor/app/database/repositories/job_repository.py (new file, 230 lines)
@@ -0,0 +1,230 @@
"""
Repository for managing processing jobs in MongoDB.

This module provides data access layer for ProcessingJob operations
with automatic timestamp management and error handling.
"""

from datetime import datetime
from typing import List, Optional

from pymongo.collection import Collection
from pymongo.database import Database
from pymongo.errors import PyMongoError

from app.exceptions.job_exceptions import JobRepositoryError
from app.models.job import ProcessingJob, ProcessingStatus
from app.models.types import PyObjectId


class JobRepository:
    """
    Repository for processing job data access operations.

    Provides CRUD operations for ProcessingJob documents with automatic
    timestamp management and proper error handling.
    """

    def __init__(self, database: Database):
        """Initialize repository with MongoDB collection reference."""
        self.db = database
        self.collection: Collection = self.db.processing_jobs

    def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on document_id field to prevent duplicates.
        """
        try:
            self.collection.create_index("document_id", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

    def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
        self._ensure_indexes()
        return self

    def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
        """
        Create a new processing job.

        Args:
            document_id: Reference to the document record
            task_id: Optional Celery task UUID

        Returns:
            The created ProcessingJob

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            job_data = {
                "document_id": document_id,
                "status": ProcessingStatus.PENDING,
                "task_id": task_id,
                "created_at": datetime.now(),
                "started_at": None,
                "completed_at": None,
                "error_message": None
            }

            result = self.collection.insert_one(job_data)
            job_data["_id"] = result.inserted_id

            return ProcessingJob(**job_data)

        except PyMongoError as e:
            raise JobRepositoryError("create_job", e)

    def find_job_by_id(self, job_id: PyObjectId) -> Optional[ProcessingJob]:
        """
        Retrieve a job by its ID.

        Args:
            job_id: The job ObjectId

        Returns:
            The ProcessingJob document, or None if not found

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            job_data = self.collection.find_one({"_id": job_id})
            if job_data:
                return ProcessingJob(**job_data)

            return None

        except PyMongoError as e:
            raise JobRepositoryError("get_job_by_id", e)

    def update_job_status(
        self,
        job_id: PyObjectId,
        status: ProcessingStatus,
        error_message: Optional[str] = None
    ) -> Optional[ProcessingJob]:
        """
        Update job status with automatic timestamp management.

        Args:
            job_id: The job ObjectId
            status: New processing status
            error_message: Optional error message for failed jobs

        Returns:
            The updated ProcessingJob, or None if not found

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            # Prepare update data
            update_data = {"status": status}

            # Set appropriate timestamp based on status
            current_time = datetime.now()
            if status == ProcessingStatus.PROCESSING:
                update_data["started_at"] = current_time
            elif status in (ProcessingStatus.COMPLETED, ProcessingStatus.FAILED):
                update_data["completed_at"] = current_time

            # Add error message if provided
            if error_message is not None:
                update_data["error_message"] = error_message

            result = self.collection.find_one_and_update(
                {"_id": job_id},
                {"$set": update_data},
                return_document=True
            )

            if result:
                return ProcessingJob(**result)

            return None

        except PyMongoError as e:
            raise JobRepositoryError("update_job_status", e)

    def delete_job(self, job_id: PyObjectId) -> bool:
        """
        Delete a job from the database.

        Args:
            job_id: The job ObjectId

        Returns:
            True if job was deleted, False if not found

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            result = self.collection.delete_one({"_id": job_id})

            return result.deleted_count > 0

        except PyMongoError as e:
            raise JobRepositoryError("delete_job", e)

    def find_jobs_by_document_id(self, document_id: PyObjectId) -> List[ProcessingJob]:
        """
        Retrieve all jobs for a specific document.

        Args:
            document_id: The document ObjectId

        Returns:
            List of ProcessingJob documents

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            cursor = self.collection.find({"document_id": document_id})

            jobs = []
            for job_data in cursor:
                jobs.append(ProcessingJob(**job_data))

            return jobs

        except PyMongoError as e:
            raise JobRepositoryError("get_jobs_by_file_id", e)

    def get_jobs_by_status(self, status: ProcessingStatus) -> List[ProcessingJob]:
        """
        Retrieve all jobs with a specific status.

        Args:
            status: The processing status to filter by

        Returns:
            List of ProcessingJob documents

        Raises:
            JobRepositoryError: If database operation fails
        """
        try:
            cursor = self.collection.find({"status": status})

            jobs = []
            for job_data in cursor:
                jobs.append(ProcessingJob(**job_data))

            return jobs

        except PyMongoError as e:
            raise JobRepositoryError("get_jobs_by_status", e)
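A brief sketch of the intended job lifecycle with this repository; db and doc_id are assumed, the status values come from the code above, and job.id assumes the model exposes its _id as id:

repo = JobRepository(db).initialize()

job = repo.create_job(document_id=doc_id, task_id="celery-task-uuid")
repo.update_job_status(job.id, ProcessingStatus.PROCESSING)  # stamps started_at
repo.update_job_status(job.id, ProcessingStatus.COMPLETED)   # stamps completed_at
# On failure instead:
# repo.update_job_status(job.id, ProcessingStatus.FAILED, error_message="parse error")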
@@ -5,10 +5,12 @@ This module implements the repository pattern for user CRUD operations
with dependency injection of the database connection using async/await.
"""

-from typing import Optional, List
from datetime import datetime
+from typing import Optional, List

from bson import ObjectId
-from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
+from pymongo.collection import Collection
+from pymongo.database import Database
from pymongo.errors import DuplicateKeyError, PyMongoError

from app.models.user import UserCreate, UserInDB, UserUpdate
@@ -23,7 +25,7 @@ class UserRepository:
    following the repository pattern with dependency injection and async/await.
    """

-    def __init__(self, database: AsyncIOMotorDatabase):
+    def __init__(self, database: Database):
        """
        Initialize repository with database dependency.

@@ -31,30 +33,30 @@ class UserRepository:
            database (Database): MongoDB database instance
        """
        self.db = database
-        self.collection: AsyncIOMotorCollection = database.users
-        self._ensure_indexes()
+        self.collection: Collection = database.users

-    async def initialize(self):
+    def initialize(self):
        """
        Initialize repository by ensuring required indexes exist.

        Should be called after repository instantiation to setup database indexes.
        """
-        await self._ensure_indexes()
+        self._ensure_indexes()
+        return self

-    async def _ensure_indexes(self):
+    def _ensure_indexes(self):
        """
        Ensure required database indexes exist.

        Creates unique index on username field to prevent duplicates.
        """
        try:
-            await self.collection.create_index("username", unique=True)
+            self.collection.create_index("username", unique=True)
        except PyMongoError:
            # Index might already exist, ignore error
            pass

-    async def create_user(self, user_data: UserCreate) -> UserInDB:
+    def create_user(self, user_data: UserCreate) -> UserInDB:
        """
        Create a new user in the database.

@@ -79,7 +81,7 @@ class UserRepository:
        }

        try:
-            result = await self.collection.insert_one(user_dict)
+            result = self.collection.insert_one(user_dict)
            user_dict["_id"] = result.inserted_id
            return UserInDB(**user_dict)
        except DuplicateKeyError as e:
@@ -87,7 +89,7 @@ class UserRepository:
        except PyMongoError as e:
            raise ValueError(f"Failed to create user: {e}")

-    async def find_user_by_username(self, username: str) -> Optional[UserInDB]:
+    def find_user_by_username(self, username: str) -> Optional[UserInDB]:
        """
        Find user by username.

@@ -98,14 +100,14 @@ class UserRepository:
            UserInDB or None: User if found, None otherwise
        """
        try:
-            user_doc = await self.collection.find_one({"username": username})
+            user_doc = self.collection.find_one({"username": username})
            if user_doc:
                return UserInDB(**user_doc)
            return None
        except PyMongoError:
            return None

-    async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
+    def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
        """
        Find user by ID.

@@ -119,14 +121,14 @@ class UserRepository:
            if not ObjectId.is_valid(user_id):
                return None

-            user_doc = await self.collection.find_one({"_id": ObjectId(user_id)})
+            user_doc = self.collection.find_one({"_id": ObjectId(user_id)})
            if user_doc:
                return UserInDB(**user_doc)
            return None
        except PyMongoError:
            return None

-    async def find_user_by_email(self, email: str) -> Optional[UserInDB]:
+    def find_user_by_email(self, email: str) -> Optional[UserInDB]:
        """
        Find user by email address.

@@ -137,14 +139,14 @@ class UserRepository:
            UserInDB or None: User if found, None otherwise
        """
        try:
-            user_doc = await self.collection.find_one({"email": email})
+            user_doc = self.collection.find_one({"email": email})
            if user_doc:
                return UserInDB(**user_doc)
            return None
        except PyMongoError:
            return None

-    async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
+    def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
        """
        Update user information.

@@ -177,9 +179,9 @@ class UserRepository:
        clean_update_data = {k: v for k, v in update_data.items() if v is not None}

        if not clean_update_data:
-            return await self.find_user_by_id(user_id)
+            return self.find_user_by_id(user_id)

-        result = await self.collection.find_one_and_update(
+        result = self.collection.find_one_and_update(
            {"_id": ObjectId(user_id)},
            {"$set": clean_update_data},
            return_document=True
@@ -192,7 +194,7 @@ class UserRepository:
        except PyMongoError:
            return None

-    async def delete_user(self, user_id: str) -> bool:
+    def delete_user(self, user_id: str) -> bool:
        """
        Delete user from database.

@@ -206,12 +208,12 @@ class UserRepository:
            if not ObjectId.is_valid(user_id):
                return False

-            result = await self.collection.delete_one({"_id": ObjectId(user_id)})
+            result = self.collection.delete_one({"_id": ObjectId(user_id)})
            return result.deleted_count > 0
        except PyMongoError:
            return False

-    async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
+    def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
        """
        List users with pagination.

@@ -224,12 +226,12 @@ class UserRepository:
        """
        try:
            cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
-            user_docs = await cursor.to_list(length=limit)
+            user_docs = cursor.to_list(length=limit)
            return [UserInDB(**user_doc) for user_doc in user_docs]
        except PyMongoError:
            return []

-    async def count_users(self) -> int:
+    def count_users(self) -> int:
        """
        Count total number of users.

@@ -237,11 +239,11 @@ class UserRepository:
            int: Total number of users in database
        """
        try:
-            return await self.collection.count_documents({})
+            return self.collection.count_documents({})
        except PyMongoError:
            return 0

-    async def user_exists(self, username: str) -> bool:
+    def user_exists(self, username: str) -> bool:
        """
        Check if user exists by username.

@@ -252,7 +254,7 @@ class UserRepository:
            bool: True if user exists, False otherwise
        """
        try:
-            count = await self.collection.count_documents({"username": username})
+            count = self.collection.count_documents({"username": username})
            return count > 0
        except PyMongoError:
            return False
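With the async layer removed, callers now use the repository synchronously; a short sketch, with db assumed to be a pymongo Database:

repo = UserRepository(db).initialize()  # initialize() now returns self for chaining

user = repo.find_user_by_username("admin")
if user is None:
    print("no admin yet;", repo.count_users(), "users total")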
src/file-processor/app/exceptions/__init__.py (new file, 0 lines)
src/file-processor/app/exceptions/job_exceptions.py (new file, 38 lines)
@@ -0,0 +1,38 @@
"""
Custom exceptions for job management operations.

This module defines specific exceptions for job processing lifecycle
and repository operations to provide clear error handling.
"""

from app.models.job import ProcessingStatus


class InvalidStatusTransitionError(Exception):
    """
    Raised when an invalid status transition is attempted.

    This exception indicates that an attempt was made to change a job's
    status to an invalid target status given the current status.
    """

    def __init__(self, current_status: ProcessingStatus, target_status: ProcessingStatus):
        self.current_status = current_status
        self.target_status = target_status
        super().__init__(
            f"Invalid status transition from '{current_status}' to '{target_status}'"
        )


class JobRepositoryError(Exception):
    """
    Raised when a MongoDB operation fails in the job repository.

    This exception wraps database-related errors that occur during
    job repository operations.
    """

    def __init__(self, operation: str, original_error: Exception):
        self.operation = operation
        self.original_error = original_error
        super().__init__(f"Repository operation '{operation}' failed: {str(original_error)}")
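A small sketch of how callers are expected to handle these exceptions; job_repo, doc_id, and logger are assumed:

try:
    job = job_repo.create_job(document_id=doc_id)
except JobRepositoryError as e:
    # The exception carries the failed operation name and the root cause
    logger.error(f"{e.operation} failed: {e.original_error}")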
src/file-processor/app/file_watcher.py (new file, 243 lines)
@@ -0,0 +1,243 @@
"""
File watcher implementation with Watchdog observer and ProcessingJob management.

This module provides real-time file monitoring for document processing.
When a file is created in the watched directory, it:
1. Creates a document record via DocumentService
2. Dispatches a Celery task for processing
3. Creates a ProcessingJob to track the task lifecycle
"""

import logging
import threading
from pathlib import Path
from typing import Optional

from watchdog.events import FileSystemEventHandler, FileCreatedEvent
from watchdog.observers import Observer

from app.services.document_service import DocumentService
from app.services.job_service import JobService

logger = logging.getLogger(__name__)


class DocumentFileEventHandler(FileSystemEventHandler):
    """
    Event handler for document file creation events.

    Processes newly created files by creating document records,
    dispatching Celery tasks, and managing processing jobs.
    """

    SUPPORTED_EXTENSIONS = {'.txt', '.pdf', '.docx'}

    def __init__(self, document_service: DocumentService, job_service: JobService):
        """
        Initialize the event handler.

        Args:
            document_service: Service for document management
            job_service: Service for processing job management
        """
        super().__init__()
        self.document_service = document_service
        self.job_service = job_service

    def on_created(self, event: FileCreatedEvent) -> None:
        """
        Handle file creation events.

        Args:
            event: File system event containing file path information
        """
        if event.is_directory:
            return

        filepath = event.src_path
        file_extension = Path(filepath).suffix.lower()

        if file_extension not in self.SUPPORTED_EXTENSIONS:
            logger.info(f"Ignoring unsupported file type: {filepath}")
            return

        logger.info(f"Processing new file: {filepath}")

        # try:
        from tasks.document_processing import process_document
        task_result = process_document.delay(filepath)
        print(task_result)
        print("hello world")
        # task_id = task_result.task_id
        # logger.info(f"Dispatched Celery task with ID: {task_id}")

        # except Exception as e:
        #     logger.error(f"Failed to process file {filepath}: {str(e)}")
        #     # Note: We don't re-raise the exception to keep the watcher running


class FileWatcher:
    """
    File system watcher for automatic document processing.

    Monitors a directory for new files and triggers the processing pipeline
    using a dedicated observer thread.
    """

    def __init__(
        self,
        watch_directory: str,
        document_service: DocumentService,
        job_service: JobService,
        recursive: bool = True
    ):
        """
        Initialize the file watcher.

        Args:
            watch_directory: Directory path to monitor
            document_service: Service for document management
            job_service: Service for processing job management
            recursive: Whether to watch subdirectories recursively
        """
        self.watch_directory = Path(watch_directory)
        self.recursive = recursive
        self.observer: Optional[Observer] = None
        self._observer_thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # Validate watch directory
        if not self.watch_directory.exists():
            raise ValueError(f"Watch directory does not exist: {watch_directory}")

        if not self.watch_directory.is_dir():
            raise ValueError(f"Watch path is not a directory: {watch_directory}")

        # Create event handler
        self.event_handler = DocumentFileEventHandler(
            document_service=document_service,
            job_service=job_service
        )

        logger.info(f"FileWatcher initialized for directory: {self.watch_directory}")

    def start(self) -> None:
        """
        Start the file watcher in a separate thread.

        Raises:
            RuntimeError: If the watcher is already running
        """
        if self.is_running():
            raise RuntimeError("FileWatcher is already running")

        self.observer = Observer()
        self.observer.schedule(
            self.event_handler,
            str(self.watch_directory),
            recursive=self.recursive
        )

        # Start observer in separate thread
        self._observer_thread = threading.Thread(
            target=self._run_observer,
            name="FileWatcher-Observer"
        )
        self._stop_event.clear()
        self._observer_thread.start()

        logger.info("FileWatcher started successfully")

    def stop(self, timeout: float = 5.0) -> None:
        """
        Stop the file watcher gracefully.

        Args:
            timeout: Maximum time to wait for graceful shutdown
        """
        if not self.is_running():
            logger.warning("FileWatcher is not running")
            return

        logger.info("Stopping FileWatcher...")

        # Signal stop and wait for observer thread
        self._stop_event.set()

        if self.observer:
            self.observer.stop()

        if self._observer_thread and self._observer_thread.is_alive():
            self._observer_thread.join(timeout=timeout)

            if self._observer_thread.is_alive():
                logger.warning("FileWatcher thread did not stop gracefully within timeout")
            else:
                logger.info("FileWatcher stopped gracefully")

        # Clean up
        self.observer = None
        self._observer_thread = None

    def is_running(self) -> bool:
        """
        Check if the file watcher is currently running.

        Returns:
            True if the watcher is running, False otherwise
        """
        return (
            self.observer is not None
            and self._observer_thread is not None
            and self._observer_thread.is_alive()
        )

    def _run_observer(self) -> None:
        """
        Internal method to run the observer in a separate thread.

        This method should not be called directly.
        """
        if not self.observer:
            logger.error("Observer not initialized")
            return

        try:
            self.observer.start()
            logger.info("Observer thread started")

            # Keep the observer running until stop is requested
            while not self._stop_event.is_set():
                self._stop_event.wait(timeout=1.0)

            logger.info("Observer thread stopping...")

        except Exception as e:
            logger.error(f"Observer thread error: {str(e)}")
        finally:
            if self.observer:
                self.observer.join()
                logger.info("Observer thread stopped")


def create_file_watcher(
    watch_directory: str,
    document_service: DocumentService,
    job_service: JobService
) -> FileWatcher:
    """
    Factory function to create a FileWatcher instance.

    Args:
        watch_directory: Directory path to monitor
        document_service: Service for document management
        job_service: Service for processing job management

    Returns:
        Configured FileWatcher instance
    """
    return FileWatcher(
        watch_directory=watch_directory,
        document_service=document_service,
        job_service=job_service
    )
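A minimal standalone sketch of driving the watcher; the services are assumed to be constructed as in main.py below, where the real wiring happens in the FastAPI lifespan:

import time

watcher = create_file_watcher("/watched_files", document_service, job_service)
watcher.start()
try:
    while True:
        time.sleep(1)  # keep the main thread alive while the observer thread runs
except KeyboardInterrupt:
    watcher.stop()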
@@ -1,203 +1,169 @@
|
||||
"""
|
||||
FastAPI application for MyDocManager file processor service.
|
||||
FastAPI application with integrated FileWatcher for document processing.
|
||||
|
||||
This service provides API endpoints for health checks and task dispatching.
|
||||
This module provides the main FastAPI application with:
|
||||
- JWT authentication
|
||||
- User management APIs
|
||||
- Real-time file monitoring via FileWatcher
|
||||
- Document processing via Celery tasks
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
import redis
|
||||
from celery import Celery
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from app.database.connection import test_database_connection, get_database
|
||||
from app.database.repositories.user_repository import UserRepository
|
||||
from app.models.user import UserCreate
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from app.api.routes.auth import router as auth_router
|
||||
from app.api.routes.users import router as users_router
|
||||
from app.config import settings
|
||||
from app.database.connection import get_database
|
||||
from app.file_watcher import create_file_watcher, FileWatcher
|
||||
from app.services.document_service import DocumentService
|
||||
from app.services.init_service import InitializationService
|
||||
from app.services.job_service import JobService
|
||||
from app.services.user_service import UserService
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Global file watcher instance
|
||||
file_watcher: FileWatcher = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Application lifespan manager for startup and shutdown tasks.

    Handles initialization tasks that need to run when the application starts,
    including admin user creation and other setup procedures.
    FastAPI lifespan context manager.

    Handles application startup and shutdown events including:
    - Database connection
    - Default admin user creation
    - FileWatcher startup/shutdown
    """
    # Startup tasks
    global file_watcher

    # Startup
    logger.info("Starting MyDocManager application...")

    try:
        # Initialize database connection
        database = get_database()
        logger.info("Database connection established")

        # Initialize repositories and services
        user_repository = UserRepository(database)
        user_service = UserService(user_repository)
        document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
        job_service = JobService(database=database)
        user_service = UserService(database=database)
        logger.info("Service created")

        # Create default admin user
        init_service = InitializationService(user_service)
        init_service.initialize_application()
        logger.info("Default admin user initialization completed")

        # Run initialization tasks
        initialization_result = init_service.initialize_application()
        # Create and start file watcher
        file_watcher = create_file_watcher(
            watch_directory=settings.watch_directory(),
            document_service=document_service,
            job_service=job_service
        )
        file_watcher.start()
        logger.info(f"FileWatcher started for directory: {settings.watch_directory()}")

        if initialization_result["initialization_success"]:
            logger.info("Application startup completed successfully")
            if initialization_result["admin_user_created"]:
                logger.info("Default admin user was created during startup")
        else:
            logger.error("Application startup completed with errors:")
            for error in initialization_result["errors"]:
                logger.error(f" - {error}")
        logger.info("Application startup completed successfully")

        yield

    except Exception as e:
        logger.error(f"Critical error during application startup: {str(e)}")
        # You might want to decide if the app should continue or exit here
        # For now, we log the error but continue
        logger.error(f"Application startup failed: {str(e)}")
        raise

    yield  # Application is running

    # Shutdown tasks (if needed)
    logger.info("Shutting down MyDocManager application...")
    finally:
        # Shutdown
        logger.info("Shutting down MyDocManager application...")

        if file_watcher and file_watcher.is_running():
            file_watcher.stop()
            logger.info("FileWatcher stopped")

        logger.info("Application shutdown completed")

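Stripped of the service wiring, the startup/shutdown shape used here is FastAPI's lifespan pattern. A minimal sketch, with a placeholder standing in for the watcher resource:

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Startup: acquire long-lived resources before serving requests
        watcher = object()  # stand-in for create_file_watcher(...)
        try:
            yield  # the application serves requests while suspended here
        finally:
            # Shutdown: always release resources, even after a failure
            del watcher

    app = FastAPI(lifespan=lifespan)
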
# Initialize FastAPI app
# Create FastAPI application
app = FastAPI(
    title="MyDocManager File Processor",
    description="File processing and task dispatch service",
    version="1.0.0",
    title="MyDocManager",
    description="Real-time document processing application with authentication",
    version="0.1.0",
    lifespan=lifespan
)

# Environment variables
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")

# Initialize Redis client
try:
    redis_client = redis.from_url(REDIS_URL)
except Exception as e:
    redis_client = None
    print(f"Warning: Could not connect to Redis: {e}")

# Initialize Celery
celery_app = Celery(
    "file_processor",
    broker=REDIS_URL,
    backend=REDIS_URL
)

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],  # React frontend
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers
app.include_router(auth_router, prefix="/auth", tags=["Authentication"])
app.include_router(users_router, prefix="/users", tags=["User Management"])
# app.include_router(documents_router, prefix="/documents", tags=["Documents"])
# app.include_router(jobs_router, prefix="/jobs", tags=["Processing Jobs"])

# Pydantic models
class TestTaskRequest(BaseModel):
    """Request model for test task."""
    message: str

def get_user_service() -> UserService:
    """
    Dependency to get user service instance.

    This should be properly implemented with database connection management
    in your actual application.
    """
    database = get_database()
    user_repository = UserRepository(database)
    return UserService(user_repository)


# Your API routes would use the service like this:
@app.post("/api/users")
async def create_user(
    user_data: UserCreate,
    user_service: UserService = Depends(get_user_service)
):
    return user_service.create_user(user_data)

@app.get("/health")
|
||||
async def health_check():
|
||||
"""
|
||||
Health check endpoint.
|
||||
|
||||
|
||||
Returns:
|
||||
dict: Service health status with dependencies
|
||||
Dictionary containing application health status
|
||||
"""
|
||||
health_status = {
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": "file-processor",
|
||||
"dependencies": {
|
||||
"redis": "unknown",
|
||||
"mongodb": "unknown"
|
||||
},
|
||||
"service": "MyDocManager",
|
||||
"version": "1.0.0",
|
||||
"file_watcher_running": file_watcher.is_running() if file_watcher else False
|
||||
}
|
||||
|
||||
# Check Redis connection
|
||||
if redis_client:
|
||||
try:
|
||||
redis_client.ping()
|
||||
health_status["dependencies"]["redis"] = "connected"
|
||||
except Exception:
|
||||
health_status["dependencies"]["redis"] = "disconnected"
|
||||
health_status["status"] = "degraded"
|
||||
|
||||
# check MongoDB connection
|
||||
if test_database_connection():
|
||||
health_status["dependencies"]["mongodb"] = "connected"
|
||||
else:
|
||||
health_status["dependencies"]["mongodb"] = "disconnected"
|
||||
|
||||
return health_status
|
||||
|
||||
|
||||
@app.post("/test-task")
|
||||
async def dispatch_test_task(request: TestTaskRequest):
|
||||
"""
|
||||
Dispatch a test task to Celery worker.
|
||||
|
||||
Args:
|
||||
request: Test task request containing message
|
||||
|
||||
Returns:
|
||||
dict: Task dispatch information
|
||||
|
||||
Raises:
|
||||
HTTPException: If task dispatch fails
|
||||
"""
|
||||
try:
|
||||
# Send task to worker
|
||||
task = celery_app.send_task(
|
||||
"main.test_task",
|
||||
args=[request.message]
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "dispatched",
|
||||
"task_id": task.id,
|
||||
"message": f"Test task dispatched with message: {request.message}"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Failed to dispatch task: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""
|
||||
Root endpoint.
|
||||
|
||||
Root endpoint with basic application information.
|
||||
|
||||
Returns:
|
||||
dict: Basic service information
|
||||
Dictionary containing welcome message and available endpoints
|
||||
"""
|
||||
return {
|
||||
"service": "MyDocManager File Processor",
|
||||
"version": "1.0.0",
|
||||
"status": "running"
|
||||
"message": "Welcome to MyDocManager",
|
||||
"description": "Real-time document processing application",
|
||||
"docs": "/docs",
|
||||
"health": "/health"
|
||||
}
|
||||
|
||||
|
||||
@app.get("/watcher/status")
|
||||
async def watcher_status():
|
||||
"""
|
||||
Get file watcher status.
|
||||
|
||||
Returns:
|
||||
Dictionary containing file watcher status information
|
||||
"""
|
||||
if not file_watcher:
|
||||
return {
|
||||
"status": "not_initialized",
|
||||
"running": False
|
||||
}
|
||||
|
||||
return {
|
||||
"status": "initialized",
|
||||
"running": file_watcher.is_running(),
|
||||
"watch_directory": str(file_watcher.watch_directory),
|
||||
"recursive": file_watcher.recursive
|
||||
}
|
||||
|
||||
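
Note that dispatch goes through send_task with a task name string, so the API container never imports the worker code. A standalone sketch of the same call (broker URL mirrors the default above; everything else is illustrative):

    from celery import Celery

    celery_app = Celery("client", broker="redis://localhost:6379/0",
                        backend="redis://localhost:6379/0")

    # Dispatch by name: no import of the worker module is needed,
    # only the registered task name must match.
    result = celery_app.send_task("main.test_task", args=["hello"])
    print(result.id)     # task UUID, as returned to the API caller
    print(result.state)  # PENDING/STARTED/SUCCESS once a worker picks it up
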
@@ -3,12 +3,45 @@ Authentication models and enums for user management.

Contains user roles enumeration and authentication-related Pydantic models.
"""

from datetime import datetime
from enum import Enum

from pydantic import BaseModel, Field

from app.models.types import PyObjectId


class UserRole(str, Enum):
    """User roles enumeration with string values."""

    USER = "user"
    ADMIN = "admin"


class UserResponse(BaseModel):
    """Model for user data in API responses (excludes password_hash)."""

    id: PyObjectId = Field(alias="_id")
    username: str
    email: str
    role: UserRole
    is_active: bool
    created_at: datetime
    updated_at: datetime

    model_config = {
        "populate_by_name": True,
        "arbitrary_types_allowed": True,
    }


class LoginResponse(BaseModel):
    """Response model for successful login."""
    access_token: str
    token_type: str = "bearer"
    user: UserResponse


class MessageResponse(BaseModel):
    """Generic message response."""
    message: str

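The alias plus populate_by_name pairing is what lets these response models be built directly from raw Mongo documents. A simplified, self-contained illustration using a plain str id instead of PyObjectId:

    from pydantic import BaseModel, Field

    class UserOut(BaseModel):
        # Mongo stores the key as "_id"; the alias maps it onto "id"
        id: str = Field(alias="_id")
        username: str
        model_config = {"populate_by_name": True}

    doc = {"_id": "68c9...", "username": "admin"}  # shape of a Mongo document
    user = UserOut.model_validate(doc)             # populated via the alias
    print(user.id, user.username)
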
@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
    HYBRID = "hybrid"


class ProcessingStatus(str, Enum):
    """Status values for processing jobs."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class FileDocument(BaseModel):
    """
    Model for file documents stored in the 'files' collection.
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
    metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
    detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
    file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
    encoding: str = Field(default="utf-8", description="Character encoding for text files")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., description="MIME type detected")

    @field_validator('filepath')
    @classmethod
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
        if not v.strip():
            raise ValueError("Filename cannot be empty")
        return v.strip()

    class Config:
        """Pydantic configuration."""
        populate_by_name = True
        arbitrary_types_allowed = True
        json_encoders = {ObjectId: str}


class DocumentContent(BaseModel):
    """Model for document content."""

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
    content: str = Field(..., description="File content")
    encoding: str = Field(default="utf-8", description="Character encoding for text files")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., description="MIME type detected")


class ProcessingJob(BaseModel):
    """
    Model for processing jobs stored in the 'processing_jobs' collection.

    Tracks the lifecycle and status of document processing tasks.
    """

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    file_id: PyObjectId = Field(..., description="Reference to file document")
    status: ProcessingStatus = Field(
        default=ProcessingStatus.PENDING,
        description="Current processing status"
    )
    task_id: Optional[str] = Field(
        default=None,
        description="Celery task UUID"
    )
    created_at: Optional[datetime] = Field(
        default=None,
        description="Timestamp when job was created"
    )
    started_at: Optional[datetime] = Field(
        default=None,
        description="Timestamp when processing started"
    )
    completed_at: Optional[datetime] = Field(
        default=None,
        description="Timestamp when processing completed"
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Error message if processing failed"
    )

    @field_validator('error_message')
    @classmethod
    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
        """Clean up error message."""
        if v is not None:
            return v.strip() if v.strip() else None
        return v

    class Config:
        """Pydantic configuration."""
        populate_by_name = True
        arbitrary_types_allowed = True
        json_encoders = {ObjectId: str}

@@ -0,0 +1,42 @@
from datetime import datetime
from enum import Enum
from typing import Optional

from bson import ObjectId
from pydantic import BaseModel, Field, field_validator

from app.models.types import PyObjectId


class ProcessingStatus(str, Enum):
    """Status values for processing jobs."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class ProcessingJob(BaseModel):
    """
    Model for processing jobs stored in the 'processing_jobs' collection.

    Tracks the lifecycle and status of document processing tasks.
    """

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    document_id: PyObjectId = Field(..., description="Reference to file document")
    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")

    @field_validator('error_message')
    @classmethod
    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
        """Clean up error message."""
        if v is not None:
            return v.strip() if v.strip() else None
        return v
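
The error_message validator normalizes whitespace-only messages to None. A simplified standalone check of that behavior, with the Mongo-specific fields dropped:

    from typing import Optional
    from pydantic import BaseModel, field_validator

    class Job(BaseModel):
        error_message: Optional[str] = None

        @field_validator("error_message")
        @classmethod
        def clean(cls, v: Optional[str]) -> Optional[str]:
            # Strip padding; collapse whitespace-only messages to None
            if v is not None:
                return v.strip() if v.strip() else None
            return v

    print(Job(error_message="  boom  ").error_message)  # "boom"
    print(Job(error_message="   ").error_message)       # None
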
@@ -7,10 +7,10 @@ and API responses with proper validation and type safety.

import re
from datetime import datetime
from typing import Optional, Any
from typing import Optional

from bson import ObjectId
from pydantic import BaseModel, Field, field_validator, EmailStr
from pydantic_core import core_schema

from app.models.auth import UserRole
from app.models.types import PyObjectId
@@ -138,21 +138,3 @@ class UserInDB(BaseModel):
        "arbitrary_types_allowed": True,
        "json_encoders": {ObjectId: str}
    }


class UserResponse(BaseModel):
    """Model for user data in API responses (excludes password_hash)."""

    id: PyObjectId = Field(alias="_id")
    username: str
    email: str
    role: UserRole
    is_active: bool
    created_at: datetime
    updated_at: datetime

    model_config = {
        "populate_by_name": True,
        "arbitrary_types_allowed": True,
        "json_encoders": {ObjectId: str}
    }

@@ -4,7 +4,11 @@ Authentication service for password hashing and verification.
This module provides authentication-related functionality including
password hashing, verification, and JWT token management.
"""
from datetime import datetime, timedelta, timezone

import jwt

from app.config import settings
from app.utils.security import hash_password, verify_password


@@ -55,4 +59,26 @@ class AuthService:
        >>> auth.verify_user_password("wrongpassword", hashed)
        False
        """
        return verify_password(password, hashed_password)

    @staticmethod
    def create_access_token(data: dict) -> str:
        """
        Create a JWT access token.

        Args:
            data (dict): Payload data to include in the token.

        Returns:
            str: Encoded JWT token.
        """
        # Copy data to avoid modifying the original dict
        to_encode = data.copy()

        # Add expiration time (timezone-aware UTC, so "exp" is interpreted correctly)
        expire = datetime.now(timezone.utc) + timedelta(hours=settings.get_jwt_expire_hours())
        to_encode.update({"exp": expire})

        # Encode JWT
        encoded_jwt = jwt.encode(to_encode, settings.get_jwt_secret_key(), algorithm=settings.get_jwt_algorithm())
        return encoded_jwt
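
For reference, the encode/decode round trip such a token goes through with PyJWT, as a standalone sketch (the secret and claims are illustrative):

    import datetime
    import jwt  # PyJWT

    SECRET = "change-me"  # illustrative only

    claims = {"sub": "admin",
              "exp": datetime.datetime.now(datetime.timezone.utc)
                     + datetime.timedelta(hours=1)}
    token = jwt.encode(claims, SECRET, algorithm="HS256")

    # Decoding verifies the signature and the "exp" claim in one step;
    # an expired or tampered token raises jwt.InvalidTokenError.
    payload = jwt.decode(token, SECRET, algorithms=["HS256"])
    print(payload["sub"])  # "admin"
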
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
"""

import hashlib
import magic
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
from typing import List, Optional, Dict, Any

from motor.motor_asyncio import AsyncIOMotorClientSession
import magic
from pymongo.errors import PyMongoError

from app.database.connection import get_database
from app.config.settings import get_objects_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import (
    FileDocument,
    DocumentContent,
    FileType,
    ProcessingStatus
)
from app.models.types import PyObjectId

@@ -34,13 +31,25 @@ class DocumentService:
    and their content while ensuring data consistency through transactions.
    """

    def __init__(self):
        """Initialize the document service with repository dependencies."""
        self.db = get_database()
        self.file_repository = FileDocumentRepository(self.db)
        self.content_repository = DocumentContentRepository(self.db)
    def __init__(self, database, objects_folder: str = None):
        """
        Initialize the document service with repository dependencies.

        Args:
            database: Database instance
            objects_folder: Folder where file contents are stored under their hash
        """
        self.db = database
        self.document_repository = FileDocumentRepository(self.db)
        self.objects_folder = objects_folder or get_objects_folder()

    def _calculate_file_hash(self, file_bytes: bytes) -> str:
    def initialize(self):
        self.document_repository.initialize()
        return self

    @staticmethod
    def _calculate_file_hash(file_bytes: bytes) -> str:
        """
        Calculate SHA256 hash of file content.

@@ -52,7 +61,8 @@ class DocumentService:
        """
        return hashlib.sha256(file_bytes).hexdigest()

    def _detect_file_type(self, file_path: str) -> FileType:
    @staticmethod
    def _detect_file_type(file_path: str) -> FileType:
        """
        Detect file type from file extension.

@@ -72,7 +82,8 @@ class DocumentService:
        except ValueError:
            raise ValueError(f"Unsupported file type: {extension}")

    def _detect_mime_type(self, file_bytes: bytes) -> str:
    @staticmethod
    def _detect_mime_type(file_bytes: bytes) -> str:
        """
        Detect MIME type from file content.

@@ -84,10 +95,51 @@ class DocumentService:
        """
        return magic.from_buffer(file_bytes, mime=True)

    async def create_document(
    @staticmethod
    def _read_file_bytes(file_path: str | Path) -> bytes:
        """
        Read file content as bytes.

        Args:
            file_path (str | Path): Path of the file to read

        Returns:
            bytes: Content of the file

        Raises:
            FileNotFoundError: If the file does not exist
            OSError: If any I/O error occurs
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        return path.read_bytes()

    def _get_document_path(self, file_hash):
        """
        Build the on-disk storage path for a file hash.

        :param file_hash: SHA256 hash of the file content
        :return: Path inside objects_folder, sharded by the first 24 hex characters of the hash
        """
        return os.path.join(self.objects_folder, file_hash[:24], file_hash)

    def save_content_if_needed(self, file_hash, content: bytes):
        target_path = self._get_document_path(file_hash)
        if os.path.exists(target_path):
            return

        if not os.path.exists(os.path.dirname(target_path)):
            os.makedirs(os.path.dirname(target_path))

        with open(target_path, "wb") as f:
            f.write(content)

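save_content_if_needed implements a small content-addressed object store: the hash both names the blob and deduplicates it, and the 24-character prefix shards the directory tree. A self-contained sketch of the same layout (the objects folder path is illustrative):

    import hashlib
    import os

    OBJECTS = "/tmp/objects"  # illustrative objects folder

    def store(content: bytes) -> str:
        file_hash = hashlib.sha256(content).hexdigest()
        # Shard by hash prefix so one directory never holds every object
        path = os.path.join(OBJECTS, file_hash[:24], file_hash)
        if not os.path.exists(path):  # identical content is written once
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "wb") as f:
                f.write(content)
        return file_hash

    h = store(b"hello")
    print(h, store(b"hello") == h)  # second call is a no-op, same hash
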
    def create_document(
            self,
            file_path: str,
            file_bytes: bytes,
            file_bytes: bytes | None = None,
            encoding: str = "utf-8"
    ) -> FileDocument:
        """
@@ -110,57 +162,40 @@ class DocumentService:
            PyMongoError: If database operation fails
        """
        # Calculate automatic attributes
        file_bytes = file_bytes if file_bytes is not None else self._read_file_bytes(file_path)
        file_hash = self._calculate_file_hash(file_bytes)
        file_type = self._detect_file_type(file_path)
        mime_type = self._detect_mime_type(file_bytes)
        file_size = len(file_bytes)
        filename = Path(file_path).name
        detected_at = datetime.utcnow()
        detected_at = datetime.now()

        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
                try:
                    # Check if content already exists
                    existing_content = await self.content_repository.find_document_content_by_file_hash(
                        file_hash, session=session
                    )

                    # Create DocumentContent if it doesn't exist
                    if not existing_content:
                        content_data = DocumentContent(
                            file_hash=file_hash,
                            content="",  # Will be populated by processing workers
                            encoding=encoding,
                            file_size=file_size,
                            mime_type=mime_type
                        )
                        await self.content_repository.create_document_content(
                            content_data, session=session
                        )

                    # Create FileDocument
                    file_data = FileDocument(
                        filename=filename,
                        filepath=file_path,
                        file_type=file_type,
                        extraction_method=None,  # Will be set by processing workers
                        metadata={},  # Empty for now
                        detected_at=detected_at,
                        file_hash=file_hash
                    )

                    created_file = await self.file_repository.create_document(
                        file_data, session=session
                    )

                    return created_file

                except Exception as e:
                    # Transaction will automatically rollback
                    raise PyMongoError(f"Failed to create document: {str(e)}")
        try:
            self.save_content_if_needed(file_hash, file_bytes)

            # Create FileDocument
            file_data = FileDocument(
                filename=filename,
                filepath=file_path,
                file_type=file_type,
                extraction_method=None,  # Will be set by processing workers
                metadata={},  # Empty for now
                detected_at=detected_at,
                file_hash=file_hash,
                encoding=encoding,
                file_size=file_size,
                mime_type=mime_type
            )

            created_file = self.document_repository.create_document(file_data)

            return created_file

        except Exception as e:
            # Transaction will automatically rollback if supported
            raise PyMongoError(f"Failed to create document: {str(e)}")
    async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
    def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
        """
        Retrieve a document by its ID.

@@ -170,9 +205,9 @@ class DocumentService:
        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_id(document_id)
        return self.document_repository.find_document_by_id(str(document_id))

    async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
    def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file hash.

@@ -182,9 +217,9 @@ class DocumentService:
        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_hash(file_hash)
        return self.document_repository.find_document_by_hash(file_hash)

    async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
    def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
        """
        Retrieve a document by its file path.

@@ -194,34 +229,17 @@ class DocumentService:
        Returns:
            FileDocument if found, None otherwise
        """
        return await self.file_repository.find_document_by_filepath(filepath)
        return self.document_repository.find_document_by_filepath(filepath)

    async def get_document_with_content(
        self,
        document_id: PyObjectId
    ) -> Optional[Tuple[FileDocument, DocumentContent]]:
        """
        Retrieve a document with its associated content.

        Args:
            document_id: Document ObjectId

        Returns:
            Tuple of (FileDocument, DocumentContent) if found, None otherwise
        """
        document = await self.get_document_by_id(document_id)
        if not document:
    def get_document_content_by_hash(self, file_hash):
        target_path = self._get_document_path(file_hash)
        if not os.path.exists(target_path):
            return None

        content = await self.content_repository.find_document_content_by_file_hash(
            document.file_hash
        )
        if not content:
            return None

        return (document, content)
        with open(target_path, "rb") as f:
            return f.read()

    async def list_documents(
    def list_documents(
        self,
        skip: int = 0,
        limit: int = 100
@@ -236,18 +254,18 @@ class DocumentService:
        Returns:
            List of FileDocument instances
        """
        return await self.file_repository.list_documents(skip=skip, limit=limit)
        return self.document_repository.list_documents(skip=skip, limit=limit)

    async def count_documents(self) -> int:
    def count_documents(self) -> int:
        """
        Get total number of documents.

        Returns:
            Total document count
        """
        return await self.file_repository.count_documents()
        return self.document_repository.count_documents()

    async def update_document(
    def update_document(
        self,
        document_id: PyObjectId,
        update_data: Dict[str, Any]
@@ -262,9 +280,14 @@ class DocumentService:
        Returns:
            Updated FileDocument if found, None otherwise
        """
        return await self.file_repository.update_document(document_id, update_data)
        if "file_bytes" in update_data:
            file_hash = self._calculate_file_hash(update_data["file_bytes"])
            update_data["file_hash"] = file_hash
            self.save_content_if_needed(file_hash, update_data["file_bytes"])

        return self.document_repository.update_document(document_id, update_data)
    async def delete_document(self, document_id: PyObjectId) -> bool:
    def delete_document(self, document_id: PyObjectId) -> bool:
        """
        Delete a document and its orphaned content.

@@ -281,100 +304,31 @@ class DocumentService:
        Raises:
            PyMongoError: If database operation fails
        """
        # Start MongoDB transaction
        async with await self.db.client.start_session() as session:
            async with session.start_transaction():
        # Start transaction

        try:
            # Get document to find its hash
            document = self.document_repository.find_document_by_id(document_id)
            if not document:
                return False

            # Delete the document
            deleted = self.document_repository.delete_document(document_id)
            if not deleted:
                return False

            # Check if content is orphaned
            remaining_files = self.document_repository.find_document_by_hash(document.file_hash)

            # If no other files reference this content, delete it
            if not remaining_files:
                try:
                    # Get document to find its hash
                    document = await self.file_repository.find_document_by_id(
                        document_id, session=session
                    )
                    if not document:
                        return False

                    # Delete the document
                    deleted = await self.file_repository.delete_document(
                        document_id, session=session
                    )
                    if not deleted:
                        return False

                    # Check if content is orphaned
                    remaining_files = await self.file_repository.find_document_by_hash(
                        document.file_hash, session=session
                    )

                    # If no other files reference this content, delete it
                    if not remaining_files:
                        content = await self.content_repository.find_document_content_by_file_hash(
                            document.file_hash, session=session
                        )
                        if content:
                            await self.content_repository.delete_document_content(
                                content.id, session=session
                            )

                    return True

                except Exception as e:
                    # Transaction will automatically rollback
                    raise PyMongoError(f"Failed to delete document: {str(e)}")

    async def content_exists(self, file_hash: str) -> bool:
        """
        Check if content with given hash exists.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            True if content exists, False otherwise
        """
        return await self.content_repository.content_exists(file_hash)

    async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
        """
        Retrieve content by file hash.

        Args:
            file_hash: SHA256 hash of file content

        Returns:
            DocumentContent if found, None otherwise
        """
        return await self.content_repository.find_document_content_by_file_hash(file_hash)

    async def update_document_content(
        self,
        file_hash: str,
        content: str,
        encoding: str = "utf-8"
    ) -> Optional[DocumentContent]:
        """
        Update the extracted content for a document.

        This method is typically called by processing workers to store
        the extracted text content.

        Args:
            file_hash: SHA256 hash of file content
            content: Extracted text content
            encoding: Character encoding

        Returns:
            Updated DocumentContent if found, None otherwise
        """
        existing_content = await self.content_repository.find_document_content_by_file_hash(
            file_hash
        )
        if not existing_content:
            return None
                    os.remove(self._get_document_path(document.file_hash))
                except Exception:
                    pass

            return True

        update_data = {
            "content": content,
            "encoding": encoding
        }

        return await self.content_repository.update_document_content(
            existing_content.id, update_data
        )
        except Exception as e:
            # Transaction will automatically rollback if supported
            raise PyMongoError(f"Failed to delete document: {str(e)}")
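
The interleaved old and new lines above are hard to read; distilled, the replacement synchronous delete flow appears to be roughly the following (a sketch inferred from the added lines, with the repository and objects folder injected as parameters, not a verbatim excerpt):

    import os
    from pymongo.errors import PyMongoError

    def delete_document(repo, objects_folder: str, document_id) -> bool:
        """Distilled view of the new synchronous delete flow (inferred)."""
        try:
            document = repo.find_document_by_id(document_id)
            if not document:
                return False
            if not repo.delete_document(document_id):
                return False
            # Blob cleanup: only when no other file record shares the hash
            if not repo.find_document_by_hash(document.file_hash):
                try:
                    os.remove(os.path.join(objects_folder,
                                           document.file_hash[:24],
                                           document.file_hash))
                except Exception:
                    pass  # a missing blob should not fail the delete
            return True
        except Exception as e:
            raise PyMongoError(f"Failed to delete document: {str(e)}")
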
@@ -8,8 +8,8 @@ creating default admin user if none exists.
import logging
from typing import Optional

from app.models.user import UserCreate, UserInDB, UserCreateNoValidation
from app.models.auth import UserRole
from app.models.user import UserInDB, UserCreateNoValidation
from app.services.user_service import UserService

logger = logging.getLogger(__name__)
@@ -31,7 +31,6 @@ class InitializationService:
            user_service (UserService): Service for user operations
        """
        self.user_service = user_service


    def ensure_admin_user_exists(self) -> Optional[UserInDB]:
        """
@@ -131,4 +130,23 @@ class InitializationService:
                logger.error(error_msg)
                initialization_summary["errors"].append(error_msg)

        return initialization_summary
        self.log_initialization_result(initialization_summary)

        return initialization_summary

    @staticmethod
    def log_initialization_result(summary: dict) -> None:
        """
        Log the result of the initialization process.

        Args:
            summary (dict): Summary of initialization tasks performed
        """
        if summary["initialization_success"]:
            logger.info("Application startup completed successfully")
            if summary["admin_user_created"]:
                logger.info("Default admin user was created during startup")
        else:
            logger.error("Application startup completed with errors:")
            for error in summary["errors"]:
                logger.error(f" - {error}")

182
src/file-processor/app/services/job_service.py
Normal file
@@ -0,0 +1,182 @@
"""
|
||||
Service layer for job processing business logic.
|
||||
|
||||
This module provides high-level operations for managing processing jobs
|
||||
with strict status transition validation and business rules enforcement.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from app.database.repositories.job_repository import JobRepository
|
||||
from app.exceptions.job_exceptions import InvalidStatusTransitionError
|
||||
from app.models.job import ProcessingJob, ProcessingStatus
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
class JobService:
|
||||
"""
|
||||
Service for processing job business logic operations.
|
||||
|
||||
Provides high-level job management with strict status transition
|
||||
validation and business rule enforcement.
|
||||
"""
|
||||
|
||||
def __init__(self, database):
|
||||
"""
|
||||
Initialize service with job repository.
|
||||
|
||||
Args:
|
||||
repository: Optional JobRepository instance (creates default if None)
|
||||
"""
|
||||
self.db = database
|
||||
self.repository = JobRepository(database)
|
||||
|
||||
def initialize(self):
|
||||
self.repository.initialize()
|
||||
return self
|
||||
|
||||
def create_job(self, document_id: PyObjectId, task_id: Optional[str] = None) -> ProcessingJob:
|
||||
"""
|
||||
Create a new processing job.
|
||||
|
||||
Args:
|
||||
document_id: Reference to the file document
|
||||
task_id: Optional Celery task UUID
|
||||
|
||||
Returns:
|
||||
The created ProcessingJob
|
||||
|
||||
Raises:
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
return self.repository.create_job(document_id, task_id)
|
||||
|
||||
def get_job_by_id(self, job_id: PyObjectId) -> ProcessingJob:
|
||||
"""
|
||||
Retrieve a job by its ID.
|
||||
|
||||
Args:
|
||||
job_id: The job ObjectId
|
||||
|
||||
Returns:
|
||||
The ProcessingJob document
|
||||
|
||||
Raises:
|
||||
JobNotFoundError: If job doesn't exist
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
return self.repository.find_job_by_id(job_id)
|
||||
|
||||
def mark_job_as_started(self, job_id: PyObjectId) -> ProcessingJob:
|
||||
"""
|
||||
Mark a job as started (PENDING → PROCESSING).
|
||||
|
||||
Args:
|
||||
job_id: The job ObjectId
|
||||
|
||||
Returns:
|
||||
The updated ProcessingJob
|
||||
|
||||
Raises:
|
||||
JobNotFoundError: If job doesn't exist
|
||||
InvalidStatusTransitionError: If job is not in PENDING status
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
# Get current job to validate transition
|
||||
current_job = self.repository.find_job_by_id(job_id)
|
||||
|
||||
# Validate status transition
|
||||
if current_job.status != ProcessingStatus.PENDING:
|
||||
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.PROCESSING)
|
||||
|
||||
# Update status
|
||||
return self.repository.update_job_status(job_id, ProcessingStatus.PROCESSING)
|
||||
|
||||
def mark_job_as_completed(self, job_id: PyObjectId) -> ProcessingJob:
|
||||
"""
|
||||
Mark a job as completed (PROCESSING → COMPLETED).
|
||||
|
||||
Args:
|
||||
job_id: The job ObjectId
|
||||
|
||||
Returns:
|
||||
The updated ProcessingJob
|
||||
|
||||
Raises:
|
||||
JobNotFoundError: If job doesn't exist
|
||||
InvalidStatusTransitionError: If job is not in PROCESSING status
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
# Get current job to validate transition
|
||||
current_job = self.repository.find_job_by_id(job_id)
|
||||
|
||||
# Validate status transition
|
||||
if current_job.status != ProcessingStatus.PROCESSING:
|
||||
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.COMPLETED)
|
||||
|
||||
# Update status
|
||||
return self.repository.update_job_status(job_id, ProcessingStatus.COMPLETED)
|
||||
|
||||
def mark_job_as_failed(
|
||||
self,
|
||||
job_id: PyObjectId,
|
||||
error_message: Optional[str] = None
|
||||
) -> ProcessingJob:
|
||||
"""
|
||||
Mark a job as failed (PROCESSING → FAILED).
|
||||
|
||||
Args:
|
||||
job_id: The job ObjectId
|
||||
error_message: Optional error description
|
||||
|
||||
Returns:
|
||||
The updated ProcessingJob
|
||||
|
||||
Raises:
|
||||
JobNotFoundError: If job doesn't exist
|
||||
InvalidStatusTransitionError: If job is not in PROCESSING status
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
# Get current job to validate transition
|
||||
current_job = self.repository.find_job_by_id(job_id)
|
||||
|
||||
# Validate status transition
|
||||
if current_job.status != ProcessingStatus.PROCESSING:
|
||||
raise InvalidStatusTransitionError(current_job.status, ProcessingStatus.FAILED)
|
||||
|
||||
# Update status with error message
|
||||
return self.repository.update_job_status(
|
||||
job_id,
|
||||
ProcessingStatus.FAILED,
|
||||
error_message
|
||||
)
|
||||
|
||||
def delete_job(self, job_id: PyObjectId) -> bool:
|
||||
"""
|
||||
Delete a job from the database.
|
||||
|
||||
Args:
|
||||
job_id: The job ObjectId
|
||||
|
||||
Returns:
|
||||
True if job was deleted, False if not found
|
||||
|
||||
Raises:
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
return self.repository.delete_job(job_id)
|
||||
|
||||
def get_jobs_by_status(self, status: ProcessingStatus) -> list[ProcessingJob]:
|
||||
"""
|
||||
Retrieve all jobs with a specific status.
|
||||
|
||||
Args:
|
||||
status: The processing status to filter by
|
||||
|
||||
Returns:
|
||||
List of ProcessingJob documents
|
||||
|
||||
Raises:
|
||||
JobRepositoryError: If database operation fails
|
||||
"""
|
||||
return self.repository.get_jobs_by_status(status)
|
||||
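
Each mark_* method hard-codes its single legal predecessor state. The same rules can also be kept in one transition table, a common alternative shape for such guards (a sketch, not code from this commit):

    from enum import Enum

    class Status(str, Enum):
        PENDING = "pending"
        PROCESSING = "processing"
        COMPLETED = "completed"
        FAILED = "failed"

    # target state -> the only state allowed to precede it
    ALLOWED = {
        Status.PROCESSING: Status.PENDING,
        Status.COMPLETED: Status.PROCESSING,
        Status.FAILED: Status.PROCESSING,
    }

    def check_transition(current: Status, target: Status) -> None:
        if ALLOWED.get(target) != current:
            raise ValueError(f"illegal transition {current} -> {target}")

    check_transition(Status.PENDING, Status.PROCESSING)   # ok
    # check_transition(Status.PENDING, Status.COMPLETED)  # raises ValueError
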
@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
"""

from typing import Optional, List

from pymongo.errors import DuplicateKeyError

from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
from app.models.auth import UserRole
from app.database.repositories.user_repository import UserRepository
from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
from app.services.auth_service import AuthService


@@ -22,16 +22,21 @@ class UserService:
    authentication, and data management with proper validation.
    """

    def __init__(self, user_repository: UserRepository):
    def __init__(self, database):
        """
        Initialize user service with repository dependency.

        Args:
            database: Database instance used to build the UserRepository
        """
        self.user_repository = user_repository
        self.db = database
        self.user_repository = UserRepository(self.db)
        self.auth_service = AuthService()

    def initialize(self):
        self.user_repository.initialize()
        return self

    def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
        """
        Create a new user with business logic validation.

@@ -1,11 +1,14 @@
asgiref==3.9.1
bcrypt==4.3.0
celery==5.5.3
email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pymongo==4.15.0
pydantic==2.11.9
PyJWT==2.10.1
redis==6.4.0
uvicorn==0.35.0
python-magic==0.4.27
watchdog==6.0.0
41
src/frontend/.dockerignore
Normal file
@@ -0,0 +1,41 @@
# Dependencies
node_modules
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Build outputs
dist
build

# Environment files
.env.local
.env.development.local
.env.test.local
.env.production.local

# IDE files
.vscode
.idea
*.swp
*.swo

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Git
.git
.gitignore

# Docker
Dockerfile
.dockerignore

# Logs
*.log
20
src/frontend/Dockerfile
Normal file
@@ -0,0 +1,20 @@
# Use Node.js 20 Alpine for lightweight container
FROM node:20-alpine

# Set working directory
WORKDIR /app

# Copy package.json and package-lock.json (if available)
COPY package*.json ./

# Install dependencies
RUN npm install

# Copy source code
COPY . .

# Expose Vite default port
EXPOSE 5173

# Start development server with host 0.0.0.0 to accept external connections
CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"]
@@ -3,12 +3,18 @@ FROM python:3.12-slim
# Set working directory
WORKDIR /app

# Install libmagic
RUN apt-get update && apt-get install -y --no-install-recommends \
    libmagic1 \
    file \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY tasks/ .
COPY . .

# Command will be overridden by docker-compose
CMD ["celery", "-A", "main", "worker", "--loglevel=info"]

@@ -1,4 +1,13 @@
asgiref==3.9.1
bcrypt==4.3.0
celery==5.5.3
email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pymongo==4.15.0
pydantic==2.11.9
redis==6.4.0
uvicorn==0.35.0
python-magic==0.4.27
watchdog==6.0.0
85
src/worker/tasks/document_processing.py
Normal file
@@ -0,0 +1,85 @@
"""
|
||||
Celery tasks for document processing with ProcessingJob status management.
|
||||
|
||||
This module contains Celery tasks that handle document content extraction
|
||||
and update processing job statuses throughout the task lifecycle.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
|
||||
from app.config import settings
|
||||
from app.database.connection import get_database
|
||||
from app.services.document_service import DocumentService
|
||||
from tasks.main import celery_app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@celery_app.task(bind=True, autoretry_for=(Exception,), retry_kwargs={'max_retries': 3, 'countdown': 60})
|
||||
def process_document(self, filepath: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Process a document file and extract its content.
|
||||
|
||||
This task:
|
||||
1. Updates the processing job status to PROCESSING
|
||||
2. Performs document content extraction
|
||||
3. Updates job status to COMPLETED or FAILED based on result
|
||||
|
||||
Args:
|
||||
self : Celery task instance
|
||||
filepath: Full path to the document file to process
|
||||
|
||||
Returns:
|
||||
Dictionary containing processing results
|
||||
|
||||
Raises:
|
||||
Exception: Any processing error (will trigger retry)
|
||||
"""
|
||||
task_id = self.request.id
|
||||
logger.info(f"Starting document processing task {task_id} for file: {filepath}")
|
||||
|
||||
database = get_database()
|
||||
document_service = DocumentService(database=database, objects_folder=settings.get_objects_folder())
|
||||
from app.services.job_service import JobService
|
||||
job_service = JobService(database=database)
|
||||
|
||||
job = None
|
||||
try:
|
||||
# Step 1: Insert the document in DB
|
||||
document = document_service.create_document(filepath)
|
||||
logger.info(f"Job {task_id} created for document {document.id} with file path: {filepath}")
|
||||
|
||||
# Step 2: Create a new job record for the document
|
||||
job = job_service.create_job(task_id=task_id, document_id=document.id)
|
||||
|
||||
# Step 3: Mark job as started
|
||||
job_service.mark_job_as_started(job_id=job.id)
|
||||
logger.info(f"Job {task_id} marked as PROCESSING")
|
||||
|
||||
# Step 4: Mark job as completed
|
||||
job_service.mark_job_as_completed(job_id=job.id)
|
||||
logger.info(f"Job {task_id} marked as COMPLETED")
|
||||
|
||||
return {
|
||||
"task_id": task_id,
|
||||
"filepath": filepath,
|
||||
"status": "completed",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_message = f"Document processing failed: {str(e)}"
|
||||
logger.error(f"Task {task_id} failed: {error_message}")
|
||||
|
||||
try:
|
||||
# Mark job as failed
|
||||
if job is not None:
|
||||
job_service.mark_job_as_failed(job_id=job.id, error_message=error_message)
|
||||
logger.info(f"Job {task_id} marked as FAILED")
|
||||
else:
|
||||
logger.error(f"Failed to process {filepath}. error = {str(e)}")
|
||||
except Exception as job_error:
|
||||
logger.error(f"Failed to update job status for task {task_id}: {str(job_error)}")
|
||||
|
||||
# Re-raise the exception to trigger Celery retry mechanism
|
||||
raise
|
||||
|
||||
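
With autoretry_for=(Exception,), the bare raise at the end hands the failure back to Celery, which re-queues the task. A minimal standalone task with the same retry configuration (broker URL is illustrative):

    from celery import Celery

    app = Celery("demo", broker="redis://localhost:6379/0")

    @app.task(bind=True,
              autoretry_for=(Exception,),
              retry_kwargs={"max_retries": 3, "countdown": 60})
    def flaky(self):
        # Any exception raised here is caught by Celery and the task is
        # re-queued up to 3 times, 60 seconds apart, before failing for good.
        raise RuntimeError("transient failure")
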
@@ -3,9 +3,8 @@ Celery worker for MyDocManager document processing tasks.

This module contains all Celery tasks for processing documents.
"""

import os
import time

from celery import Celery

# Environment variables
@@ -13,101 +12,25 @@ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")

# Initialize Celery app
app = Celery(
celery_app = Celery(
    "mydocmanager_worker",
    broker=REDIS_URL,
    backend=REDIS_URL
    backend=REDIS_URL,
)

celery_app.autodiscover_tasks(["tasks.document_processing"])

# Celery configuration
app.conf.update(
celery_app.conf.update(
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_track_started=True,
    task_time_limit=300,  # 5 minutes
    task_soft_time_limit=240,  # 4 minutes
)


@app.task(bind=True)
def test_task(self, message: str):
    """
    Test task for validating worker functionality.

    Args:
        message: Test message to process

    Returns:
        dict: Task result with processing information
    """
    try:
        print(f"[WORKER] Starting test task with message: {message}")

        # Simulate some work
        for i in range(5):
            print(f"[WORKER] Processing step {i + 1}/5...")
            time.sleep(1)

            # Update task progress
            self.update_state(
                state="PROGRESS",
                meta={
                    "current": i + 1,
                    "total": 5,
                    "message": f"Processing step {i + 1}"
                }
            )

        result = {
            "status": "completed",
            "message": f"Successfully processed: {message}",
            "processed_at": time.time(),
            "worker_id": self.request.id
        }

        print(f"[WORKER] Test task completed successfully: {result}")
        return result

    except Exception as exc:
        print(f"[WORKER] Test task failed: {str(exc)}")
        raise self.retry(exc=exc, countdown=60, max_retries=3)


@app.task(bind=True)
def process_document_task(self, file_path: str):
    """
    Placeholder task for document processing.

    Args:
        file_path: Path to the document to process

    Returns:
        dict: Processing result
    """
    try:
        print(f"[WORKER] Starting document processing for: {file_path}")

        # Placeholder for document processing logic
        time.sleep(2)  # Simulate processing time

        result = {
            "status": "completed",
            "file_path": file_path,
            "processed_at": time.time(),
            "content": f"Placeholder content for {file_path}",
            "worker_id": self.request.id
        }

        print(f"[WORKER] Document processing completed: {file_path}")
        return result

    except Exception as exc:
        print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}")
        raise self.retry(exc=exc, countdown=60, max_retries=3)


if __name__ == "__main__":
    app.start()
    celery_app.start()