Refactored DocumentService to save document in the filesystem. Fixed docker application

This commit is contained in:
2025-09-20 21:06:27 +02:00
parent f1b551d243
commit 9564cfadd5
28 changed files with 1577 additions and 2442 deletions

View File

@@ -51,6 +51,15 @@ def get_jwt_secret_key() -> str:
raise ValueError("JWT_SECRET environment variable must be set in production")
return secret
def get_objects_folder() -> str:
"""
Get Vault path from environment variables.
Returns:
str: Vault path
"""
return os.getenv("OBJECTS_FOLDER", "/objects")
def get_jwt_algorithm() -> str:
"""

View File

@@ -10,7 +10,7 @@ from typing import Optional
from pymongo import MongoClient
from pymongo.database import Database
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
from app.config.settings import get_mongodb_url, get_mongodb_database_name
# Global variables for singleton pattern
@@ -18,7 +18,7 @@ _client: Optional[MongoClient] = None
_database: Optional[Database] = None
def create_mongodb_client() -> MongoClient:
def create_mongodb_client() -> AsyncIOMotorClient:
"""
Create MongoDB client with connection validation.
@@ -32,7 +32,7 @@ def create_mongodb_client() -> MongoClient:
try:
# Create client with short timeout for fail-fast behavior
client = MongoClient(
client = AsyncIOMotorClient(
mongodb_url,
serverSelectionTimeoutMS=5000, # 5 seconds timeout
connectTimeoutMS=5000,
@@ -107,6 +107,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
return _client
def get_extra_args(session):
# Build kwargs only if session is provided
kwargs = {}
if session is not None:
kwargs["session"] = session
return kwargs
def test_database_connection() -> bool:
"""
Test if database connection is working.
@@ -122,4 +131,4 @@ def test_database_connection() -> bool:
db.command('ping')
return True
except Exception:
return False
return False

View File

@@ -1,214 +0,0 @@
from typing import List, Optional
from datetime import datetime
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
from pymongo.errors import DuplicateKeyError, PyMongoError
from bson import ObjectId
from app.models.document import DocumentContent
class DocumentContentRepository:
"""
Repository class for document content CRUD operations in MongoDB.
This class handles all database operations related to document content,
following the repository pattern with dependency injection and async/await.
"""
def __init__(self, database: AsyncIOMotorDatabase):
"""
Initialize repository with database dependency.
Args:
database (AsyncIOMotorDatabase): MongoDB database instance
"""
self.db = database
self.collection: AsyncIOMotorCollection = database.document_contents
self._ensure_indexes()
async def initialize(self):
"""
Initialize repository by ensuring required indexes exist.
Should be called after repository instantiation to setup database indexes.
"""
await self._ensure_indexes()
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
Creates unique index on file_hash field to prevent duplicates.
"""
try:
await self.collection.create_index("file_hash", unique=True)
except PyMongoError:
# Index might already exist, ignore error
pass
async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
"""
Create a new document content in the database.
Args:
document_content (DocumentContent): Document content data
Returns:
DocumentContent: Created document content with database ID
Raises:
DuplicateKeyError: If file_hash already exists
ValueError: If document content creation fails due to validation
"""
document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)
# Remove _id if it's None to let MongoDB generate it
if document_dict.get("_id") is None:
document_dict.pop("_id", None)
try:
result = await self.collection.insert_one(document_dict)
document_dict["_id"] = result.inserted_id
return DocumentContent(**document_dict)
except DuplicateKeyError as e:
raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create document content: {e}")
async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
"""
Find document content by ID.
Args:
document_id (str): Document content ID to search for
Returns:
DocumentContent or None: Document content if found, None otherwise
"""
try:
if not ObjectId.is_valid(document_id):
return None
document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
if document_doc:
return DocumentContent(**document_doc)
return None
except PyMongoError:
return None
async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
"""
Find document content by file hash.
Args:
file_hash (str): File hash to search for
Returns:
DocumentContent or None: Document content if found, None otherwise
"""
try:
document_doc = await self.collection.find_one({"file_hash": file_hash})
if document_doc:
return DocumentContent(**document_doc)
return None
except PyMongoError:
return None
async def content_exists(self, file_hash: str) -> bool:
"""
Check if document content exists by file hash.
Args:
file_hash (str): File hash to check
Returns:
bool: True if document content exists, False otherwise
"""
try:
count = await self.collection.count_documents({"file_hash": file_hash})
return count > 0
except PyMongoError:
return False
async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
"""
Update document content information.
Args:
document_id (str): Document content ID to update
update_data (dict): Updated document content data
Returns:
DocumentContent or None: Updated document content if found, None otherwise
"""
try:
if not ObjectId.is_valid(document_id):
return None
# Remove None values and _id from update data
clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}
if not clean_update_data:
return await self.find_document_content_by_id(document_id)
result = await self.collection.find_one_and_update(
{"_id": ObjectId(document_id)},
{"$set": clean_update_data},
return_document=True
)
if result:
return DocumentContent(**result)
return None
except PyMongoError:
return None
async def delete_document_content(self, document_id: str) -> bool:
"""
Delete document content from database.
Args:
document_id (str): Document content ID to delete
Returns:
bool: True if document content was deleted, False otherwise
"""
try:
if not ObjectId.is_valid(document_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(document_id)})
return result.deleted_count > 0
except PyMongoError:
return False
async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
"""
List document contents with pagination.
Args:
skip (int): Number of document contents to skip (default: 0)
limit (int): Maximum number of document contents to return (default: 100)
Returns:
List[DocumentContent]: List of document contents
"""
try:
cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
document_docs = await cursor.to_list(length=limit)
return [DocumentContent(**document_doc) for document_doc in document_docs]
except PyMongoError:
return []
async def count_document_contents(self) -> int:
"""
Count total number of document contents.
Returns:
int: Total number of document contents in database
"""
try:
return await self.collection.count_documents({})
except PyMongoError:
return 0

View File

@@ -9,6 +9,8 @@ from typing import Optional, List
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
from app.database.connection import get_extra_args
from app.models.document import FileDocument
from app.utils.document_matching import fuzzy_matching, subsequence_matching
@@ -37,7 +39,7 @@ class FileDocumentRepository:
def __init__(self, database: AsyncIOMotorDatabase):
"""Initialize file repository with database connection."""
self.db = database
self.collection: AsyncIOMotorCollection = self.db.files
self.collection: AsyncIOMotorCollection = self.db.documents
self._ensure_indexes()
async def initialize(self):
@@ -47,6 +49,7 @@ class FileDocumentRepository:
Should be called after repository instantiation to setup database indexes.
"""
await self._ensure_indexes()
return self
async def _ensure_indexes(self):
"""
@@ -60,26 +63,27 @@ class FileDocumentRepository:
# Index might already exist, ignore error
pass
async def create_document(self, file_data: FileDocument) -> FileDocument:
async def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
"""
Create a new file document in database.
Args:
file_data (FileDocument): File document data to create
session (AsyncIOMotorClientSession, optional): MongoDB session
Returns:
FileDocument: Created file document with database ID
FileDocument: Created document with database ID
Raises:
ValueError: If file creation fails due to validation
DuplicateKeyError: If file with same hash already exists
DuplicateKeyError: If a document with same hash already exists
"""
try:
file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
if "_id" in file_dict and file_dict["_id"] is None:
del file_dict["_id"]
result = await self.collection.insert_one(file_dict)
result = await self.collection.insert_one(file_dict, **get_extra_args(session))
file_data.id = result.inserted_id
return file_data
@@ -204,13 +208,14 @@ class FileDocumentRepository:
except PyMongoError:
return 0
async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
async def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
"""
Update file document with new data.
Args:
file_id (str): File document ID to update
update_data (dict): Fields to update
session (AsyncIOMotorClientSession, optional): MongoDB session
Returns:
FileDocument or None: Updated file document if successful, None otherwise
@@ -228,7 +233,8 @@ class FileDocumentRepository:
result = await self.collection.find_one_and_update(
{"_id": ObjectId(file_id)},
{"$set": clean_update_data},
return_document=True
return_document=True,
**get_extra_args(session)
)
if result:
@@ -238,12 +244,13 @@ class FileDocumentRepository:
except PyMongoError:
return None
async def delete_document(self, file_id: str) -> bool:
async def delete_document(self, file_id: str, session=None) -> bool:
"""
Delete file document from database.
Args:
file_id (str): File document ID to delete
session (AsyncIOMotorClientSession, optional): MongoDB session
Returns:
bool: True if file was deleted, False otherwise
@@ -252,7 +259,7 @@ class FileDocumentRepository:
if not ObjectId.is_valid(file_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(file_id)})
result = await self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
return result.deleted_count > 0
except PyMongoError:

View File

@@ -32,7 +32,6 @@ class UserRepository:
"""
self.db = database
self.collection: AsyncIOMotorCollection = database.users
self._ensure_indexes()
async def initialize(self):
"""
@@ -41,6 +40,7 @@ class UserRepository:
Should be called after repository instantiation to setup database indexes.
"""
await self._ensure_indexes()
return self
async def _ensure_indexes(self):
"""

View File

@@ -7,10 +7,11 @@ This service provides API endpoints for health checks and task dispatching.
import logging
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel
import redis
from celery import Celery
from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel
from app.database.connection import test_database_connection, get_database
from app.database.repositories.user_repository import UserRepository
@@ -39,12 +40,11 @@ async def lifespan(app: FastAPI):
database = get_database()
# Initialize repositories and services
user_repository = UserRepository(database)
user_service = UserService(user_repository)
user_service = await UserService(database).initialize()
init_service = InitializationService(user_service)
# Run initialization tasks
initialization_result = init_service.initialize_application()
initialization_result = await init_service.initialize_application()
if initialization_result["initialization_success"]:
logger.info("Application startup completed successfully")
@@ -56,6 +56,7 @@ async def lifespan(app: FastAPI):
logger.error(f" - {error}")
except Exception as e:
raise e
logger.error(f"Critical error during application startup: {str(e)}")
# You might want to decide if the app should continue or exit here
# For now, we log the error but continue
@@ -119,6 +120,7 @@ async def create_user(
):
return user_service.create_user(user_data)
@app.get("/health")
async def health_check():
"""

View File

@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
HYBRID = "hybrid"
class ProcessingStatus(str, Enum):
"""Status values for processing jobs."""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
class FileDocument(BaseModel):
"""
Model for file documents stored in the 'files' collection.
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
@field_validator('filepath')
@classmethod
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
if not v.strip():
raise ValueError("Filename cannot be empty")
return v.strip()
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
class DocumentContent(BaseModel):
"""Model for document content."""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
content: str = Field(..., description="File content")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
class ProcessingJob(BaseModel):
"""
Model for processing jobs stored in the 'processing_jobs' collection.
Tracks the lifecycle and status of document processing tasks.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_id: PyObjectId = Field(..., description="Reference to file document")
status: ProcessingStatus = Field(
default=ProcessingStatus.PENDING,
description="Current processing status"
)
task_id: Optional[str] = Field(
default=None,
description="Celery task UUID"
)
created_at: Optional[datetime] = Field(
default=None,
description="Timestamp when job was created"
)
started_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing started"
)
completed_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing completed"
)
error_message: Optional[str] = Field(
default=None,
description="Error message if processing failed"
)
@field_validator('error_message')
@classmethod
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
"""Clean up error message."""
if v is not None:
return v.strip() if v.strip() else None
return v
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}

View File

@@ -0,0 +1,42 @@
from datetime import datetime
from enum import Enum
from typing import Optional
from bson import ObjectId
from pydantic import BaseModel, Field, field_validator
from app.models.types import PyObjectId
class ProcessingStatus(str, Enum):
"""Status values for processing jobs."""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
class ProcessingJob(BaseModel):
"""
Model for processing jobs stored in the 'processing_jobs' collection.
Tracks the lifecycle and status of document processing tasks.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_id: PyObjectId = Field(..., description="Reference to file document")
status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
task_id: Optional[str] = Field(default=None, description="Celery task UUID")
created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
@field_validator('error_message')
@classmethod
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
"""Clean up error message."""
if v is not None:
return v.strip() if v.strip() else None
return v

View File

@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
"""
import hashlib
import magic
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
from typing import List, Optional, Dict, Any
from motor.motor_asyncio import AsyncIOMotorClientSession
import magic
from pymongo.errors import PyMongoError
from app.database.connection import get_database
from app.config.settings import get_objects_folder
from app.database.repositories.document_repository import FileDocumentRepository
from app.database.repositories.document_content_repository import DocumentContentRepository
from app.models.document import (
FileDocument,
DocumentContent,
FileType,
ProcessingStatus
)
from app.models.types import PyObjectId
@@ -34,13 +31,25 @@ class DocumentService:
and their content while ensuring data consistency through transactions.
"""
def __init__(self):
"""Initialize the document service with repository dependencies."""
self.db = get_database()
self.file_repository = FileDocumentRepository(self.db)
self.content_repository = DocumentContentRepository(self.db)
def __init__(self, database, objects_folder: str = None):
"""
Initialize the document service with repository dependencies.
Args:
database: Database instance
objects_folder: folder to store files by their hash
"""
self.db = database
self.document_repository = FileDocumentRepository(self.db)
self.objects_folder = objects_folder or get_objects_folder()
def _calculate_file_hash(self, file_bytes: bytes) -> str:
async def initialize(self):
await self.document_repository.initialize()
return self
@staticmethod
def _calculate_file_hash(file_bytes: bytes) -> str:
"""
Calculate SHA256 hash of file content.
@@ -52,7 +61,8 @@ class DocumentService:
"""
return hashlib.sha256(file_bytes).hexdigest()
def _detect_file_type(self, file_path: str) -> FileType:
@staticmethod
def _detect_file_type(file_path: str) -> FileType:
"""
Detect file type from file extension.
@@ -72,7 +82,8 @@ class DocumentService:
except ValueError:
raise ValueError(f"Unsupported file type: {extension}")
def _detect_mime_type(self, file_bytes: bytes) -> str:
@staticmethod
def _detect_mime_type(file_bytes: bytes) -> str:
"""
Detect MIME type from file content.
@@ -84,6 +95,25 @@ class DocumentService:
"""
return magic.from_buffer(file_bytes, mime=True)
def _get_document_path(self, file_hash):
"""
:param file_hash:
:return:
"""
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
def save_content_if_needed(self, file_hash, content: bytes):
target_path = self._get_document_path(file_hash)
if os.path.exists(target_path):
return
if not os.path.exists(os.path.dirname(target_path)):
os.makedirs(os.path.dirname(target_path))
with open(target_path, "wb") as f:
f.write(content)
async def create_document(
self,
file_path: str,
@@ -115,50 +145,32 @@ class DocumentService:
mime_type = self._detect_mime_type(file_bytes)
file_size = len(file_bytes)
filename = Path(file_path).name
detected_at = datetime.utcnow()
detected_at = datetime.now()
# Start MongoDB transaction
async with await self.db.client.start_session() as session:
async with session.start_transaction():
try:
# Check if content already exists
existing_content = await self.content_repository.find_document_content_by_file_hash(
file_hash, session=session
)
# Create DocumentContent if it doesn't exist
if not existing_content:
content_data = DocumentContent(
file_hash=file_hash,
content="", # Will be populated by processing workers
encoding=encoding,
file_size=file_size,
mime_type=mime_type
)
await self.content_repository.create_document_content(
content_data, session=session
)
# Create FileDocument
file_data = FileDocument(
filename=filename,
filepath=file_path,
file_type=file_type,
extraction_method=None, # Will be set by processing workers
metadata={}, # Empty for now
detected_at=detected_at,
file_hash=file_hash
)
created_file = await self.file_repository.create_document(
file_data, session=session
)
return created_file
except Exception as e:
# Transaction will automatically rollback
raise PyMongoError(f"Failed to create document: {str(e)}")
try:
self.save_content_if_needed(file_hash, file_bytes)
# Create FileDocument
file_data = FileDocument(
filename=filename,
filepath=file_path,
file_type=file_type,
extraction_method=None, # Will be set by processing workers
metadata={}, # Empty for now
detected_at=detected_at,
file_hash=file_hash,
encoding=encoding,
file_size=file_size,
mime_type=mime_type
)
created_file = await self.document_repository.create_document(file_data)
return created_file
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to create document: {str(e)}")
async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
"""
@@ -170,7 +182,7 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_id(document_id)
return await self.document_repository.find_document_by_id(str(document_id))
async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
"""
@@ -182,7 +194,7 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_hash(file_hash)
return await self.document_repository.find_document_by_hash(file_hash)
async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
@@ -194,32 +206,15 @@ class DocumentService:
Returns:
FileDocument if found, None otherwise
"""
return await self.file_repository.find_document_by_filepath(filepath)
return await self.document_repository.find_document_by_filepath(filepath)
async def get_document_with_content(
self,
document_id: PyObjectId
) -> Optional[Tuple[FileDocument, DocumentContent]]:
"""
Retrieve a document with its associated content.
Args:
document_id: Document ObjectId
Returns:
Tuple of (FileDocument, DocumentContent) if found, None otherwise
"""
document = await self.get_document_by_id(document_id)
if not document:
async def get_document_content_by_hash(self, file_hash):
target_path = self._get_document_path(file_hash)
if not os.path.exists(target_path):
return None
content = await self.content_repository.find_document_content_by_file_hash(
document.file_hash
)
if not content:
return None
return (document, content)
with open(target_path, "rb") as f:
return f.read()
async def list_documents(
self,
@@ -236,7 +231,7 @@ class DocumentService:
Returns:
List of FileDocument instances
"""
return await self.file_repository.list_documents(skip=skip, limit=limit)
return await self.document_repository.list_documents(skip=skip, limit=limit)
async def count_documents(self) -> int:
"""
@@ -245,7 +240,7 @@ class DocumentService:
Returns:
Total document count
"""
return await self.file_repository.count_documents()
return await self.document_repository.count_documents()
async def update_document(
self,
@@ -262,7 +257,12 @@ class DocumentService:
Returns:
Updated FileDocument if found, None otherwise
"""
return await self.file_repository.update_document(document_id, update_data)
if "file_bytes" in update_data:
file_hash = self._calculate_file_hash(update_data["file_bytes"])
update_data["file_hash"] = file_hash
self.save_content_if_needed(file_hash, update_data["file_bytes"])
return await self.document_repository.update_document(document_id, update_data)
async def delete_document(self, document_id: PyObjectId) -> bool:
"""
@@ -281,100 +281,31 @@ class DocumentService:
Raises:
PyMongoError: If database operation fails
"""
# Start MongoDB transaction
async with await self.db.client.start_session() as session:
async with session.start_transaction():
# Start transaction
try:
# Get document to find its hash
document = await self.document_repository.find_document_by_id(document_id)
if not document:
return False
# Delete the document
deleted = await self.document_repository.delete_document(document_id)
if not deleted:
return False
# Check if content is orphaned
remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)
# If no other files reference this content, delete it
if not remaining_files:
try:
# Get document to find its hash
document = await self.file_repository.find_document_by_id(
document_id, session=session
)
if not document:
return False
# Delete the document
deleted = await self.file_repository.delete_document(
document_id, session=session
)
if not deleted:
return False
# Check if content is orphaned
remaining_files = await self.file_repository.find_document_by_hash(
document.file_hash, session=session
)
# If no other files reference this content, delete it
if not remaining_files:
content = await self.content_repository.find_document_content_by_file_hash(
document.file_hash, session=session
)
if content:
await self.content_repository.delete_document_content(
content.id, session=session
)
return True
except Exception as e:
# Transaction will automatically rollback
raise PyMongoError(f"Failed to delete document: {str(e)}")
async def content_exists(self, file_hash: str) -> bool:
"""
Check if content with given hash exists.
Args:
file_hash: SHA256 hash of file content
Returns:
True if content exists, False otherwise
"""
return await self.content_repository.content_exists(file_hash)
async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
"""
Retrieve content by file hash.
Args:
file_hash: SHA256 hash of file content
Returns:
DocumentContent if found, None otherwise
"""
return await self.content_repository.find_document_content_by_file_hash(file_hash)
async def update_document_content(
self,
file_hash: str,
content: str,
encoding: str = "utf-8"
) -> Optional[DocumentContent]:
"""
Update the extracted content for a document.
This method is typically called by processing workers to store
the extracted text content.
Args:
file_hash: SHA256 hash of file content
content: Extracted text content
encoding: Character encoding
Returns:
Updated DocumentContent if found, None otherwise
"""
existing_content = await self.content_repository.find_document_content_by_file_hash(
file_hash
)
if not existing_content:
return None
os.remove(self._get_document_path(document.file_hash))
except Exception:
pass
return True
update_data = {
"content": content,
"encoding": encoding
}
return await self.content_repository.update_document_content(
existing_content.id, update_data
)
except Exception as e:
# Transaction will automatically rollback if supported
raise PyMongoError(f"Failed to delete document: {str(e)}")

View File

@@ -33,7 +33,7 @@ class InitializationService:
self.user_service = user_service
def ensure_admin_user_exists(self) -> Optional[UserInDB]:
async def ensure_admin_user_exists(self) -> Optional[UserInDB]:
"""
Ensure default admin user exists in the system.
@@ -49,7 +49,7 @@ class InitializationService:
logger.info("Checking if admin user exists...")
# Check if any admin user already exists
if self._admin_user_exists():
if await self._admin_user_exists():
logger.info("Admin user already exists, skipping creation")
return None
@@ -64,7 +64,7 @@ class InitializationService:
role=UserRole.ADMIN
)
created_user = self.user_service.create_user(admin_data)
created_user = await self.user_service.create_user(admin_data)
logger.info(f"Default admin user created successfully with ID: {created_user.id}")
logger.warning(
"Default admin user created with username 'admin' and password 'admin'. "
@@ -77,7 +77,7 @@ class InitializationService:
logger.error(f"Failed to create default admin user: {str(e)}")
raise Exception(f"Admin user creation failed: {str(e)}")
def _admin_user_exists(self) -> bool:
async def _admin_user_exists(self) -> bool:
"""
Check if any admin user exists in the system.
@@ -86,7 +86,7 @@ class InitializationService:
"""
try:
# Get all users and check if any have admin role
users = self.user_service.list_users(limit=1000) # Reasonable limit for admin check
users = await self.user_service.list_users(limit=1000) # Reasonable limit for admin check
for user in users:
if user.role == UserRole.ADMIN and user.is_active:
@@ -99,7 +99,7 @@ class InitializationService:
# In case of error, assume admin exists to avoid creating duplicates
return True
def initialize_application(self) -> dict:
async def initialize_application(self) -> dict:
"""
Perform all application initialization tasks.
@@ -119,7 +119,7 @@ class InitializationService:
try:
# Ensure admin user exists
created_admin = self.ensure_admin_user_exists()
created_admin = await self.ensure_admin_user_exists()
if created_admin:
initialization_summary["admin_user_created"] = True

View File

@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
"""
from typing import Optional, List
from pymongo.errors import DuplicateKeyError
from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
from app.models.auth import UserRole
from app.database.repositories.user_repository import UserRepository
from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
from app.services.auth_service import AuthService
@@ -22,17 +22,22 @@ class UserService:
authentication, and data management with proper validation.
"""
def __init__(self, user_repository: UserRepository):
def __init__(self, database):
"""
Initialize user service with repository dependency.
Args:
user_repository (UserRepository): Repository for user data operations
"""
self.user_repository = user_repository
self.db = database
self.user_repository = UserRepository(self.db)
self.auth_service = AuthService()
async def initialize(self):
await self.user_repository.initialize()
return self
def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
async def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
"""
Create a new user with business logic validation.
@@ -55,11 +60,11 @@ class UserService:
raise ValueError(f"User with email '{user_data.email}' already exists")
try:
return self.user_repository.create_user(user_data)
return await self.user_repository.create_user(user_data)
except DuplicateKeyError:
raise ValueError(f"User with username '{user_data.username}' already exists")
def get_user_by_username(self, username: str) -> Optional[UserInDB]:
async def get_user_by_username(self, username: str) -> Optional[UserInDB]:
"""
Retrieve user by username.
@@ -69,9 +74,9 @@ class UserService:
Returns:
UserInDB or None: User if found, None otherwise
"""
return self.user_repository.find_user_by_username(username)
return await self.user_repository.find_user_by_username(username)
def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
async def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
"""
Retrieve user by ID.
@@ -81,9 +86,9 @@ class UserService:
Returns:
UserInDB or None: User if found, None otherwise
"""
return self.user_repository.find_user_by_id(user_id)
return await self.user_repository.find_user_by_id(user_id)
def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
async def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
"""
Authenticate user with username and password.
@@ -106,7 +111,7 @@ class UserService:
return user
def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
"""
Update user information.
@@ -132,9 +137,9 @@ class UserService:
if existing_user and str(existing_user.id) != user_id:
raise ValueError(f"Email '{user_update.email}' is already taken")
return self.user_repository.update_user(user_id, user_update)
return await self.user_repository.update_user(user_id, user_update)
def delete_user(self, user_id: str) -> bool:
async def delete_user(self, user_id: str) -> bool:
"""
Delete user from system.
@@ -146,7 +151,7 @@ class UserService:
"""
return self.user_repository.delete_user(user_id)
def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
"""
List users with pagination.
@@ -157,18 +162,18 @@ class UserService:
Returns:
List[UserInDB]: List of users
"""
return self.user_repository.list_users(skip=skip, limit=limit)
return await self.user_repository.list_users(skip=skip, limit=limit)
def count_users(self) -> int:
async def count_users(self) -> int:
"""
Count total number of users.
Returns:
int: Total number of users in system
"""
return self.user_repository.count_users()
return await self.user_repository.count_users()
def user_exists(self, username: str) -> bool:
async def user_exists(self, username: str) -> bool:
"""
Check if user exists by username.
@@ -178,4 +183,4 @@ class UserService:
Returns:
bool: True if user exists, False otherwise
"""
return self.user_repository.user_exists(username)
return await self.user_repository.user_exists(username)