Refactored DocumentService to save document in the filesystem. Fixed docker application
This commit is contained in:
@@ -51,6 +51,15 @@ def get_jwt_secret_key() -> str:
|
||||
raise ValueError("JWT_SECRET environment variable must be set in production")
|
||||
return secret
|
||||
|
||||
def get_objects_folder() -> str:
|
||||
"""
|
||||
Get Vault path from environment variables.
|
||||
|
||||
Returns:
|
||||
str: Vault path
|
||||
"""
|
||||
return os.getenv("OBJECTS_FOLDER", "/objects")
|
||||
|
||||
|
||||
def get_jwt_algorithm() -> str:
|
||||
"""
|
||||
|
||||
@@ -10,7 +10,7 @@ from typing import Optional
|
||||
from pymongo import MongoClient
|
||||
from pymongo.database import Database
|
||||
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
||||
from app.config.settings import get_mongodb_url, get_mongodb_database_name
|
||||
|
||||
# Global variables for singleton pattern
|
||||
@@ -18,7 +18,7 @@ _client: Optional[MongoClient] = None
|
||||
_database: Optional[Database] = None
|
||||
|
||||
|
||||
def create_mongodb_client() -> MongoClient:
|
||||
def create_mongodb_client() -> AsyncIOMotorClient:
|
||||
"""
|
||||
Create MongoDB client with connection validation.
|
||||
|
||||
@@ -32,7 +32,7 @@ def create_mongodb_client() -> MongoClient:
|
||||
|
||||
try:
|
||||
# Create client with short timeout for fail-fast behavior
|
||||
client = MongoClient(
|
||||
client = AsyncIOMotorClient(
|
||||
mongodb_url,
|
||||
serverSelectionTimeoutMS=5000, # 5 seconds timeout
|
||||
connectTimeoutMS=5000,
|
||||
@@ -107,6 +107,15 @@ def get_mongodb_client() -> Optional[MongoClient]:
|
||||
return _client
|
||||
|
||||
|
||||
def get_extra_args(session):
|
||||
# Build kwargs only if session is provided
|
||||
kwargs = {}
|
||||
if session is not None:
|
||||
kwargs["session"] = session
|
||||
|
||||
return kwargs
|
||||
|
||||
|
||||
def test_database_connection() -> bool:
|
||||
"""
|
||||
Test if database connection is working.
|
||||
@@ -122,4 +131,4 @@ def test_database_connection() -> bool:
|
||||
db.command('ping')
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
return False
|
||||
|
||||
@@ -1,214 +0,0 @@
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from bson import ObjectId
|
||||
|
||||
from app.models.document import DocumentContent
|
||||
|
||||
|
||||
class DocumentContentRepository:
|
||||
"""
|
||||
Repository class for document content CRUD operations in MongoDB.
|
||||
|
||||
This class handles all database operations related to document content,
|
||||
following the repository pattern with dependency injection and async/await.
|
||||
"""
|
||||
|
||||
def __init__(self, database: AsyncIOMotorDatabase):
|
||||
"""
|
||||
Initialize repository with database dependency.
|
||||
|
||||
Args:
|
||||
database (AsyncIOMotorDatabase): MongoDB database instance
|
||||
"""
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = database.document_contents
|
||||
self._ensure_indexes()
|
||||
|
||||
async def initialize(self):
|
||||
"""
|
||||
Initialize repository by ensuring required indexes exist.
|
||||
|
||||
Should be called after repository instantiation to setup database indexes.
|
||||
"""
|
||||
await self._ensure_indexes()
|
||||
|
||||
async def _ensure_indexes(self):
|
||||
"""
|
||||
Ensure required database indexes exist.
|
||||
|
||||
Creates unique index on file_hash field to prevent duplicates.
|
||||
"""
|
||||
try:
|
||||
await self.collection.create_index("file_hash", unique=True)
|
||||
except PyMongoError:
|
||||
# Index might already exist, ignore error
|
||||
pass
|
||||
|
||||
async def create_document_content(self, document_content: DocumentContent) -> DocumentContent:
|
||||
"""
|
||||
Create a new document content in the database.
|
||||
|
||||
Args:
|
||||
document_content (DocumentContent): Document content data
|
||||
|
||||
Returns:
|
||||
DocumentContent: Created document content with database ID
|
||||
|
||||
Raises:
|
||||
DuplicateKeyError: If file_hash already exists
|
||||
ValueError: If document content creation fails due to validation
|
||||
"""
|
||||
document_dict = document_content.model_dump(by_alias=True, exclude_unset=True)
|
||||
|
||||
# Remove _id if it's None to let MongoDB generate it
|
||||
if document_dict.get("_id") is None:
|
||||
document_dict.pop("_id", None)
|
||||
|
||||
try:
|
||||
result = await self.collection.insert_one(document_dict)
|
||||
document_dict["_id"] = result.inserted_id
|
||||
return DocumentContent(**document_dict)
|
||||
except DuplicateKeyError as e:
|
||||
raise DuplicateKeyError(f"Document content with file_hash '{document_content.file_hash}' already exists: {e}")
|
||||
except PyMongoError as e:
|
||||
raise ValueError(f"Failed to create document content: {e}")
|
||||
|
||||
async def find_document_content_by_id(self, document_id: str) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Find document content by ID.
|
||||
|
||||
Args:
|
||||
document_id (str): Document content ID to search for
|
||||
|
||||
Returns:
|
||||
DocumentContent or None: Document content if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
if not ObjectId.is_valid(document_id):
|
||||
return None
|
||||
|
||||
document_doc = await self.collection.find_one({"_id": ObjectId(document_id)})
|
||||
if document_doc:
|
||||
return DocumentContent(**document_doc)
|
||||
return None
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
async def find_document_content_by_file_hash(self, file_hash: str) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Find document content by file hash.
|
||||
|
||||
Args:
|
||||
file_hash (str): File hash to search for
|
||||
|
||||
Returns:
|
||||
DocumentContent or None: Document content if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
document_doc = await self.collection.find_one({"file_hash": file_hash})
|
||||
if document_doc:
|
||||
return DocumentContent(**document_doc)
|
||||
return None
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
async def content_exists(self, file_hash: str) -> bool:
|
||||
"""
|
||||
Check if document content exists by file hash.
|
||||
|
||||
Args:
|
||||
file_hash (str): File hash to check
|
||||
|
||||
Returns:
|
||||
bool: True if document content exists, False otherwise
|
||||
"""
|
||||
try:
|
||||
count = await self.collection.count_documents({"file_hash": file_hash})
|
||||
return count > 0
|
||||
except PyMongoError:
|
||||
return False
|
||||
|
||||
async def update_document_content(self, document_id: str, update_data: dict) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Update document content information.
|
||||
|
||||
Args:
|
||||
document_id (str): Document content ID to update
|
||||
update_data (dict): Updated document content data
|
||||
|
||||
Returns:
|
||||
DocumentContent or None: Updated document content if found, None otherwise
|
||||
"""
|
||||
try:
|
||||
if not ObjectId.is_valid(document_id):
|
||||
return None
|
||||
|
||||
# Remove None values and _id from update data
|
||||
clean_update_data = {k: v for k, v in update_data.items() if v is not None and k != "_id"}
|
||||
|
||||
if not clean_update_data:
|
||||
return await self.find_document_content_by_id(document_id)
|
||||
|
||||
result = await self.collection.find_one_and_update(
|
||||
{"_id": ObjectId(document_id)},
|
||||
{"$set": clean_update_data},
|
||||
return_document=True
|
||||
)
|
||||
|
||||
if result:
|
||||
return DocumentContent(**result)
|
||||
return None
|
||||
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
async def delete_document_content(self, document_id: str) -> bool:
|
||||
"""
|
||||
Delete document content from database.
|
||||
|
||||
Args:
|
||||
document_id (str): Document content ID to delete
|
||||
|
||||
Returns:
|
||||
bool: True if document content was deleted, False otherwise
|
||||
"""
|
||||
try:
|
||||
if not ObjectId.is_valid(document_id):
|
||||
return False
|
||||
|
||||
result = await self.collection.delete_one({"_id": ObjectId(document_id)})
|
||||
return result.deleted_count > 0
|
||||
except PyMongoError:
|
||||
return False
|
||||
|
||||
async def list_document_contents(self, skip: int = 0, limit: int = 100) -> List[DocumentContent]:
|
||||
"""
|
||||
List document contents with pagination.
|
||||
|
||||
Args:
|
||||
skip (int): Number of document contents to skip (default: 0)
|
||||
limit (int): Maximum number of document contents to return (default: 100)
|
||||
|
||||
Returns:
|
||||
List[DocumentContent]: List of document contents
|
||||
"""
|
||||
try:
|
||||
cursor = self.collection.find({}).skip(skip).limit(limit).sort("_id", -1)
|
||||
document_docs = await cursor.to_list(length=limit)
|
||||
return [DocumentContent(**document_doc) for document_doc in document_docs]
|
||||
except PyMongoError:
|
||||
return []
|
||||
|
||||
async def count_document_contents(self) -> int:
|
||||
"""
|
||||
Count total number of document contents.
|
||||
|
||||
Returns:
|
||||
int: Total number of document contents in database
|
||||
"""
|
||||
try:
|
||||
return await self.collection.count_documents({})
|
||||
except PyMongoError:
|
||||
return 0
|
||||
@@ -9,6 +9,8 @@ from typing import Optional, List
|
||||
from bson import ObjectId
|
||||
from pymongo.errors import DuplicateKeyError, PyMongoError
|
||||
from motor.motor_asyncio import AsyncIOMotorCollection, AsyncIOMotorDatabase
|
||||
|
||||
from app.database.connection import get_extra_args
|
||||
from app.models.document import FileDocument
|
||||
from app.utils.document_matching import fuzzy_matching, subsequence_matching
|
||||
|
||||
@@ -37,7 +39,7 @@ class FileDocumentRepository:
|
||||
def __init__(self, database: AsyncIOMotorDatabase):
|
||||
"""Initialize file repository with database connection."""
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = self.db.files
|
||||
self.collection: AsyncIOMotorCollection = self.db.documents
|
||||
self._ensure_indexes()
|
||||
|
||||
async def initialize(self):
|
||||
@@ -47,6 +49,7 @@ class FileDocumentRepository:
|
||||
Should be called after repository instantiation to setup database indexes.
|
||||
"""
|
||||
await self._ensure_indexes()
|
||||
return self
|
||||
|
||||
async def _ensure_indexes(self):
|
||||
"""
|
||||
@@ -60,26 +63,27 @@ class FileDocumentRepository:
|
||||
# Index might already exist, ignore error
|
||||
pass
|
||||
|
||||
async def create_document(self, file_data: FileDocument) -> FileDocument:
|
||||
async def create_document(self, file_data: FileDocument, session=None) -> FileDocument:
|
||||
"""
|
||||
Create a new file document in database.
|
||||
|
||||
Args:
|
||||
file_data (FileDocument): File document data to create
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
FileDocument: Created file document with database ID
|
||||
FileDocument: Created document with database ID
|
||||
|
||||
Raises:
|
||||
ValueError: If file creation fails due to validation
|
||||
DuplicateKeyError: If file with same hash already exists
|
||||
DuplicateKeyError: If a document with same hash already exists
|
||||
"""
|
||||
try:
|
||||
file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
|
||||
if "_id" in file_dict and file_dict["_id"] is None:
|
||||
del file_dict["_id"]
|
||||
|
||||
result = await self.collection.insert_one(file_dict)
|
||||
result = await self.collection.insert_one(file_dict, **get_extra_args(session))
|
||||
file_data.id = result.inserted_id
|
||||
return file_data
|
||||
|
||||
@@ -204,13 +208,14 @@ class FileDocumentRepository:
|
||||
except PyMongoError:
|
||||
return 0
|
||||
|
||||
async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
|
||||
async def update_document(self, file_id: str, update_data: dict, session=None) -> Optional[FileDocument]:
|
||||
"""
|
||||
Update file document with new data.
|
||||
|
||||
Args:
|
||||
file_id (str): File document ID to update
|
||||
update_data (dict): Fields to update
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
FileDocument or None: Updated file document if successful, None otherwise
|
||||
@@ -228,7 +233,8 @@ class FileDocumentRepository:
|
||||
result = await self.collection.find_one_and_update(
|
||||
{"_id": ObjectId(file_id)},
|
||||
{"$set": clean_update_data},
|
||||
return_document=True
|
||||
return_document=True,
|
||||
**get_extra_args(session)
|
||||
)
|
||||
|
||||
if result:
|
||||
@@ -238,12 +244,13 @@ class FileDocumentRepository:
|
||||
except PyMongoError:
|
||||
return None
|
||||
|
||||
async def delete_document(self, file_id: str) -> bool:
|
||||
async def delete_document(self, file_id: str, session=None) -> bool:
|
||||
"""
|
||||
Delete file document from database.
|
||||
|
||||
Args:
|
||||
file_id (str): File document ID to delete
|
||||
session (AsyncIOMotorClientSession, optional): MongoDB session
|
||||
|
||||
Returns:
|
||||
bool: True if file was deleted, False otherwise
|
||||
@@ -252,7 +259,7 @@ class FileDocumentRepository:
|
||||
if not ObjectId.is_valid(file_id):
|
||||
return False
|
||||
|
||||
result = await self.collection.delete_one({"_id": ObjectId(file_id)})
|
||||
result = await self.collection.delete_one({"_id": ObjectId(file_id)}, **get_extra_args(session))
|
||||
return result.deleted_count > 0
|
||||
|
||||
except PyMongoError:
|
||||
|
||||
@@ -32,7 +32,6 @@ class UserRepository:
|
||||
"""
|
||||
self.db = database
|
||||
self.collection: AsyncIOMotorCollection = database.users
|
||||
self._ensure_indexes()
|
||||
|
||||
async def initialize(self):
|
||||
"""
|
||||
@@ -41,6 +40,7 @@ class UserRepository:
|
||||
Should be called after repository instantiation to setup database indexes.
|
||||
"""
|
||||
await self._ensure_indexes()
|
||||
return self
|
||||
|
||||
async def _ensure_indexes(self):
|
||||
"""
|
||||
|
||||
@@ -7,10 +7,11 @@ This service provides API endpoints for health checks and task dispatching.
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
import redis
|
||||
from celery import Celery
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.database.connection import test_database_connection, get_database
|
||||
from app.database.repositories.user_repository import UserRepository
|
||||
@@ -39,12 +40,11 @@ async def lifespan(app: FastAPI):
|
||||
database = get_database()
|
||||
|
||||
# Initialize repositories and services
|
||||
user_repository = UserRepository(database)
|
||||
user_service = UserService(user_repository)
|
||||
user_service = await UserService(database).initialize()
|
||||
init_service = InitializationService(user_service)
|
||||
|
||||
# Run initialization tasks
|
||||
initialization_result = init_service.initialize_application()
|
||||
initialization_result = await init_service.initialize_application()
|
||||
|
||||
if initialization_result["initialization_success"]:
|
||||
logger.info("Application startup completed successfully")
|
||||
@@ -56,6 +56,7 @@ async def lifespan(app: FastAPI):
|
||||
logger.error(f" - {error}")
|
||||
|
||||
except Exception as e:
|
||||
raise e
|
||||
logger.error(f"Critical error during application startup: {str(e)}")
|
||||
# You might want to decide if the app should continue or exit here
|
||||
# For now, we log the error but continue
|
||||
@@ -119,6 +120,7 @@ async def create_user(
|
||||
):
|
||||
return user_service.create_user(user_data)
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""
|
||||
|
||||
@@ -33,15 +33,6 @@ class ExtractionMethod(str, Enum):
|
||||
HYBRID = "hybrid"
|
||||
|
||||
|
||||
class ProcessingStatus(str, Enum):
|
||||
"""Status values for processing jobs."""
|
||||
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class FileDocument(BaseModel):
|
||||
"""
|
||||
Model for file documents stored in the 'files' collection.
|
||||
@@ -58,6 +49,9 @@ class FileDocument(BaseModel):
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
|
||||
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
@field_validator('filepath')
|
||||
@classmethod
|
||||
@@ -74,69 +68,3 @@ class FileDocument(BaseModel):
|
||||
if not v.strip():
|
||||
raise ValueError("Filename cannot be empty")
|
||||
return v.strip()
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
populate_by_name = True
|
||||
arbitrary_types_allowed = True
|
||||
json_encoders = {ObjectId: str}
|
||||
|
||||
|
||||
class DocumentContent(BaseModel):
|
||||
"""Model for document content."""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
|
||||
content: str = Field(..., description="File content")
|
||||
encoding: str = Field(default="utf-8", description="Character encoding for text files")
|
||||
file_size: int = Field(..., ge=0, description="File size in bytes")
|
||||
mime_type: str = Field(..., description="MIME type detected")
|
||||
|
||||
|
||||
class ProcessingJob(BaseModel):
|
||||
"""
|
||||
Model for processing jobs stored in the 'processing_jobs' collection.
|
||||
|
||||
Tracks the lifecycle and status of document processing tasks.
|
||||
"""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_id: PyObjectId = Field(..., description="Reference to file document")
|
||||
status: ProcessingStatus = Field(
|
||||
default=ProcessingStatus.PENDING,
|
||||
description="Current processing status"
|
||||
)
|
||||
task_id: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Celery task UUID"
|
||||
)
|
||||
created_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when job was created"
|
||||
)
|
||||
started_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when processing started"
|
||||
)
|
||||
completed_at: Optional[datetime] = Field(
|
||||
default=None,
|
||||
description="Timestamp when processing completed"
|
||||
)
|
||||
error_message: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Error message if processing failed"
|
||||
)
|
||||
|
||||
@field_validator('error_message')
|
||||
@classmethod
|
||||
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
|
||||
"""Clean up error message."""
|
||||
if v is not None:
|
||||
return v.strip() if v.strip() else None
|
||||
return v
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
populate_by_name = True
|
||||
arbitrary_types_allowed = True
|
||||
json_encoders = {ObjectId: str}
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
from bson import ObjectId
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
|
||||
class ProcessingStatus(str, Enum):
|
||||
"""Status values for processing jobs."""
|
||||
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class ProcessingJob(BaseModel):
|
||||
"""
|
||||
Model for processing jobs stored in the 'processing_jobs' collection.
|
||||
|
||||
Tracks the lifecycle and status of document processing tasks.
|
||||
"""
|
||||
|
||||
id: Optional[PyObjectId] = Field(default=None, alias="_id")
|
||||
file_id: PyObjectId = Field(..., description="Reference to file document")
|
||||
status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
|
||||
task_id: Optional[str] = Field(default=None, description="Celery task UUID")
|
||||
created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
|
||||
started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
|
||||
completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
|
||||
error_message: Optional[str] = Field(default=None, description="Error message if processing failed")
|
||||
|
||||
@field_validator('error_message')
|
||||
@classmethod
|
||||
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
|
||||
"""Clean up error message."""
|
||||
if v is not None:
|
||||
return v.strip() if v.strip() else None
|
||||
return v
|
||||
@@ -6,22 +6,19 @@ while maintaining data consistency through MongoDB transactions.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import magic
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClientSession
|
||||
import magic
|
||||
from pymongo.errors import PyMongoError
|
||||
|
||||
from app.database.connection import get_database
|
||||
from app.config.settings import get_objects_folder
|
||||
from app.database.repositories.document_repository import FileDocumentRepository
|
||||
from app.database.repositories.document_content_repository import DocumentContentRepository
|
||||
from app.models.document import (
|
||||
FileDocument,
|
||||
DocumentContent,
|
||||
FileType,
|
||||
ProcessingStatus
|
||||
)
|
||||
from app.models.types import PyObjectId
|
||||
|
||||
@@ -34,13 +31,25 @@ class DocumentService:
|
||||
and their content while ensuring data consistency through transactions.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the document service with repository dependencies."""
|
||||
self.db = get_database()
|
||||
self.file_repository = FileDocumentRepository(self.db)
|
||||
self.content_repository = DocumentContentRepository(self.db)
|
||||
def __init__(self, database, objects_folder: str = None):
|
||||
"""
|
||||
Initialize the document service with repository dependencies.
|
||||
|
||||
Args:
|
||||
database: Database instance
|
||||
objects_folder: folder to store files by their hash
|
||||
"""
|
||||
|
||||
self.db = database
|
||||
self.document_repository = FileDocumentRepository(self.db)
|
||||
self.objects_folder = objects_folder or get_objects_folder()
|
||||
|
||||
def _calculate_file_hash(self, file_bytes: bytes) -> str:
|
||||
async def initialize(self):
|
||||
await self.document_repository.initialize()
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def _calculate_file_hash(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Calculate SHA256 hash of file content.
|
||||
|
||||
@@ -52,7 +61,8 @@ class DocumentService:
|
||||
"""
|
||||
return hashlib.sha256(file_bytes).hexdigest()
|
||||
|
||||
def _detect_file_type(self, file_path: str) -> FileType:
|
||||
@staticmethod
|
||||
def _detect_file_type(file_path: str) -> FileType:
|
||||
"""
|
||||
Detect file type from file extension.
|
||||
|
||||
@@ -72,7 +82,8 @@ class DocumentService:
|
||||
except ValueError:
|
||||
raise ValueError(f"Unsupported file type: {extension}")
|
||||
|
||||
def _detect_mime_type(self, file_bytes: bytes) -> str:
|
||||
@staticmethod
|
||||
def _detect_mime_type(file_bytes: bytes) -> str:
|
||||
"""
|
||||
Detect MIME type from file content.
|
||||
|
||||
@@ -84,6 +95,25 @@ class DocumentService:
|
||||
"""
|
||||
return magic.from_buffer(file_bytes, mime=True)
|
||||
|
||||
def _get_document_path(self, file_hash):
|
||||
"""
|
||||
|
||||
:param file_hash:
|
||||
:return:
|
||||
"""
|
||||
return os.path.join(self.objects_folder, file_hash[:24], file_hash)
|
||||
|
||||
def save_content_if_needed(self, file_hash, content: bytes):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if os.path.exists(target_path):
|
||||
return
|
||||
|
||||
if not os.path.exists(os.path.dirname(target_path)):
|
||||
os.makedirs(os.path.dirname(target_path))
|
||||
|
||||
with open(target_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
async def create_document(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -115,50 +145,32 @@ class DocumentService:
|
||||
mime_type = self._detect_mime_type(file_bytes)
|
||||
file_size = len(file_bytes)
|
||||
filename = Path(file_path).name
|
||||
detected_at = datetime.utcnow()
|
||||
detected_at = datetime.now()
|
||||
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
try:
|
||||
# Check if content already exists
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash, session=session
|
||||
)
|
||||
|
||||
# Create DocumentContent if it doesn't exist
|
||||
if not existing_content:
|
||||
content_data = DocumentContent(
|
||||
file_hash=file_hash,
|
||||
content="", # Will be populated by processing workers
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
await self.content_repository.create_document_content(
|
||||
content_data, session=session
|
||||
)
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash
|
||||
)
|
||||
|
||||
created_file = await self.file_repository.create_document(
|
||||
file_data, session=session
|
||||
)
|
||||
|
||||
return created_file
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
try:
|
||||
self.save_content_if_needed(file_hash, file_bytes)
|
||||
|
||||
# Create FileDocument
|
||||
file_data = FileDocument(
|
||||
filename=filename,
|
||||
filepath=file_path,
|
||||
file_type=file_type,
|
||||
extraction_method=None, # Will be set by processing workers
|
||||
metadata={}, # Empty for now
|
||||
detected_at=detected_at,
|
||||
file_hash=file_hash,
|
||||
encoding=encoding,
|
||||
file_size=file_size,
|
||||
mime_type=mime_type
|
||||
)
|
||||
|
||||
created_file = await self.document_repository.create_document(file_data)
|
||||
|
||||
return created_file
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to create document: {str(e)}")
|
||||
|
||||
async def get_document_by_id(self, document_id: PyObjectId) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -170,7 +182,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_id(document_id)
|
||||
return await self.document_repository.find_document_by_id(str(document_id))
|
||||
|
||||
async def get_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -182,7 +194,7 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_hash(file_hash)
|
||||
return await self.document_repository.find_document_by_hash(file_hash)
|
||||
|
||||
async def get_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
|
||||
"""
|
||||
@@ -194,32 +206,15 @@ class DocumentService:
|
||||
Returns:
|
||||
FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.find_document_by_filepath(filepath)
|
||||
return await self.document_repository.find_document_by_filepath(filepath)
|
||||
|
||||
async def get_document_with_content(
|
||||
self,
|
||||
document_id: PyObjectId
|
||||
) -> Optional[Tuple[FileDocument, DocumentContent]]:
|
||||
"""
|
||||
Retrieve a document with its associated content.
|
||||
|
||||
Args:
|
||||
document_id: Document ObjectId
|
||||
|
||||
Returns:
|
||||
Tuple of (FileDocument, DocumentContent) if found, None otherwise
|
||||
"""
|
||||
document = await self.get_document_by_id(document_id)
|
||||
if not document:
|
||||
async def get_document_content_by_hash(self, file_hash):
|
||||
target_path = self._get_document_path(file_hash)
|
||||
if not os.path.exists(target_path):
|
||||
return None
|
||||
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash
|
||||
)
|
||||
if not content:
|
||||
return None
|
||||
|
||||
return (document, content)
|
||||
with open(target_path, "rb") as f:
|
||||
return f.read()
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
@@ -236,7 +231,7 @@ class DocumentService:
|
||||
Returns:
|
||||
List of FileDocument instances
|
||||
"""
|
||||
return await self.file_repository.list_documents(skip=skip, limit=limit)
|
||||
return await self.document_repository.list_documents(skip=skip, limit=limit)
|
||||
|
||||
async def count_documents(self) -> int:
|
||||
"""
|
||||
@@ -245,7 +240,7 @@ class DocumentService:
|
||||
Returns:
|
||||
Total document count
|
||||
"""
|
||||
return await self.file_repository.count_documents()
|
||||
return await self.document_repository.count_documents()
|
||||
|
||||
async def update_document(
|
||||
self,
|
||||
@@ -262,7 +257,12 @@ class DocumentService:
|
||||
Returns:
|
||||
Updated FileDocument if found, None otherwise
|
||||
"""
|
||||
return await self.file_repository.update_document(document_id, update_data)
|
||||
if "file_bytes" in update_data:
|
||||
file_hash = self._calculate_file_hash(update_data["file_bytes"])
|
||||
update_data["file_hash"] = file_hash
|
||||
self.save_content_if_needed(file_hash, update_data["file_bytes"])
|
||||
|
||||
return await self.document_repository.update_document(document_id, update_data)
|
||||
|
||||
async def delete_document(self, document_id: PyObjectId) -> bool:
|
||||
"""
|
||||
@@ -281,100 +281,31 @@ class DocumentService:
|
||||
Raises:
|
||||
PyMongoError: If database operation fails
|
||||
"""
|
||||
# Start MongoDB transaction
|
||||
async with await self.db.client.start_session() as session:
|
||||
async with session.start_transaction():
|
||||
# Start transaction
|
||||
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.document_repository.find_document_by_id(document_id)
|
||||
if not document:
|
||||
return False
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.document_repository.delete_document(document_id)
|
||||
if not deleted:
|
||||
return False
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.document_repository.find_document_by_hash(document.file_hash)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
try:
|
||||
# Get document to find its hash
|
||||
document = await self.file_repository.find_document_by_id(
|
||||
document_id, session=session
|
||||
)
|
||||
if not document:
|
||||
return False
|
||||
|
||||
# Delete the document
|
||||
deleted = await self.file_repository.delete_document(
|
||||
document_id, session=session
|
||||
)
|
||||
if not deleted:
|
||||
return False
|
||||
|
||||
# Check if content is orphaned
|
||||
remaining_files = await self.file_repository.find_document_by_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
|
||||
# If no other files reference this content, delete it
|
||||
if not remaining_files:
|
||||
content = await self.content_repository.find_document_content_by_file_hash(
|
||||
document.file_hash, session=session
|
||||
)
|
||||
if content:
|
||||
await self.content_repository.delete_document_content(
|
||||
content.id, session=session
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
async def content_exists(self, file_hash: str) -> bool:
|
||||
"""
|
||||
Check if content with given hash exists.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
True if content exists, False otherwise
|
||||
"""
|
||||
return await self.content_repository.content_exists(file_hash)
|
||||
|
||||
async def get_content_by_hash(self, file_hash: str) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Retrieve content by file hash.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
|
||||
Returns:
|
||||
DocumentContent if found, None otherwise
|
||||
"""
|
||||
return await self.content_repository.find_document_content_by_file_hash(file_hash)
|
||||
|
||||
async def update_document_content(
|
||||
self,
|
||||
file_hash: str,
|
||||
content: str,
|
||||
encoding: str = "utf-8"
|
||||
) -> Optional[DocumentContent]:
|
||||
"""
|
||||
Update the extracted content for a document.
|
||||
|
||||
This method is typically called by processing workers to store
|
||||
the extracted text content.
|
||||
|
||||
Args:
|
||||
file_hash: SHA256 hash of file content
|
||||
content: Extracted text content
|
||||
encoding: Character encoding
|
||||
|
||||
Returns:
|
||||
Updated DocumentContent if found, None otherwise
|
||||
"""
|
||||
existing_content = await self.content_repository.find_document_content_by_file_hash(
|
||||
file_hash
|
||||
)
|
||||
if not existing_content:
|
||||
return None
|
||||
os.remove(self._get_document_path(document.file_hash))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return True
|
||||
|
||||
update_data = {
|
||||
"content": content,
|
||||
"encoding": encoding
|
||||
}
|
||||
|
||||
return await self.content_repository.update_document_content(
|
||||
existing_content.id, update_data
|
||||
)
|
||||
except Exception as e:
|
||||
# Transaction will automatically rollback if supported
|
||||
raise PyMongoError(f"Failed to delete document: {str(e)}")
|
||||
|
||||
@@ -33,7 +33,7 @@ class InitializationService:
|
||||
self.user_service = user_service
|
||||
|
||||
|
||||
def ensure_admin_user_exists(self) -> Optional[UserInDB]:
|
||||
async def ensure_admin_user_exists(self) -> Optional[UserInDB]:
|
||||
"""
|
||||
Ensure default admin user exists in the system.
|
||||
|
||||
@@ -49,7 +49,7 @@ class InitializationService:
|
||||
logger.info("Checking if admin user exists...")
|
||||
|
||||
# Check if any admin user already exists
|
||||
if self._admin_user_exists():
|
||||
if await self._admin_user_exists():
|
||||
logger.info("Admin user already exists, skipping creation")
|
||||
return None
|
||||
|
||||
@@ -64,7 +64,7 @@ class InitializationService:
|
||||
role=UserRole.ADMIN
|
||||
)
|
||||
|
||||
created_user = self.user_service.create_user(admin_data)
|
||||
created_user = await self.user_service.create_user(admin_data)
|
||||
logger.info(f"Default admin user created successfully with ID: {created_user.id}")
|
||||
logger.warning(
|
||||
"Default admin user created with username 'admin' and password 'admin'. "
|
||||
@@ -77,7 +77,7 @@ class InitializationService:
|
||||
logger.error(f"Failed to create default admin user: {str(e)}")
|
||||
raise Exception(f"Admin user creation failed: {str(e)}")
|
||||
|
||||
def _admin_user_exists(self) -> bool:
|
||||
async def _admin_user_exists(self) -> bool:
|
||||
"""
|
||||
Check if any admin user exists in the system.
|
||||
|
||||
@@ -86,7 +86,7 @@ class InitializationService:
|
||||
"""
|
||||
try:
|
||||
# Get all users and check if any have admin role
|
||||
users = self.user_service.list_users(limit=1000) # Reasonable limit for admin check
|
||||
users = await self.user_service.list_users(limit=1000) # Reasonable limit for admin check
|
||||
|
||||
for user in users:
|
||||
if user.role == UserRole.ADMIN and user.is_active:
|
||||
@@ -99,7 +99,7 @@ class InitializationService:
|
||||
# In case of error, assume admin exists to avoid creating duplicates
|
||||
return True
|
||||
|
||||
def initialize_application(self) -> dict:
|
||||
async def initialize_application(self) -> dict:
|
||||
"""
|
||||
Perform all application initialization tasks.
|
||||
|
||||
@@ -119,7 +119,7 @@ class InitializationService:
|
||||
|
||||
try:
|
||||
# Ensure admin user exists
|
||||
created_admin = self.ensure_admin_user_exists()
|
||||
created_admin = await self.ensure_admin_user_exists()
|
||||
if created_admin:
|
||||
initialization_summary["admin_user_created"] = True
|
||||
|
||||
|
||||
@@ -6,11 +6,11 @@ retrieval, updates, and authentication operations with proper error handling.
|
||||
"""
|
||||
|
||||
from typing import Optional, List
|
||||
|
||||
from pymongo.errors import DuplicateKeyError
|
||||
|
||||
from app.models.user import UserCreate, UserInDB, UserUpdate, UserResponse, UserCreateNoValidation
|
||||
from app.models.auth import UserRole
|
||||
from app.database.repositories.user_repository import UserRepository
|
||||
from app.models.user import UserCreate, UserInDB, UserUpdate, UserCreateNoValidation
|
||||
from app.services.auth_service import AuthService
|
||||
|
||||
|
||||
@@ -22,17 +22,22 @@ class UserService:
|
||||
authentication, and data management with proper validation.
|
||||
"""
|
||||
|
||||
def __init__(self, user_repository: UserRepository):
|
||||
def __init__(self, database):
|
||||
"""
|
||||
Initialize user service with repository dependency.
|
||||
|
||||
Args:
|
||||
user_repository (UserRepository): Repository for user data operations
|
||||
"""
|
||||
self.user_repository = user_repository
|
||||
self.db = database
|
||||
self.user_repository = UserRepository(self.db)
|
||||
self.auth_service = AuthService()
|
||||
|
||||
async def initialize(self):
|
||||
await self.user_repository.initialize()
|
||||
return self
|
||||
|
||||
def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
|
||||
async def create_user(self, user_data: UserCreate | UserCreateNoValidation) -> UserInDB:
|
||||
"""
|
||||
Create a new user with business logic validation.
|
||||
|
||||
@@ -55,11 +60,11 @@ class UserService:
|
||||
raise ValueError(f"User with email '{user_data.email}' already exists")
|
||||
|
||||
try:
|
||||
return self.user_repository.create_user(user_data)
|
||||
return await self.user_repository.create_user(user_data)
|
||||
except DuplicateKeyError:
|
||||
raise ValueError(f"User with username '{user_data.username}' already exists")
|
||||
|
||||
def get_user_by_username(self, username: str) -> Optional[UserInDB]:
|
||||
async def get_user_by_username(self, username: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Retrieve user by username.
|
||||
|
||||
@@ -69,9 +74,9 @@ class UserService:
|
||||
Returns:
|
||||
UserInDB or None: User if found, None otherwise
|
||||
"""
|
||||
return self.user_repository.find_user_by_username(username)
|
||||
return await self.user_repository.find_user_by_username(username)
|
||||
|
||||
def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
|
||||
async def get_user_by_id(self, user_id: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Retrieve user by ID.
|
||||
|
||||
@@ -81,9 +86,9 @@ class UserService:
|
||||
Returns:
|
||||
UserInDB or None: User if found, None otherwise
|
||||
"""
|
||||
return self.user_repository.find_user_by_id(user_id)
|
||||
return await self.user_repository.find_user_by_id(user_id)
|
||||
|
||||
def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
|
||||
async def authenticate_user(self, username: str, password: str) -> Optional[UserInDB]:
|
||||
"""
|
||||
Authenticate user with username and password.
|
||||
|
||||
@@ -106,7 +111,7 @@ class UserService:
|
||||
|
||||
return user
|
||||
|
||||
def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
|
||||
async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
|
||||
"""
|
||||
Update user information.
|
||||
|
||||
@@ -132,9 +137,9 @@ class UserService:
|
||||
if existing_user and str(existing_user.id) != user_id:
|
||||
raise ValueError(f"Email '{user_update.email}' is already taken")
|
||||
|
||||
return self.user_repository.update_user(user_id, user_update)
|
||||
return await self.user_repository.update_user(user_id, user_update)
|
||||
|
||||
def delete_user(self, user_id: str) -> bool:
|
||||
async def delete_user(self, user_id: str) -> bool:
|
||||
"""
|
||||
Delete user from system.
|
||||
|
||||
@@ -146,7 +151,7 @@ class UserService:
|
||||
"""
|
||||
return self.user_repository.delete_user(user_id)
|
||||
|
||||
def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
|
||||
async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
|
||||
"""
|
||||
List users with pagination.
|
||||
|
||||
@@ -157,18 +162,18 @@ class UserService:
|
||||
Returns:
|
||||
List[UserInDB]: List of users
|
||||
"""
|
||||
return self.user_repository.list_users(skip=skip, limit=limit)
|
||||
return await self.user_repository.list_users(skip=skip, limit=limit)
|
||||
|
||||
def count_users(self) -> int:
|
||||
async def count_users(self) -> int:
|
||||
"""
|
||||
Count total number of users.
|
||||
|
||||
Returns:
|
||||
int: Total number of users in system
|
||||
"""
|
||||
return self.user_repository.count_users()
|
||||
return await self.user_repository.count_users()
|
||||
|
||||
def user_exists(self, username: str) -> bool:
|
||||
async def user_exists(self, username: str) -> bool:
|
||||
"""
|
||||
Check if user exists by username.
|
||||
|
||||
@@ -178,4 +183,4 @@ class UserService:
|
||||
Returns:
|
||||
bool: True if user exists, False otherwise
|
||||
"""
|
||||
return self.user_repository.user_exists(username)
|
||||
return await self.user_repository.user_exists(username)
|
||||
|
||||
Reference in New Issue
Block a user