Working on document repository

2025-09-18 22:53:51 +02:00
parent df86a3d998
commit c3ea80363f
12 changed files with 1597 additions and 526 deletions

View File

@@ -0,0 +1,248 @@
"""
File repository for database operations on FileDocument collection.
This module provides data access operations for file documents stored
in MongoDB with proper error handling and type safety.
"""
from typing import Optional, List
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from difflib import SequenceMatcher
from motor.motor_asyncio import AsyncIOMotorCollection
from app.models.document import FileDocument
from app.database.connection import get_database
class FileDocumentRepository:
"""
Repository class for file document database operations.
This class handles all database operations for FileDocument objects
with proper error handling and data validation.
"""
def __init__(self):
"""Initialize file repository with database connection."""
self.db = get_database()
self.collection: AsyncIOMotorCollection = self.db.files
self._ensure_indexes()
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
Creates a unique index on the filepath field to prevent duplicates.
"""
try:
await self.collection.create_index("filepath", unique=True)
except PyMongoError:
# Index might already exist, ignore error
pass
async def create_document(self, file_data: FileDocument) -> FileDocument:
"""
Create a new file document in the database.
Args:
file_data (FileDocument): File document data to create
Returns:
FileDocument: Created file document with database ID
Raises:
ValueError: If the database insert fails
DuplicateKeyError: If a file with the same filepath already exists
"""
try:
file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
if "_id" in file_dict and file_dict["_id"] is None:
del file_dict["_id"]
result = await self.collection.insert_one(file_dict)
file_data.id = result.inserted_id
return file_data
except DuplicateKeyError as e:
raise DuplicateKeyError(f"File with same hash already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create file document: {e}")
async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
"""
Find file document by ID.
Args:
file_id (str): File document ID to search for
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return None
file_doc = await self.collection.find_one({"_id": ObjectId(file_id)})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
"""
Find file document by file hash to detect duplicates.
Args:
file_hash (str): SHA256 hash of file content
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
file_doc = await self.collection.find_one({"file_hash": file_hash})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
Find file document by exact filepath.
Args:
filepath (str): Full path to the file
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
file_doc = await self.collection.find_one({"filepath": filepath})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]:
"""
Find file documents by filename using fuzzy matching.
Args:
filename (str): Filename to search for
similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0)
Returns:
List[FileDocument]: List of matching files sorted by similarity score
"""
try:
# Get all files from database
cursor = self.collection.find({})
all_files = await cursor.to_list(length=None)
matches = []
for file_doc in all_files:
file_obj = FileDocument(**file_doc)
# Calculate similarity between search term and filename
similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio()
if similarity >= similarity_threshold:
matches.append((file_obj, similarity))
# Sort by similarity score (highest first)
matches.sort(key=lambda x: x[1], reverse=True)
# Return only the FileDocument objects
return [match[0] for match in matches]
except PyMongoError:
return []
async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
"""
List file documents with pagination.
Args:
skip (int): Number of documents to skip (default: 0)
limit (int): Maximum number of documents to return (default: 100)
Returns:
List[FileDocument]: List of file documents
"""
try:
cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1)
file_docs = await cursor.to_list(length=limit)
return [FileDocument(**doc) for doc in file_docs]
except PyMongoError:
return []
async def count_documents(self) -> int:
"""
Count total number of file documents.
Returns:
int: Total number of file documents in collection
"""
try:
return await self.collection.count_documents({})
except PyMongoError:
return 0
async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
"""
Update file document with new data.
Args:
file_id (str): File document ID to update
update_data (dict): Fields to update
Returns:
FileDocument or None: Updated file document if successful, None otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return None
# Remove None values from update data
clean_update_data = {k: v for k, v in update_data.items() if v is not None}
if not clean_update_data:
return await self.find_document_by_id(file_id)
result = await self.collection.find_one_and_update(
{"_id": ObjectId(file_id)},
{"$set": clean_update_data},
return_document=True
)
if result:
return FileDocument(**result)
return None
except PyMongoError:
return None
async def delete_document(self, file_id: str) -> bool:
"""
Delete file document from database.
Args:
file_id (str): File document ID to delete
Returns:
bool: True if file was deleted, False otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(file_id)})
return result.deleted_count > 0
except PyMongoError:
return False
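
For reference, a minimal usage sketch of the repository above, assuming an async context (e.g. a FastAPI handler or an asyncio script) and an already-initialised connection behind get_database(); the import path and sample values are illustrative, not part of this commit.

import asyncio
from datetime import datetime

from app.models.document import FileDocument, FileType
from app.repositories.file_repository import FileDocumentRepository  # assumed module path


async def demo() -> None:
    repo = FileDocumentRepository()

    # Insert a new document; the unique index on "filepath" rejects duplicates.
    created = await repo.create_document(
        FileDocument(
            filename="report.pdf",
            filepath="/watched/report.pdf",
            file_type=FileType.PDF,
            detected_at=datetime.now(),
            file_hash="0" * 64,  # placeholder SHA256
        )
    )

    # Duplicate detection by content hash returns the stored document or None.
    existing = await repo.find_document_by_hash(created.file_hash)

    # Fuzzy filename lookup, sorted by similarity (best match first).
    matches = await repo.find_document_by_name("reprot.pdf", similarity_threshold=0.5)


asyncio.run(demo())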

View File

@@ -2,15 +2,14 @@
User repository for MongoDB operations.
This module implements the repository pattern for user CRUD operations
with dependency injection of the database connection.
with dependency injection of the database connection using async/await.
"""
from typing import Optional, List, Dict, Any
from typing import Optional, List
from datetime import datetime
from bson import ObjectId
from pymongo.database import Database
from pymongo.errors import DuplicateKeyError
from pymongo.collection import Collection
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
from pymongo.errors import DuplicateKeyError, PyMongoError
from app.models.user import UserCreate, UserInDB, UserUpdate
from app.utils.security import hash_password
@@ -21,35 +20,33 @@ class UserRepository:
Repository class for user CRUD operations in MongoDB.
This class handles all database operations related to users,
following the repository pattern with dependency injection.
following the repository pattern with dependency injection and async/await.
"""
def __init__(self, database: Database):
def __init__(self, database: AsyncIOMotorDatabase):
"""
Initialize repository with database dependency.
Args:
database (Database): MongoDB database instance
database (AsyncIOMotorDatabase): MongoDB database instance
"""
self.db = database
self.collection: Collection = database.users
# Create unique index on username for duplicate prevention
self.collection: AsyncIOMotorCollection = database.users
self._ensure_indexes()
def _ensure_indexes(self):
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
Creates unique index on username field to prevent duplicates.
"""
try:
self.collection.create_index("username", unique=True)
except Exception:
await self.collection.create_index("username", unique=True)
except PyMongoError:
# Index might already exist, ignore error
pass
def create_user(self, user_data: UserCreate) -> UserInDB:
async def create_user(self, user_data: UserCreate) -> UserInDB:
"""
Create a new user in the database.
@@ -61,6 +58,7 @@ class UserRepository:
Raises:
DuplicateKeyError: If username already exists
ValueError: If user creation fails due to a database error
"""
user_dict = {
"username": user_data.username,
@@ -73,13 +71,15 @@ class UserRepository:
}
try:
result = self.collection.insert_one(user_dict)
result = await self.collection.insert_one(user_dict)
user_dict["_id"] = result.inserted_id
return UserInDB(**user_dict)
except DuplicateKeyError:
raise DuplicateKeyError(f"User with username '{user_data.username}' already exists")
except DuplicateKeyError as e:
raise DuplicateKeyError(f"User with username '{user_data.username}' already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create user: {e}")
def find_user_by_username(self, username: str) -> Optional[UserInDB]:
async def find_user_by_username(self, username: str) -> Optional[UserInDB]:
"""
Find user by username.
@@ -89,12 +89,15 @@ class UserRepository:
Returns:
UserInDB or None: User if found, None otherwise
"""
user_doc = self.collection.find_one({"username": username})
if user_doc:
return UserInDB(**user_doc)
return None
try:
user_doc = await self.collection.find_one({"username": username})
if user_doc:
return UserInDB(**user_doc)
return None
except PyMongoError:
return None
def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
"""
Find user by ID.
@@ -105,16 +108,17 @@ class UserRepository:
UserInDB or None: User if found, None otherwise
"""
try:
object_id = ObjectId(user_id)
user_doc = self.collection.find_one({"_id": object_id})
if not ObjectId.is_valid(user_id):
return None
user_doc = await self.collection.find_one({"_id": ObjectId(user_id)})
if user_doc:
return UserInDB(**user_doc)
except Exception:
# Invalid ObjectId format
pass
return None
return None
except PyMongoError:
return None
def find_user_by_email(self, email: str) -> Optional[UserInDB]:
async def find_user_by_email(self, email: str) -> Optional[UserInDB]:
"""
Find user by email address.
@@ -124,12 +128,15 @@ class UserRepository:
Returns:
UserInDB or None: User if found, None otherwise
"""
user_doc = self.collection.find_one({"email": email})
if user_doc:
return UserInDB(**user_doc)
return None
try:
user_doc = await self.collection.find_one({"email": email})
if user_doc:
return UserInDB(**user_doc)
return None
except PyMongoError:
return None
def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
"""
Update user information.
@@ -141,11 +148,12 @@ class UserRepository:
UserInDB or None: Updated user if found, None otherwise
"""
try:
object_id = ObjectId(user_id)
if not ObjectId.is_valid(user_id):
return None
# Build update document with only provided fields
update_data = {"updated_at": datetime.now()}
if user_update.username is not None:
update_data["username"] = user_update.username
if user_update.email is not None:
@@ -157,20 +165,26 @@ class UserRepository:
if user_update.is_active is not None:
update_data["is_active"] = user_update.is_active
result = self.collection.update_one(
{"_id": object_id},
{"$set": update_data}
# Remove None values from update data
clean_update_data = {k: v for k, v in update_data.items() if v is not None}
if not clean_update_data:
return await self.find_user_by_id(user_id)
result = await self.collection.find_one_and_update(
{"_id": ObjectId(user_id)},
{"$set": clean_update_data},
return_document=True
)
if result.matched_count > 0:
return self.find_user_by_id(user_id)
if result:
return UserInDB(**result)
return None
except Exception:
# Invalid ObjectId format or other errors
pass
return None
except PyMongoError:
return None
def delete_user(self, user_id: str) -> bool:
async def delete_user(self, user_id: str) -> bool:
"""
Delete user from database.
@@ -181,14 +195,15 @@ class UserRepository:
bool: True if user was deleted, False otherwise
"""
try:
object_id = ObjectId(user_id)
result = self.collection.delete_one({"_id": object_id})
if not ObjectId.is_valid(user_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(user_id)})
return result.deleted_count > 0
except Exception:
# Invalid ObjectId format
except PyMongoError:
return False
def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
"""
List users with pagination.
@@ -199,19 +214,26 @@ class UserRepository:
Returns:
List[UserInDB]: List of users
"""
cursor = self.collection.find().skip(skip).limit(limit)
return [UserInDB(**user_doc) for user_doc in cursor]
try:
cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
user_docs = await cursor.to_list(length=limit)
return [UserInDB(**user_doc) for user_doc in user_docs]
except PyMongoError:
return []
def count_users(self) -> int:
async def count_users(self) -> int:
"""
Count total number of users.
Returns:
int: Total number of users in database
"""
return self.collection.count_documents({})
try:
return await self.collection.count_documents({})
except PyMongoError:
return 0
def user_exists(self, username: str) -> bool:
async def user_exists(self, username: str) -> bool:
"""
Check if user exists by username.
@@ -221,4 +243,8 @@ class UserRepository:
Returns:
bool: True if user exists, False otherwise
"""
return self.collection.count_documents({"username": username}) > 0
try:
count = await self.collection.count_documents({"username": username})
return count > 0
except PyMongoError:
return False
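
A short sketch of how the now-async UserRepository could be exercised with Motor; the connection URI, database name, import path, and UserCreate fields are assumptions for illustration only.

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from app.models.user import UserCreate
from app.repositories.user_repository import UserRepository  # assumed module path


async def demo() -> None:
    # Inject the async Motor database into the repository (URI and DB name are placeholders).
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    repo = UserRepository(client.app_db)

    # Every repository call is now awaited instead of invoked synchronously.
    user = await repo.create_user(
        UserCreate(username="alice", email="alice@example.com", password="S3cure-pass!")  # assumed fields
    )
    found = await repo.find_user_by_username("alice")
    total = await repo.count_users()


asyncio.run(demo())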

View File

@@ -119,7 +119,6 @@ async def create_user(
):
return user_service.create_user(user_data)
@app.get("/health")
async def health_check():
"""

View File

@@ -0,0 +1,142 @@
"""
Pydantic models for document processing collections.
This module defines the data models for file documents and processing jobs
stored in MongoDB collections.
"""
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
from bson import ObjectId
from pydantic import BaseModel, Field, field_validator
from app.models.types import PyObjectId
class FileType(str, Enum):
"""Supported file types for document processing."""
TXT = "txt"
PDF = "pdf"
DOCX = "docx"
JPG = "jpg"
PNG = "png"
class ExtractionMethod(str, Enum):
"""Methods used to extract content from documents."""
DIRECT_TEXT = "direct_text"
OCR = "ocr"
HYBRID = "hybrid"
class ProcessingStatus(str, Enum):
"""Status values for processing jobs."""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
class FileDocument(BaseModel):
"""
Model for file documents stored in the 'files' collection.
Represents a file detected in the watched directory with its
metadata and extracted content.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
filename: str = Field(..., description="Original filename")
filepath: str = Field(..., description="Full path to the file")
file_type: FileType = Field(..., description="Type of the file")
extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
@field_validator('filepath')
@classmethod
def validate_filepath(cls, v: str) -> str:
"""Validate filepath format."""
if not v.strip():
raise ValueError("Filepath cannot be empty")
return v.strip()
@field_validator('filename')
@classmethod
def validate_filename(cls, v: str) -> str:
"""Validate filename format."""
if not v.strip():
raise ValueError("Filename cannot be empty")
return v.strip()
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
class DocumentContent(BaseModel):
"""Model for document content."""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_hash: Optional[str] = Field(..., description="SHA256 hash of file content")
content: str = Field(..., description="File content")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
class ProcessingJob(BaseModel):
"""
Model for processing jobs stored in the 'processing_jobs' collection.
Tracks the lifecycle and status of document processing tasks.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_id: PyObjectId = Field(..., description="Reference to file document")
status: ProcessingStatus = Field(
default=ProcessingStatus.PENDING,
description="Current processing status"
)
task_id: Optional[str] = Field(
default=None,
description="Celery task UUID"
)
created_at: Optional[datetime] = Field(
default=None,
description="Timestamp when job was created"
)
started_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing started"
)
completed_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing completed"
)
error_message: Optional[str] = Field(
default=None,
description="Error message if processing failed"
)
@field_validator('error_message')
@classmethod
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
"""Clean up error message."""
if v is not None:
return v.strip() if v.strip() else None
return v
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
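
To illustrate the models above, a small sketch of how they might be constructed and serialised for MongoDB; the sample values are placeholders.

from datetime import datetime

from app.models.document import FileDocument, FileType, ProcessingJob, ProcessingStatus

# The field validators strip surrounding whitespace and reject empty values.
doc = FileDocument(
    filename="  invoice.docx  ",
    filepath="/watched/invoice.docx",
    file_type=FileType.DOCX,
    detected_at=datetime.now(),
)
assert doc.filename == "invoice.docx"

# by_alias=True maps the id field to MongoDB's "_id" key before insertion.
mongo_ready = doc.model_dump(by_alias=True, exclude_unset=True)

# file_id accepts either an ObjectId or its 24-character hex string form.
job = ProcessingJob(file_id="65f0c0ffee0ddba11ba5eba1", status=ProcessingStatus.PENDING)  # placeholder hex id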

View File

@@ -0,0 +1,32 @@
from typing import Any
from bson import ObjectId
from pydantic_core import core_schema
class PyObjectId(ObjectId):
"""Custom ObjectId type for Pydantic v2 compatibility."""
@classmethod
def __get_pydantic_core_schema__(
cls, source_type: Any, handler
) -> core_schema.CoreSchema:
return core_schema.json_or_python_schema(
json_schema=core_schema.str_schema(),
python_schema=core_schema.union_schema([
core_schema.is_instance_schema(ObjectId),
core_schema.chain_schema([
core_schema.str_schema(),
core_schema.no_info_plain_validator_function(cls.validate),
])
]),
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: str(x)
),
)
@classmethod
def validate(cls, v):
if not ObjectId.is_valid(v):
raise ValueError("Invalid ObjectId")
return ObjectId(v)
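
A quick sketch of what the shared PyObjectId type accepts and how it serialises when used as a Pydantic v2 field; the Ref model is purely illustrative.

from typing import Optional

from pydantic import BaseModel, Field

from app.models.types import PyObjectId


class Ref(BaseModel):
    """Tiny illustrative model reusing the shared PyObjectId type."""
    id: Optional[PyObjectId] = Field(default=None, alias="_id")


# Valid 24-character hex strings are coerced to ObjectId; invalid strings are rejected.
ref = Ref(_id="65f0c0ffee0ddba11ba5eba1")
print(type(ref.id).__name__)                # ObjectId
print(ref.model_dump_json(by_alias=True))   # {"_id":"65f0c0ffee0ddba11ba5eba1"}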

View File

@@ -13,34 +13,7 @@ from pydantic import BaseModel, Field, field_validator, EmailStr
from pydantic_core import core_schema
from app.models.auth import UserRole
class PyObjectId(ObjectId):
"""Custom ObjectId type for Pydantic v2 compatibility."""
@classmethod
def __get_pydantic_core_schema__(
cls, source_type: Any, handler
) -> core_schema.CoreSchema:
return core_schema.json_or_python_schema(
json_schema=core_schema.str_schema(),
python_schema=core_schema.union_schema([
core_schema.is_instance_schema(ObjectId),
core_schema.chain_schema([
core_schema.str_schema(),
core_schema.no_info_plain_validator_function(cls.validate),
])
]),
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: str(x)
),
)
@classmethod
def validate(cls, v):
if not ObjectId.is_valid(v):
raise ValueError("Invalid ObjectId")
return ObjectId(v)
from app.models.types import PyObjectId
def validate_password_strength(password: str) -> str:

View File

@@ -3,6 +3,7 @@ celery==5.5.3
email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pymongo==4.15.0
pydantic==2.11.9
redis==6.4.0