Working on document repository

2025-09-18 22:53:51 +02:00
parent df86a3d998
commit c3ea80363f
12 changed files with 1597 additions and 526 deletions

View File

@@ -0,0 +1,248 @@
"""
File repository for database operations on FileDocument collection.
This module provides data access operations for file documents stored
in MongoDB with proper error handling and type safety.
"""
from typing import Optional, List
from bson import ObjectId
from pymongo.errors import DuplicateKeyError, PyMongoError
from difflib import SequenceMatcher
from motor.motor_asyncio import AsyncIOMotorCollection
from app.models.document import FileDocument
from app.database.connection import get_database
class FileDocumentRepository:
"""
Repository class for file document database operations.
This class handles all database operations for FileDocument objects
with proper error handling and data validation.
"""
def __init__(self):
"""Initialize file repository with database connection."""
self.db = get_database()
self.collection: AsyncIOMotorCollection = self.db.files
self._ensure_indexes()
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
Creates a unique index on the filepath field to prevent duplicates.
"""
try:
await self.collection.create_index("filepath", unique=True)
except PyMongoError:
# Index might already exist, ignore error
pass
async def create_document(self, file_data: FileDocument) -> FileDocument:
"""
Create a new file document in the database.
Args:
file_data (FileDocument): File document data to create
Returns:
FileDocument: Created file document with database ID
Raises:
ValueError: If the database insert fails
DuplicateKeyError: If a file with the same filepath already exists
"""
try:
file_dict = file_data.model_dump(by_alias=True, exclude_unset=True)
if "_id" in file_dict and file_dict["_id"] is None:
del file_dict["_id"]
result = await self.collection.insert_one(file_dict)
file_data.id = result.inserted_id
return file_data
except DuplicateKeyError as e:
raise DuplicateKeyError(f"File with same hash already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create file document: {e}")
async def find_document_by_id(self, file_id: str) -> Optional[FileDocument]:
"""
Find file document by ID.
Args:
file_id (str): File document ID to search for
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return None
file_doc = await self.collection.find_one({"_id": ObjectId(file_id)})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_hash(self, file_hash: str) -> Optional[FileDocument]:
"""
Find file document by file hash to detect duplicates.
Args:
file_hash (str): SHA256 hash of file content
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
file_doc = await self.collection.find_one({"file_hash": file_hash})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_filepath(self, filepath: str) -> Optional[FileDocument]:
"""
Find file document by exact filepath.
Args:
filepath (str): Full path to the file
Returns:
FileDocument or None: File document if found, None otherwise
"""
try:
file_doc = await self.collection.find_one({"filepath": filepath})
if file_doc:
return FileDocument(**file_doc)
return None
except PyMongoError:
return None
async def find_document_by_name(self, filename: str, similarity_threshold: float = 0.6) -> List[FileDocument]:
"""
Find file documents by filename using fuzzy matching.
Args:
filename (str): Filename to search for
similarity_threshold (float): Minimum similarity ratio (0.0 to 1.0)
Returns:
List[FileDocument]: List of matching files sorted by similarity score
"""
try:
# Get all files from database
cursor = self.collection.find({})
all_files = await cursor.to_list(length=None)
matches = []
for file_doc in all_files:
file_obj = FileDocument(**file_doc)
# Calculate similarity between search term and filename
similarity = SequenceMatcher(None, filename.lower(), file_obj.filename.lower()).ratio()
if similarity >= similarity_threshold:
matches.append((file_obj, similarity))
# Sort by similarity score (highest first)
matches.sort(key=lambda x: x[1], reverse=True)
# Return only the FileDocument objects
return [match[0] for match in matches]
except PyMongoError:
return []
async def list_documents(self, skip: int = 0, limit: int = 100) -> List[FileDocument]:
"""
List file documents with pagination.
Args:
skip (int): Number of documents to skip (default: 0)
limit (int): Maximum number of documents to return (default: 100)
Returns:
List[FileDocument]: List of file documents
"""
try:
cursor = self.collection.find({}).skip(skip).limit(limit).sort("detected_at", -1)
file_docs = await cursor.to_list(length=limit)
return [FileDocument(**doc) for doc in file_docs]
except PyMongoError:
return []
async def count_documents(self) -> int:
"""
Count total number of file documents.
Returns:
int: Total number of file documents in collection
"""
try:
return await self.collection.count_documents({})
except PyMongoError:
return 0
async def update_document(self, file_id: str, update_data: dict) -> Optional[FileDocument]:
"""
Update file document with new data.
Args:
file_id (str): File document ID to update
update_data (dict): Fields to update
Returns:
FileDocument or None: Updated file document if successful, None otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return None
# Remove None values from update data
clean_update_data = {k: v for k, v in update_data.items() if v is not None}
if not clean_update_data:
return await self.find_document_by_id(file_id)
result = await self.collection.find_one_and_update(
{"_id": ObjectId(file_id)},
{"$set": clean_update_data},
return_document=True
)
if result:
return FileDocument(**result)
return None
except PyMongoError:
return None
async def delete_document(self, file_id: str) -> bool:
"""
Delete file document from database.
Args:
file_id (str): File document ID to delete
Returns:
bool: True if file was deleted, False otherwise
"""
try:
if not ObjectId.is_valid(file_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(file_id)})
return result.deleted_count > 0
except PyMongoError:
return False
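
For reference, a minimal usage sketch of the repository above, assuming an async context (e.g. a FastAPI handler or an asyncio script) and an already-initialised connection behind get_database(); the import path and sample values are illustrative, not part of this commit.

import asyncio
from datetime import datetime

from app.models.document import FileDocument, FileType
from app.repositories.file_repository import FileDocumentRepository  # assumed module path


async def demo() -> None:
    repo = FileDocumentRepository()

    # Insert a new document; the unique index on "filepath" rejects duplicates.
    created = await repo.create_document(
        FileDocument(
            filename="report.pdf",
            filepath="/watched/report.pdf",
            file_type=FileType.PDF,
            detected_at=datetime.now(),
            file_hash="0" * 64,  # placeholder SHA256
        )
    )

    # Duplicate detection by content hash returns the stored document or None.
    existing = await repo.find_document_by_hash(created.file_hash)

    # Fuzzy filename lookup, sorted by similarity (best match first).
    matches = await repo.find_document_by_name("reprot.pdf", similarity_threshold=0.5)


asyncio.run(demo())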

View File

@@ -2,15 +2,14 @@
User repository for MongoDB operations.
This module implements the repository pattern for user CRUD operations
with dependency injection of the database connection.
with dependency injection of the database connection using async/await.
"""
from typing import Optional, List, Dict, Any
from typing import Optional, List
from datetime import datetime
from bson import ObjectId
from pymongo.database import Database
from pymongo.errors import DuplicateKeyError
from pymongo.collection import Collection
from motor.motor_asyncio import AsyncIOMotorDatabase, AsyncIOMotorCollection
from pymongo.errors import DuplicateKeyError, PyMongoError
from app.models.user import UserCreate, UserInDB, UserUpdate
from app.utils.security import hash_password
@@ -21,35 +20,33 @@ class UserRepository:
Repository class for user CRUD operations in MongoDB.
This class handles all database operations related to users,
following the repository pattern with dependency injection.
following the repository pattern with dependency injection and async/await.
"""
def __init__(self, database: Database):
def __init__(self, database: AsyncIOMotorDatabase):
"""
Initialize repository with database dependency.
Args:
database (Database): MongoDB database instance
database (AsyncIOMotorDatabase): MongoDB database instance
"""
self.db = database
self.collection: Collection = database.users
# Create unique index on username for duplicate prevention
self.collection: AsyncIOMotorCollection = database.users
self._ensure_indexes()
def _ensure_indexes(self):
async def _ensure_indexes(self):
"""
Ensure required database indexes exist.
Creates unique index on username field to prevent duplicates.
"""
try:
self.collection.create_index("username", unique=True)
except Exception:
await self.collection.create_index("username", unique=True)
except PyMongoError:
# Index might already exist, ignore error
pass
def create_user(self, user_data: UserCreate) -> UserInDB:
async def create_user(self, user_data: UserCreate) -> UserInDB:
"""
Create a new user in the database.
@@ -61,6 +58,7 @@ class UserRepository:
Raises:
DuplicateKeyError: If username already exists
ValueError: If user creation fails due to a database error
"""
user_dict = {
"username": user_data.username,
@@ -73,13 +71,15 @@ class UserRepository:
}
try:
result = self.collection.insert_one(user_dict)
result = await self.collection.insert_one(user_dict)
user_dict["_id"] = result.inserted_id
return UserInDB(**user_dict)
except DuplicateKeyError:
raise DuplicateKeyError(f"User with username '{user_data.username}' already exists")
except DuplicateKeyError as e:
raise DuplicateKeyError(f"User with username '{user_data.username}' already exists: {e}")
except PyMongoError as e:
raise ValueError(f"Failed to create user: {e}")
def find_user_by_username(self, username: str) -> Optional[UserInDB]:
async def find_user_by_username(self, username: str) -> Optional[UserInDB]:
"""
Find user by username.
@@ -89,12 +89,15 @@ class UserRepository:
Returns:
UserInDB or None: User if found, None otherwise
"""
user_doc = self.collection.find_one({"username": username})
if user_doc:
return UserInDB(**user_doc)
return None
try:
user_doc = await self.collection.find_one({"username": username})
if user_doc:
return UserInDB(**user_doc)
return None
except PyMongoError:
return None
def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
async def find_user_by_id(self, user_id: str) -> Optional[UserInDB]:
"""
Find user by ID.
@@ -105,16 +108,17 @@ class UserRepository:
UserInDB or None: User if found, None otherwise
"""
try:
object_id = ObjectId(user_id)
user_doc = self.collection.find_one({"_id": object_id})
if not ObjectId.is_valid(user_id):
return None
user_doc = await self.collection.find_one({"_id": ObjectId(user_id)})
if user_doc:
return UserInDB(**user_doc)
except Exception:
# Invalid ObjectId format
pass
return None
return None
except PyMongoError:
return None
def find_user_by_email(self, email: str) -> Optional[UserInDB]:
async def find_user_by_email(self, email: str) -> Optional[UserInDB]:
"""
Find user by email address.
@@ -124,12 +128,15 @@ class UserRepository:
Returns:
UserInDB or None: User if found, None otherwise
"""
user_doc = self.collection.find_one({"email": email})
if user_doc:
return UserInDB(**user_doc)
return None
try:
user_doc = await self.collection.find_one({"email": email})
if user_doc:
return UserInDB(**user_doc)
return None
except PyMongoError:
return None
def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
async def update_user(self, user_id: str, user_update: UserUpdate) -> Optional[UserInDB]:
"""
Update user information.
@@ -141,11 +148,12 @@ class UserRepository:
UserInDB or None: Updated user if found, None otherwise
"""
try:
object_id = ObjectId(user_id)
if not ObjectId.is_valid(user_id):
return None
# Build update document with only provided fields
update_data = {"updated_at": datetime.now()}
if user_update.username is not None:
update_data["username"] = user_update.username
if user_update.email is not None:
@@ -157,20 +165,26 @@ class UserRepository:
if user_update.is_active is not None:
update_data["is_active"] = user_update.is_active
result = self.collection.update_one(
{"_id": object_id},
{"$set": update_data}
# Remove None values from update data
clean_update_data = {k: v for k, v in update_data.items() if v is not None}
if not clean_update_data:
return await self.find_user_by_id(user_id)
result = await self.collection.find_one_and_update(
{"_id": ObjectId(user_id)},
{"$set": clean_update_data},
return_document=True
)
if result.matched_count > 0:
return self.find_user_by_id(user_id)
if result:
return UserInDB(**result)
return None
except Exception:
# Invalid ObjectId format or other errors
pass
return None
except PyMongoError:
return None
def delete_user(self, user_id: str) -> bool:
async def delete_user(self, user_id: str) -> bool:
"""
Delete user from database.
@@ -181,14 +195,15 @@ class UserRepository:
bool: True if user was deleted, False otherwise
"""
try:
object_id = ObjectId(user_id)
result = self.collection.delete_one({"_id": object_id})
if not ObjectId.is_valid(user_id):
return False
result = await self.collection.delete_one({"_id": ObjectId(user_id)})
return result.deleted_count > 0
except Exception:
# Invalid ObjectId format
except PyMongoError:
return False
def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
async def list_users(self, skip: int = 0, limit: int = 100) -> List[UserInDB]:
"""
List users with pagination.
@@ -199,19 +214,26 @@ class UserRepository:
Returns:
List[UserInDB]: List of users
"""
cursor = self.collection.find().skip(skip).limit(limit)
return [UserInDB(**user_doc) for user_doc in cursor]
try:
cursor = self.collection.find({}).skip(skip).limit(limit).sort("created_at", -1)
user_docs = await cursor.to_list(length=limit)
return [UserInDB(**user_doc) for user_doc in user_docs]
except PyMongoError:
return []
def count_users(self) -> int:
async def count_users(self) -> int:
"""
Count total number of users.
Returns:
int: Total number of users in database
"""
return self.collection.count_documents({})
try:
return await self.collection.count_documents({})
except PyMongoError:
return 0
def user_exists(self, username: str) -> bool:
async def user_exists(self, username: str) -> bool:
"""
Check if user exists by username.
@@ -221,4 +243,8 @@ class UserRepository:
Returns:
bool: True if user exists, False otherwise
"""
return self.collection.count_documents({"username": username}) > 0
try:
count = await self.collection.count_documents({"username": username})
return count > 0
except PyMongoError:
return False
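
A short sketch of how the now-async UserRepository could be exercised with Motor; the connection URI, database name, import path, and UserCreate fields are assumptions for illustration only.

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from app.models.user import UserCreate
from app.repositories.user_repository import UserRepository  # assumed module path


async def demo() -> None:
    # Inject the async Motor database into the repository (URI and DB name are placeholders).
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    repo = UserRepository(client.app_db)

    # Every repository call is now awaited instead of invoked synchronously.
    user = await repo.create_user(
        UserCreate(username="alice", email="alice@example.com", password="S3cure-pass!")  # assumed fields
    )
    found = await repo.find_user_by_username("alice")
    total = await repo.count_users()


asyncio.run(demo())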

View File

@@ -119,7 +119,6 @@ async def create_user(
):
return user_service.create_user(user_data)
@app.get("/health")
async def health_check():
"""

View File

@@ -0,0 +1,142 @@
"""
Pydantic models for document processing collections.
This module defines the data models for file documents and processing jobs
stored in MongoDB collections.
"""
from datetime import datetime
from enum import Enum
from typing import Any, Dict, Optional
from bson import ObjectId
from pydantic import BaseModel, Field, field_validator
from app.models.types import PyObjectId
class FileType(str, Enum):
"""Supported file types for document processing."""
TXT = "txt"
PDF = "pdf"
DOCX = "docx"
JPG = "jpg"
PNG = "png"
class ExtractionMethod(str, Enum):
"""Methods used to extract content from documents."""
DIRECT_TEXT = "direct_text"
OCR = "ocr"
HYBRID = "hybrid"
class ProcessingStatus(str, Enum):
"""Status values for processing jobs."""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
class FileDocument(BaseModel):
"""
Model for file documents stored in the 'files' collection.
Represents a file detected in the watched directory with its
metadata and extracted content.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
filename: str = Field(..., description="Original filename")
filepath: str = Field(..., description="Full path to the file")
file_type: FileType = Field(..., description="Type of the file")
extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
@field_validator('filepath')
@classmethod
def validate_filepath(cls, v: str) -> str:
"""Validate filepath format."""
if not v.strip():
raise ValueError("Filepath cannot be empty")
return v.strip()
@field_validator('filename')
@classmethod
def validate_filename(cls, v: str) -> str:
"""Validate filename format."""
if not v.strip():
raise ValueError("Filename cannot be empty")
return v.strip()
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
class DocumentContent(BaseModel):
"""Model for document content."""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_hash: Optional[str] = Field(..., description="SHA256 hash of file content")
content: str = Field(..., description="File content")
encoding: str = Field(default="utf-8", description="Character encoding for text files")
file_size: int = Field(..., ge=0, description="File size in bytes")
mime_type: str = Field(..., description="MIME type detected")
class ProcessingJob(BaseModel):
"""
Model for processing jobs stored in the 'processing_jobs' collection.
Tracks the lifecycle and status of document processing tasks.
"""
id: Optional[PyObjectId] = Field(default=None, alias="_id")
file_id: PyObjectId = Field(..., description="Reference to file document")
status: ProcessingStatus = Field(
default=ProcessingStatus.PENDING,
description="Current processing status"
)
task_id: Optional[str] = Field(
default=None,
description="Celery task UUID"
)
created_at: Optional[datetime] = Field(
default=None,
description="Timestamp when job was created"
)
started_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing started"
)
completed_at: Optional[datetime] = Field(
default=None,
description="Timestamp when processing completed"
)
error_message: Optional[str] = Field(
default=None,
description="Error message if processing failed"
)
@field_validator('error_message')
@classmethod
def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
"""Clean up error message."""
if v is not None:
return v.strip() if v.strip() else None
return v
class Config:
"""Pydantic configuration."""
populate_by_name = True
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
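
To illustrate the models above, a small sketch of how they might be constructed and serialised for MongoDB; the sample values are placeholders.

from datetime import datetime

from app.models.document import FileDocument, FileType, ProcessingJob, ProcessingStatus

# The field validators strip surrounding whitespace and reject empty values.
doc = FileDocument(
    filename="  invoice.docx  ",
    filepath="/watched/invoice.docx",
    file_type=FileType.DOCX,
    detected_at=datetime.now(),
)
assert doc.filename == "invoice.docx"

# by_alias=True maps the id field to MongoDB's "_id" key before insertion.
mongo_ready = doc.model_dump(by_alias=True, exclude_unset=True)

# file_id accepts either an ObjectId or its 24-character hex string form.
job = ProcessingJob(file_id="65f0c0ffee0ddba11ba5eba1", status=ProcessingStatus.PENDING)  # placeholder hex id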

View File

@@ -0,0 +1,32 @@
from typing import Any
from bson import ObjectId
from pydantic_core import core_schema
class PyObjectId(ObjectId):
"""Custom ObjectId type for Pydantic v2 compatibility."""
@classmethod
def __get_pydantic_core_schema__(
cls, source_type: Any, handler
) -> core_schema.CoreSchema:
return core_schema.json_or_python_schema(
json_schema=core_schema.str_schema(),
python_schema=core_schema.union_schema([
core_schema.is_instance_schema(ObjectId),
core_schema.chain_schema([
core_schema.str_schema(),
core_schema.no_info_plain_validator_function(cls.validate),
])
]),
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: str(x)
),
)
@classmethod
def validate(cls, v):
if not ObjectId.is_valid(v):
raise ValueError("Invalid ObjectId")
return ObjectId(v)
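
A quick sketch of what the shared PyObjectId type accepts and how it serialises when used as a Pydantic v2 field; the Ref model is purely illustrative.

from typing import Optional

from pydantic import BaseModel, Field

from app.models.types import PyObjectId


class Ref(BaseModel):
    """Tiny illustrative model reusing the shared PyObjectId type."""
    id: Optional[PyObjectId] = Field(default=None, alias="_id")


# Valid 24-character hex strings are coerced to ObjectId; invalid strings are rejected.
ref = Ref(_id="65f0c0ffee0ddba11ba5eba1")
print(type(ref.id).__name__)                # ObjectId
print(ref.model_dump_json(by_alias=True))   # {"_id":"65f0c0ffee0ddba11ba5eba1"}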

View File

@@ -13,34 +13,7 @@ from pydantic import BaseModel, Field, field_validator, EmailStr
from pydantic_core import core_schema
from app.models.auth import UserRole
class PyObjectId(ObjectId):
"""Custom ObjectId type for Pydantic v2 compatibility."""
@classmethod
def __get_pydantic_core_schema__(
cls, source_type: Any, handler
) -> core_schema.CoreSchema:
return core_schema.json_or_python_schema(
json_schema=core_schema.str_schema(),
python_schema=core_schema.union_schema([
core_schema.is_instance_schema(ObjectId),
core_schema.chain_schema([
core_schema.str_schema(),
core_schema.no_info_plain_validator_function(cls.validate),
])
]),
serialization=core_schema.plain_serializer_function_ser_schema(
lambda x: str(x)
),
)
@classmethod
def validate(cls, v):
if not ObjectId.is_valid(v):
raise ValueError("Invalid ObjectId")
return ObjectId(v)
from app.models.types import PyObjectId
def validate_password_strength(password: str) -> str:

View File

@@ -3,6 +3,7 @@ celery==5.5.3
email-validator==2.3.0
fastapi==0.116.1
httptools==0.6.4
motor==3.7.1
pymongo==4.15.0
pydantic==2.11.9
redis==6.4.0