Refactored DocumentService to save document in the filesystem. Fixed docker application
Readme.md
@@ -103,17 +103,22 @@ MyDocManager/
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── user.py # User Pydantic models
│ │ │ │ └── auth.py # Auth Pydantic models
│ │ │ │ ├── auth.py # Auth Pydantic models
│ │ │ │ ├── document.py # Document Pydantic models
│ │ │ │ ├── job.py # Job Processing Pydantic models
│ │ │ │ └── types.py # PyObjectId and other useful types
│ │ │ ├── database/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── connection.py # MongoDB connection
│ │ │ │ └── repositories/
│ │ │ │ ├── __init__.py
│ │ │ │ └── user_repository.py # User CRUD operations
│ │ │ │ ├── user_repository.py # User CRUD operations
│ │ │ │ └── document_repository.py # Document CRUD operations
│ │ │ ├── services/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── auth_service.py # JWT & password logic
│ │ │ │ ├── user_service.py # User business logic
│ │ │ │ ├── document_service.py # Document business logic
│ │ │ │ └── init_service.py # Admin creation at startup
│ │ │ ├── api/
│ │ │ │ ├── __init__.py
@@ -125,7 +130,7 @@ MyDocManager/
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── security.py # Password utilities
│ │ │ └── exceptions.py # Custom exceptions
│ │ │ └── document_matching.py # Fuzzy matching algorithms
│ ├── worker/
│ │ ├── Dockerfile
│ │ ├── requirements.txt
@@ -224,78 +229,76 @@ On first startup, the application automatically creates a default admin user:

#### Files Collection

Stores file metadata and extracted content:
Stores file metadata and extracted content using Pydantic models:

```json
{
  "_id": "ObjectId",
  "filename": "document.pdf",
  "filepath": "/watched_files/document.pdf",
  "file_type": "pdf",
  "extraction_method": "direct_text", // direct_text, ocr, hybrid
  "metadata": {
    "page_count": 15, // for PDFs
    "word_count": 250, // for text files
    "image_dimensions": { // for images
      "width": 1920,
      "height": 1080
    }
  },
  "detected_at": "2024-01-15T10:29:00Z",
  "file_hash": "sha256_hash_value"
}
```

```python
class FileDocument(BaseModel):
    """
    Model for file documents stored in the 'files' collection.

    Represents a file detected in the watched directory with its
    metadata and extracted content.
    """

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    filename: str = Field(..., description="Original filename")
    filepath: str = Field(..., description="Full path to the file")
    file_type: FileType = Field(..., description="Type of the file")
    extraction_method: Optional[ExtractionMethod] = Field(default=None, description="Method used to extract content")
    metadata: Dict[str, Any] = Field(default_factory=dict, description="File-specific metadata")
    detected_at: Optional[datetime] = Field(default=None, description="Timestamp when file was detected")
    file_hash: Optional[str] = Field(default=None, description="SHA256 hash of file content")
    encoding: str = Field(default="utf-8", description="Character encoding for text files")
    file_size: int = Field(..., ge=0, description="File size in bytes")
    mime_type: str = Field(..., description="MIME type detected")

    @field_validator('filepath')
    @classmethod
    def validate_filepath(cls, v: str) -> str:
        """Validate filepath format."""
        if not v.strip():
            raise ValueError("Filepath cannot be empty")
        return v.strip()

    @field_validator('filename')
    @classmethod
    def validate_filename(cls, v: str) -> str:
        """Validate filename format."""
        if not v.strip():
            raise ValueError("Filename cannot be empty")
        return v.strip()
```

#### Document Contents Collection

Stores actual file content and technical metadata:

```json
{
  "_id": "ObjectId",
  "file_hash": "sha256_hash_value",
  "content": "extracted text content...",
  "encoding": "utf-8",
  "file_size": 2048576,
  "mime_type": "application/pdf"
}
```

#### Processing Jobs Collection

Tracks processing status and lifecycle:

```json
{
  "_id": "ObjectId",
  "file_id": "reference_to_files_collection",
  "status": "completed",
  // pending, processing, completed, failed
  "task_id": "celery_task_uuid",
  "created_at": "2024-01-15T10:29:00Z",
  "started_at": "2024-01-15T10:29:30Z",
  "completed_at": "2024-01-15T10:30:00Z",
  "error_message": null
}
```

```python
class ProcessingJob(BaseModel):
    """
    Model for processing jobs stored in the 'processing_jobs' collection.

    Tracks the lifecycle and status of document processing tasks.
    """

    id: Optional[PyObjectId] = Field(default=None, alias="_id")
    file_id: PyObjectId = Field(..., description="Reference to file document")
    status: ProcessingStatus = Field(default=ProcessingStatus.PENDING, description="Current processing status")
    task_id: Optional[str] = Field(default=None, description="Celery task UUID")
    created_at: Optional[datetime] = Field(default=None, description="Timestamp when job was created")
    started_at: Optional[datetime] = Field(default=None, description="Timestamp when processing started")
    completed_at: Optional[datetime] = Field(default=None, description="Timestamp when processing completed")
    error_message: Optional[str] = Field(default=None, description="Error message if processing failed")

    @field_validator('error_message')
    @classmethod
    def validate_error_message(cls, v: Optional[str]) -> Optional[str]:
        """Clean up error message."""
        if v is not None:
            return v.strip() if v.strip() else None
        return v
```

### Data Storage Strategy

- **Choice**: Three separate collections for files, content, and processing status
- **Rationale**: Normalization prevents content duplication when multiple files have identical content
- **Benefits**:
  - Content deduplication via SHA256 hash
  - Better query performance for metadata vs content searches
  - Clear separation of concerns between file metadata, content, and processing lifecycle
  - Multiple files can reference the same content (e.g., identical copies in different locations)
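
As a sketch of how this layout could be supported at the database level (the index set, connection string, and database name below are assumptions, not the project's actual setup code):

```python
# Sketch only: indexes supporting the three-collection layout and hash-based
# deduplication. Connection string and database name are assumptions.
from pymongo import ASCENDING, MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["mydocmanager"]

# One content record per unique hash enables deduplication.
db.document_contents.create_index([("file_hash", ASCENDING)], unique=True)

# Metadata lookups without touching content.
db.files.create_index([("file_hash", ASCENDING)])
db.files.create_index([("filepath", ASCENDING)], unique=True)

# Job lookups by file and by status.
db.processing_jobs.create_index([("file_id", ASCENDING)])
db.processing_jobs.create_index([("status", ASCENDING)])
```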

### Content Storage Location

- **Choice**: Store extracted content in separate `document_contents` collection
- **Rationale**: Content normalization and deduplication
- **Benefits**:
  - Single content storage per unique file hash
  - Multiple file entries can reference same content
  - Efficient storage for duplicate files

### Supported File Types (Initial Implementation)

- **Text Files** (`.txt`): Direct content reading
@@ -306,7 +309,7 @@ Tracks processing status and lifecycle:

#### Watchdog Implementation

- **Choice**: Dedicated observer thread (Option A)
- **Choice**: Dedicated observer thread
- **Rationale**: Standard approach, clean separation of concerns
- **Implementation**: Watchdog observer runs in separate thread from FastAPI
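
A minimal sketch of this approach, assuming the `watchdog` package and a FastAPI lifespan hook; the handler, watched path, and enqueue step are illustrative, not the project's actual modules:

```python
# Sketch only: run a watchdog observer alongside FastAPI (names are illustrative).
from contextlib import asynccontextmanager

from fastapi import FastAPI
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

WATCHED_DIR = "/watched_files"  # assumed mount point

class NewFileHandler(FileSystemEventHandler):
    def on_created(self, event):
        if not event.is_directory:
            # In the real pipeline this would enqueue a Celery processing task.
            print(f"Detected new file: {event.src_path}")

@asynccontextmanager
async def lifespan(app: FastAPI):
    observer = Observer()  # the observer runs in its own thread
    observer.schedule(NewFileHandler(), WATCHED_DIR, recursive=True)
    observer.start()
    try:
        yield
    finally:
        observer.stop()
        observer.join()

app = FastAPI(lifespan=lifespan)
```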
@@ -327,17 +330,17 @@ Tracks processing status and lifecycle:

#### Content Storage Location

- **Choice**: Store extracted content in `files` collection
- **Rationale**: Content is intrinsic property of the file
- **Benefits**: Single query to get file + content, simpler data model
- **Choice**: Store files in the file system, using the SHA256 hash as filename
- **Rationale**: MongoDB is not designed for large files; filesystem storage performs better, and the files remain easy to access directly.
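
A minimal sketch of this content-addressed scheme, assuming a configurable storage root; the directory and helper names are illustrative rather than the actual `DocumentService` code:

```python
# Sketch only: content-addressed file storage keyed by SHA256 hash.
import hashlib
from pathlib import Path

STORAGE_ROOT = Path("/data/file_storage")  # assumed storage root

def save_file(file_bytes: bytes) -> str:
    """Write bytes under their SHA256 hash and return the hash (idempotent for duplicates)."""
    file_hash = hashlib.sha256(file_bytes).hexdigest()
    target = STORAGE_ROOT / file_hash
    if not target.exists():  # identical content is stored only once
        STORAGE_ROOT.mkdir(parents=True, exist_ok=True)
        target.write_bytes(file_bytes)
    return file_hash

def load_file(file_hash: str) -> bytes:
    """Read a stored file back by its hash."""
    return (STORAGE_ROOT / file_hash).read_bytes()
```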

### Implementation Order

1. ✅ Pydantic models for MongoDB collections
2. ✅ Repository layer for data access (files + processing_jobs)
3. ✅ Celery tasks for document processing
4. ✅ Watchdog file monitoring implementation
5. ✅ FastAPI integration and startup coordination
2. IN PROGRESS: Repository layer for data access (files + processing_jobs)
3. TODO: Celery tasks for document processing
4. TODO: Watchdog file monitoring implementation
5. TODO: FastAPI integration and startup coordination

### Processing Pipeline Features

@@ -347,87 +350,6 @@ Tracks processing status and lifecycle:
- **Extensible Metadata**: Flexible metadata storage per file type
- **Multiple Extraction Methods**: Support for direct text, OCR, and hybrid approaches
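
The project does not name its extraction libraries; the sketch below only illustrates the direct-text vs OCR decision, using `pypdf` as an assumed PDF reader:

```python
# Sketch only: per-file-type extraction with a direct-text / OCR fallback.
# pypdf is an illustrative choice, not a library named by this project.
from pathlib import Path

from pypdf import PdfReader

def extract_pdf_text(path: str) -> tuple[str, str]:
    """Return (text, extraction_method) for a PDF, falling back to OCR when needed."""
    reader = PdfReader(path)
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    if text.strip():
        return text, "direct_text"
    # Scanned PDFs yield no embedded text; an OCR pass (e.g. pytesseract over
    # rendered pages) would run here instead.
    return "", "ocr"

def extract_text(path: str) -> tuple[str, str]:
    suffix = Path(path).suffix.lower()
    if suffix == ".txt":
        return Path(path).read_text(encoding="utf-8"), "direct_text"
    if suffix == ".pdf":
        return extract_pdf_text(path)
    raise ValueError(f"Unsupported file type: {suffix}")
```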

## Document Service Architecture

### Service Overview

The document service provides orchestrated access to file documents and their content through a single interface that coordinates between `FileDocument` and `DocumentContent` repositories.

### Service Design

- **Architecture Pattern**: Service orchestration with separate repositories
- **Transaction Support**: MongoDB ACID transactions for data consistency
- **Content Deduplication**: Multiple files can reference the same content via SHA256 hash
- **Error Handling**: MongoDB standard exceptions with transaction rollback

### Document Service (`document_service.py`)

Orchestrates operations between file and content repositories while maintaining data consistency.

#### Core Functionality

##### `create_document(file_path: str, file_bytes: bytes, encoding: str)`

Creates a new document with automatic attribute calculation and content deduplication.

**Automatic Calculations:**
- `file_hash`: SHA256 hash of file bytes
- `file_type`: Detection based on file extension
- `mime_type`: Detection via `python-magic` library
- `file_size`: Length of provided bytes
- `detected_at`: Current timestamp
- `metadata`: Empty dictionary (reserved for future extension)

**Deduplication Logic:**
1. Calculate SHA256 hash of file content
2. Check if `DocumentContent` with this hash already exists
3. If EXISTS: Create only `FileDocument` referencing existing content
4. If NOT EXISTS: Create both `FileDocument` and `DocumentContent` in transaction

**Transaction Flow:**
```
BEGIN TRANSACTION
    IF content_exists(file_hash):
        CREATE FileDocument with content reference
    ELSE:
        CREATE DocumentContent
        CREATE FileDocument with content reference
COMMIT TRANSACTION
```
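
A hedged sketch of this flow against pymongo collections (the repository layer is bypassed here for brevity); the collection names, connection string, and placeholder content value are assumptions, not the actual service code:

```python
# Sketch only: create_document with automatic attribute calculation and content
# deduplication inside a MongoDB transaction.
import hashlib
from datetime import datetime, timezone
from pathlib import Path

import magic  # python-magic
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # transactions require a replica set
db = client["mydocmanager"]

def create_document(file_path: str, file_bytes: bytes, encoding: str = "utf-8") -> dict:
    file_hash = hashlib.sha256(file_bytes).hexdigest()
    file_doc = {
        "filename": Path(file_path).name,
        "filepath": file_path,
        "file_type": Path(file_path).suffix.lstrip(".").lower(),  # extension-based detection
        "mime_type": magic.from_buffer(file_bytes, mime=True),
        "file_size": len(file_bytes),
        "encoding": encoding,
        "detected_at": datetime.now(timezone.utc),
        "file_hash": file_hash,
        "metadata": {},  # reserved for future extension
    }

    with client.start_session() as session:
        with session.start_transaction():
            # Deduplication: create a content record only if this hash is unseen.
            exists = db.document_contents.count_documents(
                {"file_hash": file_hash}, limit=1, session=session
            )
            if not exists:
                # Content is filled in later by the processing pipeline (assumption).
                db.document_contents.insert_one(
                    {"file_hash": file_hash, "content": None, "encoding": encoding},
                    session=session,
                )
            result = db.files.insert_one(file_doc, session=session)

    file_doc["_id"] = result.inserted_id
    return file_doc
```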

#### Available Methods

- `create_document(file_path, file_bytes, encoding)`: Create with deduplication
- `get_document_by_id(document_id)`: Retrieve by document ID
- `get_document_by_hash(file_hash)`: Retrieve by file hash
- `get_document_by_filepath(filepath)`: Retrieve by file path
- `list_documents(skip, limit)`: Paginated document listing
- `count_documents()`: Total document count
- `update_document(document_id, update_data)`: Update document metadata
- `delete_document(document_id)`: Remove document and orphaned content
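
One possible shape for the orphaned-content cleanup in `delete_document`, reusing the `client`/`db` assumptions from the sketch above; this is an illustration, not the repository's actual behavior:

```python
# Sketch only: delete a file document and remove its content if no other file references it.
from bson import ObjectId

def delete_document(document_id: str) -> bool:
    with client.start_session() as session:
        with session.start_transaction():
            doc = db.files.find_one_and_delete({"_id": ObjectId(document_id)}, session=session)
            if doc is None:
                return False
            # Content is orphaned only when no remaining file shares the same hash.
            still_referenced = db.files.count_documents(
                {"file_hash": doc["file_hash"]}, limit=1, session=session
            )
            if not still_referenced:
                db.document_contents.delete_one({"file_hash": doc["file_hash"]}, session=session)
    return True
```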

### Repository Dependencies

The document service coordinates two existing repositories:

#### File Repository (`file_repository.py`)
- `create_document()`, `find_document_by_id()`, `find_document_by_hash()`
- `find_document_by_filepath()`, `find_document_by_name()`
- `list_documents()`, `count_documents()`
- `update_document()`, `delete_document()`

#### Document Content Repository (`document_content_repository.py`)
- `create_document_content()`, `find_document_content_by_id()`
- `find_document_content_by_file_hash()`, `content_exists()`
- `update_document_content()`, `delete_document_content()`
- `list_document_contents()`, `count_document_contents()`

### Dependencies

- `python-magic`: MIME type detection
- `hashlib`: SHA256 hashing (standard library)
- `pymongo`: MongoDB transactions support

## Key Implementation Notes

### Python Standards
@@ -483,21 +405,14 @@ The document service coordinates two existing repositories:

### Next Implementation Steps

1. ✅ Create docker-compose.yml with all services => Done
2. ✅ Define user management and authentication architecture => Done
3. ✅ Implement user models and authentication services =>
    1. models/user.py => Done
    2. models/auth.py => Done
    3. database/repositories/user_repository.py => Done
4. ✅ Add automatic admin user creation if it does not exist => Done
5. **IN PROGRESS**: Implement file processing pipeline =>
1. **IN PROGRESS**: Implement file processing pipeline =>
    1. Create Pydantic models for files and processing_jobs collections
    2. Implement repository layer for file and processing job data access
    3. Create Celery tasks for document processing (.txt, .pdf, .docx)
    4. Implement Watchdog file monitoring with dedicated observer
    5. Integrate file watcher with FastAPI startup
6. Create protected API routes for user management
7. Build React monitoring interface with authentication
2. Create protected API routes for user management
3. Build React monitoring interface with authentication

## Annexes