First commit. Docker Compose is working

2025-09-15 23:21:09 +02:00
commit 10650420ef
17 changed files with 858 additions and 0 deletions

216
.gitignore vendored Normal file

@@ -0,0 +1,216 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml

255
Readme.md Normal file

@@ -0,0 +1,255 @@
# MyDocManager
## Overview
MyDocManager is a real-time document processing application that automatically detects files in a monitored directory, processes them asynchronously, and stores the results in a database. The application uses a modern microservices architecture with Redis for task queuing and MongoDB for data persistence.
## Architecture
### Technology Stack
- **Backend API**: FastAPI (Python 3.12)
- **Task Processing**: Celery with Redis broker
- **Document Processing**: EasyOCR, PyMuPDF, python-docx, pdfplumber
- **Database**: MongoDB
- **Frontend**: React
- **Containerization**: Docker & Docker Compose
- **File Monitoring**: Python watchdog library
### Services Architecture
```
┌─────────────────┐      ┌─────────────┐      ┌─────────────┐      ┌─────────────┐      ┌─────────────┐
│    Frontend     │      │    file-    │      │    Redis    │      │   Worker    │      │   MongoDB   │
│     (React)     │ ◄──► │  processor  │ ───► │  (Broker)   │ ◄──► │  (Celery)   │ ───► │  (Results)  │
│                 │      │  (FastAPI + │      │             │      │             │      │             │
│                 │      │  watchdog)  │      │             │      │             │      │             │
└─────────────────┘      └─────────────┘      └─────────────┘      └─────────────┘      └─────────────┘
```
### Docker Services
1. **file-processor**: FastAPI + real-time file monitoring + Celery task dispatch
2. **worker**: Celery workers for document processing (OCR, text extraction)
3. **redis**: Message broker for Celery tasks
4. **mongodb**: Final database for processing results
5. **frontend**: React interface for monitoring and file access
## Data Flow
1. **File Detection**: Watchdog monitors target directory in real-time
2. **Task Creation**: FastAPI creates Celery task for each detected file (sketched after this list)
3. **Task Processing**: Worker processes document (OCR, text extraction)
4. **Result Storage**: Processed data stored in MongoDB
5. **Monitoring**: React frontend displays processing status and results
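Steps 1 and 2 can be sketched as follows. This is a minimal illustration rather than code from this commit: the handler class name and the `/watched_files` path are assumptions, while the task name mirrors the worker's `main.process_document_task`.
```python
# Minimal sketch: watchdog detects a new file and a Celery task is
# dispatched for it. NewFileHandler is an illustrative name.
from celery import Celery
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

celery_app = Celery("file_processor", broker="redis://redis:6379/0")


class NewFileHandler(FileSystemEventHandler):
    def on_created(self, event):
        # Skip directory events; dispatch one task per created file
        if not event.is_directory:
            celery_app.send_task("main.process_document_task",
                                 args=[event.src_path])


observer = Observer()
observer.schedule(NewFileHandler(), "/watched_files", recursive=True)
observer.start()  # Watchdog runs in a background thread
```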
## Document Processing Capabilities
### Supported File Types
- **PDF**: Direct text extraction + OCR for scanned documents
- **Word Documents**: .docx text extraction
- **Images**: OCR text recognition (JPG, PNG, etc.)
### Processing Libraries
- **EasyOCR**: Modern OCR engine (80+ languages, deep learning-based)
- **PyMuPDF**: PDF text extraction and manipulation
- **python-docx**: Word document processing
- **pdfplumber**: Advanced PDF text extraction (see the routing sketch after this list)
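As a rough illustration of how these libraries could divide the work, a worker might route on file extension. The `extract_text` function below is a hypothetical sketch, not code from this commit (pdfplumber is omitted for brevity):
```python
# Hypothetical sketch: route text extraction by file extension.
from pathlib import Path

import easyocr
import fitz  # PyMuPDF
from docx import Document

reader = easyocr.Reader(["en"])  # Load the OCR model once per worker


def extract_text(file_path: str) -> str:
    suffix = Path(file_path).suffix.lower()
    if suffix == ".pdf":
        # Text-layer extraction; scanned PDFs would need an OCR fallback
        with fitz.open(file_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    if suffix == ".docx":
        return "\n".join(p.text for p in Document(file_path).paragraphs)
    if suffix in {".jpg", ".jpeg", ".png"}:
        # detail=0 makes readtext return plain strings
        return "\n".join(reader.readtext(file_path, detail=0))
    raise ValueError(f"Unsupported file type: {suffix}")
```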
## Development Environment
### Container-Based Development
The application is designed for container-based development with hot-reload capabilities:
- Source code mounted as volumes for real-time updates
- All services orchestrated via Docker Compose
- Development and production parity
### Key Features
- **Real-time Processing**: Immediate file detection and processing
- **Horizontal Scaling**: Multiple workers can be added easily
- **Fault Tolerance**: Celery provides automatic retry mechanisms
- **Monitoring**: Built-in task status tracking (see the status-check sketch after this list)
- **Hot Reload**: Development changes reflected instantly in containers
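Task status tracking builds on Celery's Redis result backend. A minimal sketch of polling a dispatched task by id, assuming the broker/backend URLs from docker-compose.yml and a task id returned by `/test-task`:
```python
# Minimal sketch: poll a dispatched task through the result backend.
from celery import Celery
from celery.result import AsyncResult

celery_app = Celery("file_processor",
                    broker="redis://localhost:6379/0",
                    backend="redis://localhost:6379/0")

result = AsyncResult("<task-id-from-/test-task>", app=celery_app)
print(result.state)  # PENDING, PROGRESS, SUCCESS, FAILURE, ...
print(result.info)   # Progress metadata or the final return value
```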
## Project Structure (To be implemented)
```
MyDocManager/
├── docker-compose.yml
├── src/
│   ├── file-processor/
│   │   ├── Dockerfile
│   │   ├── requirements.txt
│   │   ├── app/
│   │   │   ├── main.py
│   │   │   ├── file_watcher.py
│   │   │   ├── celery_app.py
│   │   │   └── api/
│   ├── worker/
│   │   ├── Dockerfile
│   │   ├── requirements.txt
│   │   └── tasks/
│   └── frontend/
│       ├── Dockerfile
│       ├── package.json
│       └── src/
├── tests/
│   ├── file-processor/
│   └── worker/
├── volumes/
│   └── watched_files/
└── README.md
```
## Docker Commands Reference
### Initial Setup & Build
```bash
# Build and start all services (first time)
docker-compose up --build
# Build and start in background
docker-compose up --build -d
# Build specific service
docker-compose build file-processor
docker-compose build worker
```
### Development Workflow
```bash
# Start all services
docker-compose up
# Start in background (detached mode)
docker-compose up -d
# Stop all services
docker-compose down
# Stop and remove volumes (⚠️ deletes MongoDB data)
docker-compose down -v
# Restart specific service
docker-compose restart file-processor
docker-compose restart worker
docker-compose restart redis
docker-compose restart mongodb
```
### Monitoring & Debugging
```bash
# View logs of all services
docker-compose logs
# View logs of specific service
docker-compose logs file-processor
docker-compose logs worker
docker-compose logs redis
docker-compose logs mongodb
# Follow logs in real-time
docker-compose logs -f
docker-compose logs -f worker
# View running containers
docker-compose ps
# Execute command in running container
docker-compose exec file-processor bash
docker-compose exec worker bash
docker-compose exec mongodb mongosh
```
### Service Management
```bash
# Start only specific services
docker-compose up redis mongodb file-processor
# Stop specific service
docker-compose stop worker
docker-compose stop file-processor
# Remove stopped containers
docker-compose rm
# Scale workers (multiple instances)
# Note: remove the fixed container_name from the worker service first,
# since scaled replicas cannot share one container name
docker-compose up --scale worker=3
```
### Hot-Reload Configuration
- **file-processor**: Hot-reload enabled via `--reload` flag
- Code changes in `src/file-processor/app/` automatically restart FastAPI
- **worker**: No hot-reload (manual restart required for stability)
- Code changes in `src/worker/tasks/` require: `docker-compose restart worker`
### Useful Service URLs
- **FastAPI API**: http://localhost:8000
- **FastAPI Docs**: http://localhost:8000/docs
- **Health Check**: http://localhost:8000/health
- **Redis**: localhost:6379
- **MongoDB**: localhost:27017
### Testing Commands
```bash
# Test FastAPI health
curl http://localhost:8000/health
# Test Celery task dispatch
curl -X POST http://localhost:8000/test-task \
-H "Content-Type: application/json" \
-d '{"message": "Hello from test!"}'
# Monitor Celery tasks
docker-compose logs -f worker
```
## Key Implementation Notes
### Python Standards
- **Style**: PEP 8 compliance
- **Documentation**: Google/NumPy docstring format
- **Naming**: snake_case for variables and functions
- **Testing**: pytest with test_i_can_xxx / test_i_cannot_xxx patterns
### Dependencies Management
- **Package Manager**: pip (standard)
- **External Dependencies**: Listed in each service's requirements.txt
- **Standard Library First**: Prefer standard library when possible
### Testing Strategy
- All code must be testable
- Unit tests for each processing function
- Integration tests for file processing workflow
- Tests validated before implementation (naming convention sketched below)
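A minimal sketch of the test_i_can_xxx / test_i_cannot_xxx convention; the `validate_extension` helper is hypothetical and defined inline so the example runs under pytest:
```python
# Sketch of the test_i_can_xxx / test_i_cannot_xxx convention.
# validate_extension is a hypothetical helper, not part of this commit.
from pathlib import Path

import pytest

SUPPORTED = {".pdf", ".docx", ".jpg", ".png"}


def validate_extension(filename: str) -> str:
    """Return the normalized file suffix or raise for unsupported types."""
    suffix = Path(filename).suffix.lower()
    if suffix not in SUPPORTED:
        raise ValueError(f"Unsupported file type: {filename}")
    return suffix


def test_i_can_validate_a_supported_extension():
    assert validate_extension("report.PDF") == ".pdf"


def test_i_cannot_validate_an_unsupported_extension():
    with pytest.raises(ValueError):
        validate_extension("archive.zip")
```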
### Critical Architecture Decisions Made
1. **Option Selected**: Single FastAPI service handles both API and file watching
2. **Celery with Redis**: Chosen over other async patterns for scalability
3. **EasyOCR Preferred**: Selected over Tesseract for modern OCR needs
4. **Container Development**: Hot-reload setup required for development workflow
### Development Process Requirements
1. **Collaborative Validation**: All options must be explained before coding
2. **Test-First Approach**: Test cases defined and validated before implementation
3. **Incremental Development**: Start simple, extend functionality progressively
4. **Error Handling**: Clear problem explanation required before proposing fixes
### Next Implementation Steps
1. Create docker-compose.yml with all services
2. Implement basic FastAPI service structure
3. Add watchdog file monitoring
4. Create Celery task structure
5. Implement document processing tasks
6. Build React monitoring interface
"""

73
docker-compose.yml Normal file

@@ -0,0 +1,73 @@
version: '3.8'

services:
  # Redis - Message broker for Celery
  redis:
    image: redis:8-alpine
    container_name: mydocmanager-redis
    ports:
      - "6379:6379"
    networks:
      - mydocmanager-network

  # MongoDB - Final database for results
  mongodb:
    image: mongo:7
    container_name: mydocmanager-mongodb
    ports:
      - "27017:27017"
    environment:
      MONGO_INITDB_ROOT_USERNAME: admin
      MONGO_INITDB_ROOT_PASSWORD: password123
      MONGO_INITDB_DATABASE: mydocmanager
    volumes:
      - mongodb-data:/data/db
    networks:
      - mydocmanager-network

  # File Processor - FastAPI + file monitoring + Celery task dispatch
  file-processor:
    build:
      context: ./src/file-processor
      dockerfile: Dockerfile
    container_name: mydocmanager-file-processor
    ports:
      - "8000:8000"
    environment:
      - REDIS_URL=redis://redis:6379/0
      - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
    volumes:
      - ./src/file-processor/app:/app
      - ./volumes/watched_files:/watched_files
    depends_on:
      - redis
      - mongodb
    networks:
      - mydocmanager-network
    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload

  # Worker - Celery workers for document processing
  worker:
    build:
      context: ./src/worker
      dockerfile: Dockerfile
    container_name: mydocmanager-worker
    environment:
      - REDIS_URL=redis://redis:6379/0
      - MONGODB_URL=mongodb://admin:password123@mongodb:27017/mydocmanager?authSource=admin
    volumes:
      - ./src/worker/tasks:/app
      - ./volumes/watched_files:/watched_files
    depends_on:
      - redis
      - mongodb
    networks:
      - mydocmanager-network
    command: celery -A main worker --loglevel=info

volumes:
  mongodb-data:

networks:
  mydocmanager-network:
    driver: bridge

7
main.py Normal file

@@ -0,0 +1,7 @@
def main():
    print("Hello world!")


if __name__ == "__main__":
    main()

33
requirements.txt Normal file

@@ -0,0 +1,33 @@
amqp==5.3.1
annotated-types==0.7.0
anyio==4.10.0
billiard==4.2.1
celery==5.5.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
fastapi==0.116.1
h11==0.16.0
httptools==0.6.4
idna==3.10
kombu==5.5.4
packaging==25.0
prompt_toolkit==3.0.52
pydantic==2.11.9
pydantic_core==2.33.2
python-dateutil==2.9.0.post0
python-dotenv==1.1.1
PyYAML==6.0.2
six==1.17.0
sniffio==1.3.1
starlette==0.47.3
typing-inspection==0.4.1
typing_extensions==4.15.0
tzdata==2025.2
uvicorn==0.35.0
uvloop==0.21.0
vine==5.1.0
watchfiles==1.1.0
wcwidth==0.2.13
websockets==15.0.1

0
src/__init__.py Normal file

17
src/file-processor/Dockerfile Normal file
@@ -0,0 +1,17 @@
FROM python:3.12-slim
# Set working directory
WORKDIR /app
# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ .
# Expose port
EXPOSE 8000
# Command will be overridden by docker-compose
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

0
src/file-processor/__init__.py Normal file

0
src/file-processor/app/__init__.py Normal file

120
src/file-processor/app/main.py Normal file
@@ -0,0 +1,120 @@
"""
FastAPI application for MyDocManager file processor service.
This service provides API endpoints for health checks and task dispatching.
"""
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import redis
from celery import Celery
# Initialize FastAPI app
app = FastAPI(
title="MyDocManager File Processor",
description="File processing and task dispatch service",
version="1.0.0"
)
# Environment variables
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
# Initialize Redis client
try:
redis_client = redis.from_url(REDIS_URL)
except Exception as e:
redis_client = None
print(f"Warning: Could not connect to Redis: {e}")
# Initialize Celery
celery_app = Celery(
"file_processor",
broker=REDIS_URL,
backend=REDIS_URL
)
# Pydantic models
class TestTaskRequest(BaseModel):
"""Request model for test task."""
message: str
@app.get("/health")
async def health_check():
"""
Health check endpoint.
Returns:
dict: Service health status with dependencies
"""
health_status = {
"status": "healthy",
"service": "file-processor",
"dependencies": {
"redis": "unknown",
"mongodb": "unknown"
},
}
# Check Redis connection
if redis_client:
try:
redis_client.ping()
health_status["dependencies"]["redis"] = "connected"
except Exception:
health_status["dependencies"]["redis"] = "disconnected"
health_status["status"] = "degraded"
return health_status
@app.post("/test-task")
async def dispatch_test_task(request: TestTaskRequest):
"""
Dispatch a test task to Celery worker.
Args:
request: Test task request containing message
Returns:
dict: Task dispatch information
Raises:
HTTPException: If task dispatch fails
"""
try:
# Send task to worker
task = celery_app.send_task(
"main.test_task",
args=[request.message]
)
return {
"status": "dispatched",
"task_id": task.id,
"message": f"Test task dispatched with message: {request.message}"
}
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to dispatch task: {str(e)}"
)
@app.get("/")
async def root():
"""
Root endpoint.
Returns:
dict: Basic service information
"""
return {
"service": "MyDocManager File Processor",
"version": "1.0.0",
"status": "running"
}

6
src/file-processor/requirements.txt Normal file

@@ -0,0 +1,6 @@
fastapi==0.116.1
uvicorn==0.35.0
celery==5.5.3
redis==6.4.0
pymongo==4.15.0
pydantic==2.11.9

14
src/worker/Dockerfile Normal file

@@ -0,0 +1,14 @@
FROM python:3.12-slim
# Set working directory
WORKDIR /app
# Copy requirements and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY tasks/ .
# Command will be overridden by docker-compose
CMD ["celery", "-A", "main", "worker", "--loglevel=info"]

0
src/worker/__init__.py Normal file

4
src/worker/requirements.txt Normal file

@@ -0,0 +1,4 @@
celery==5.5.3
redis==6.4.0
pymongo==4.15.0

0
src/worker/tasks/__init__.py Normal file

113
src/worker/tasks/main.py Normal file

@@ -0,0 +1,113 @@
"""
Celery worker for MyDocManager document processing tasks.
This module contains all Celery tasks for processing documents.
"""
import os
import time
from celery import Celery
# Environment variables
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
# Initialize Celery app
app = Celery(
"mydocmanager_worker",
broker=REDIS_URL,
backend=REDIS_URL
)
# Celery configuration
app.conf.update(
task_serializer="json",
accept_content=["json"],
result_serializer="json",
timezone="UTC",
enable_utc=True,
task_track_started=True,
task_time_limit=300, # 5 minutes
task_soft_time_limit=240, # 4 minutes
)
@app.task(bind=True)
def test_task(self, message: str):
"""
Test task for validating worker functionality.
Args:
message: Test message to process
Returns:
dict: Task result with processing information
"""
try:
print(f"[WORKER] Starting test task with message: {message}")
# Simulate some work
for i in range(5):
print(f"[WORKER] Processing step {i + 1}/5...")
time.sleep(1)
# Update task progress
self.update_state(
state="PROGRESS",
meta={
"current": i + 1,
"total": 5,
"message": f"Processing step {i + 1}"
}
)
result = {
"status": "completed",
"message": f"Successfully processed: {message}",
"processed_at": time.time(),
"worker_id": self.request.id
}
print(f"[WORKER] Test task completed successfully: {result}")
return result
except Exception as exc:
print(f"[WORKER] Test task failed: {str(exc)}")
raise self.retry(exc=exc, countdown=60, max_retries=3)
@app.task(bind=True)
def process_document_task(self, file_path: str):
"""
Placeholder task for document processing.
Args:
file_path: Path to the document to process
Returns:
dict: Processing result
"""
try:
print(f"[WORKER] Starting document processing for: {file_path}")
# Placeholder for document processing logic
time.sleep(2) # Simulate processing time
result = {
"status": "completed",
"file_path": file_path,
"processed_at": time.time(),
"content": f"Placeholder content for {file_path}",
"worker_id": self.request.id
}
print(f"[WORKER] Document processing completed: {file_path}")
return result
except Exception as exc:
print(f"[WORKER] Document processing failed for {file_path}: {str(exc)}")
raise self.retry(exc=exc, countdown=60, max_retries=3)
if __name__ == "__main__":
app.start()

0
tests/__init__.py Normal file