I can put a new file and create the associated pdf
This commit is contained in:
199
src/file-processor/app/utils/pdf_converter.py
Normal file
199
src/file-processor/app/utils/pdf_converter.py
Normal file
@@ -0,0 +1,199 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import uuid
|
||||
from abc import ABC
|
||||
from pathlib import Path
|
||||
from typing import Self
|
||||
|
||||
import pikepdf
|
||||
import pypandoc
|
||||
from PIL import Image
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
from tasks.common.converter_utils import detect_file_type
|
||||
|
||||
|
||||
class BaseConverter(ABC):
|
||||
"""Abstract base class for file converters to PDF."""
|
||||
|
||||
def __init__(self, input_path: str, output_dir: str = ".") -> None:
|
||||
self.input_path = Path(input_path)
|
||||
self.output_dir = Path(output_dir)
|
||||
self.output_path = self.output_dir / f"{self.generate_uuid_filename()}.pdf"
|
||||
|
||||
def convert(self) -> Self:
|
||||
"""Convert input file to PDF and return the output path."""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def generate_uuid_filename() -> str:
|
||||
"""Generate a unique filename using UUID4."""
|
||||
return str(uuid.uuid4())
|
||||
|
||||
def get_deterministic_date(self) -> str:
|
||||
"""
|
||||
Generate a deterministic date based on file content.
|
||||
This ensures the same file always produces the same PDF.
|
||||
"""
|
||||
# Option 1: Use a fixed date
|
||||
# return "D:20000101000000"
|
||||
|
||||
# Option 2: Generate date from content hash (recommended)
|
||||
with open(self.input_path, 'rb') as f:
|
||||
content = f.read()
|
||||
content_hash = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Use first 14 characters of hash to create a valid date
|
||||
# Format: D:YYYYMMDDHHmmss
|
||||
hash_int = int(content_hash[:14], 16)
|
||||
|
||||
# Create a date between 2000-2099 to keep it reasonable
|
||||
year = 2000 + (hash_int % 100)
|
||||
month = 1 + (hash_int % 12)
|
||||
day = 1 + (hash_int % 28) # Stay safe with 28 days
|
||||
hour = hash_int % 24
|
||||
minute = hash_int % 60
|
||||
second = hash_int % 60
|
||||
|
||||
return f"D:{year:04d}{month:02d}{day:02d}{hour:02d}{minute:02d}{second:02d}"
|
||||
|
||||
def get_file_creation_date(self):
|
||||
# Get file creation time (or modification time)
|
||||
ts = os.path.getctime(self.input_path) # getmtime(self.input_path) for last modification
|
||||
dt = datetime.datetime.fromtimestamp(ts)
|
||||
|
||||
# PDF expects format D:YYYYMMDDHHmmss
|
||||
creation_date = dt.strftime("D:%Y%m%d%H%M%S")
|
||||
return creation_date
|
||||
|
||||
def clean_pdf(self) -> Self:
|
||||
"""Remove all non-deterministic metadata from PDF."""
|
||||
with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
|
||||
# Remove XMP metadata if it exists
|
||||
if hasattr(pdf.Root, 'Metadata'):
|
||||
del pdf.Root.Metadata
|
||||
|
||||
# Clear all document info by deleting each key
|
||||
for key in list(pdf.docinfo.keys()):
|
||||
del pdf.docinfo[key]
|
||||
|
||||
# Set deterministic metadata
|
||||
pdf.docinfo["/Producer"] = "MyConverter"
|
||||
pdf.docinfo["/Creator"] = "MyConverter"
|
||||
pdf.docinfo["/CreationDate"] = self.get_deterministic_date()
|
||||
pdf.docinfo["/ModDate"] = self.get_deterministic_date()
|
||||
pdf.docinfo["/Title"] = self.input_path.name
|
||||
|
||||
# Save with deterministic IDs
|
||||
# compress=True ensures consistent compression
|
||||
# deterministic_id=True (if available) or static_id=True
|
||||
pdf.save(
|
||||
self.output_path,
|
||||
fix_metadata_version=True,
|
||||
compress_streams=True,
|
||||
stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
|
||||
object_stream_mode=pikepdf.ObjectStreamMode.disable,
|
||||
deterministic_id=True # Use this if pikepdf >= 8.0.0, otherwise use static_id=True
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
class TextToPdfConverter(BaseConverter):
    """Converter for text files to PDF."""

    def convert(self) -> Self:
        """Render the input text file onto A4 pages, one source line per row.

        Lines are drawn top to bottom with a fixed margin and line height,
        and a new page is started when the bottom margin is reached. Lines
        wider than the page are not wrapped.

        Returns:
            ``self``, so calls can be chained.
        """
        margin = 50
        line_height = 15

        # invariant=1 asks ReportLab for reproducible output instead of
        # embedding the current time in the document.
        c = canvas.Canvas(str(self.output_path), pagesize=A4, invariant=1)

        # Fix metadata with deterministic values.
        info = c._doc.info
        info.producer = "MyConverter"
        # Content-derived date rather than the file's ctime, which differs
        # between copies of the same file.
        # NOTE(review): ReportLab may take CreationDate from its own
        # timestamp and ignore this attribute - confirm.
        info.creationDate = self.get_deterministic_date()
        info.title = self.input_path.name

        _, height = A4
        top = height - margin

        # errors="replace" keeps one undecodable byte from aborting the
        # whole conversion.
        with open(self.input_path, "r", encoding="utf-8", errors="replace") as f:
            y = top
            for line in f:
                # Strip only trailing whitespace (including the newline) so
                # source indentation survives; tabs are expanded first.
                c.drawString(margin, y, line.expandtabs(4).rstrip())
                y -= line_height
                if y < margin:
                    c.showPage()
                    y = top

        c.save()
        return self
|
||||
|
||||
|
||||
class ImageToPdfConverter(BaseConverter):
    """Converter for image files to PDF."""

    def convert(self) -> Self:
        """Write the input image as a PDF at ``output_path``.

        The image is converted to RGB before saving.

        Returns:
            ``self``, so calls can be chained.
        """
        # The context manager closes the underlying file handle even if the
        # conversion or the save fails.
        with Image.open(self.input_path) as image:
            rgb_image = image.convert("RGB")
            # Name the format explicitly instead of relying on the ".pdf"
            # suffix of the output path.
            rgb_image.save(self.output_path, format="PDF")
        return self
|
||||
|
||||
|
||||
class WordToPdfConverter(BaseConverter):
    """Turn a Word document (.docx) into a PDF by delegating to pypandoc."""

    def convert(self) -> Self:
        """Ask pypandoc to write the PDF at ``output_path``; return ``self``."""
        source = str(self.input_path)
        target = str(self.output_path)
        pypandoc.convert_file(source, "pdf", outputfile=target)
        return self
|
||||
|
||||
|
||||
# Placeholders for future extensions
|
||||
class HtmlToPdfConverter(BaseConverter):
    """Stub for a future HTML to PDF converter."""

    def convert(self) -> Self:
        """Always raise: HTML conversion has not been written yet."""
        message = "HTML to PDF conversion not implemented."
        raise NotImplementedError(message)
|
||||
|
||||
|
||||
class ExcelToPdfConverter(BaseConverter):
    """Stub for a future Excel to PDF converter."""

    def convert(self) -> Self:
        """Always raise: Excel conversion has not been written yet."""
        message = "Excel to PDF conversion not implemented."
        raise NotImplementedError(message)
|
||||
|
||||
|
||||
class MarkdownToPdfConverter(BaseConverter):
    """Stub for a future Markdown to PDF converter."""

    def convert(self) -> Self:
        """Always raise: Markdown conversion has not been written yet."""
        message = "Markdown to PDF conversion not implemented."
        raise NotImplementedError(message)
|
||||
|
||||
|
||||
def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    """
    Convert any supported file to PDF.

    The file type reported by ``detect_file_type`` selects the converter.
    The converter writes the PDF and then its metadata is rewritten by
    ``clean_pdf``.

    Args:
        filepath (str): Path to the input file.
        output_dir (str): Directory to save the output PDF.

    Returns:
        str: Path to the generated PDF.

    Raises:
        ValueError: If the detected file type has no converter.
    """
    # Maps the detected type label to the converter class that handles it.
    converters = {
        "text": TextToPdfConverter,
        "image": ImageToPdfConverter,
        "word": WordToPdfConverter,
    }

    file_type = detect_file_type(filepath)
    converter_cls = converters.get(file_type)
    if converter_cls is None:
        raise ValueError(f"Unsupported file type: {file_type}")

    converter = converter_cls(filepath, output_dir=output_dir)
    converter.convert()
    converter.clean_pdf()
    return str(converter.output_path)
|
||||
Reference in New Issue
Block a user