Working on pdf creation

This commit is contained in:
2025-10-04 18:38:55 +02:00
parent 62c7e46a88
commit bd52f2d296
5 changed files with 128 additions and 37 deletions

View File

@@ -1,7 +1,10 @@
import datetime
import hashlib
import os
from abc import ABC, abstractmethod
import uuid
from abc import ABC
from pathlib import Path
from typing import Self
import pikepdf
import pypandoc
@@ -9,7 +12,7 @@ from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from tasks.common.converter_utils import generate_uuid_filename, detect_file_type
from tasks.common.converter_utils import detect_file_type
class BaseConverter(ABC):
@@ -18,13 +21,44 @@ class BaseConverter(ABC):
def __init__(self, input_path: str, output_dir: str = ".") -> None:
self.input_path = Path(input_path)
self.output_dir = Path(output_dir)
self.output_path = self.output_dir / f"{generate_uuid_filename()}.pdf"
self.output_path = self.output_dir / f"{self.generate_uuid_filename()}.pdf"
@abstractmethod
def convert(self) -> str:
def convert(self) -> Self:
"""Convert input file to PDF and return the output path."""
pass
@staticmethod
def generate_uuid_filename() -> str:
"""Generate a unique filename using UUID4."""
return str(uuid.uuid4())
def get_deterministic_date(self) -> str:
    """Derive a reproducible PDF date string from the input file's content.

    The SHA-256 digest of the file at ``self.input_path`` is computed and the
    first 14 hex digits of that digest are folded into a date, so the same
    file content always produces the same string.

    Returns:
        A string of the form ``D:YYYYMMDDHHmmss`` whose year lies in
        2000-2099, month in 01-12 and day in 01-28.

    Raises:
        OSError: If ``self.input_path`` cannot be opened or read.
    """
    # Feed the file to the hash in fixed-size chunks so that large inputs
    # are never held in memory all at once; the digest is identical to
    # hashing the whole content in one call.
    digest = hashlib.sha256()
    with open(self.input_path, "rb") as f:
        while chunk := f.read(65536):
            digest.update(chunk)
    content_hash = digest.hexdigest()

    # Only the leading 14 hex digits (56 bits) drive the date.
    hash_int = int(content_hash[:14], 16)

    # NOTE: every field is a remainder of the same integer, so the fields
    # are correlated (minute and second are always equal). The mapping is
    # kept as-is so that dates stay stable for a given input.
    year = 2000 + (hash_int % 100)  # keep the year within 2000-2099
    month = 1 + (hash_int % 12)
    day = 1 + (hash_int % 28)  # 28 is valid in every month
    hour = hash_int % 24
    minute = hash_int % 60
    second = hash_int % 60

    return f"D:{year:04d}{month:02d}{day:02d}{hour:02d}{minute:02d}{second:02d}"
def get_file_creation_date(self):
# Get file creation time (or modification time)
ts = os.path.getctime(self.input_path) # getmtime(self.input_path) for last modification
@@ -34,22 +68,43 @@ class BaseConverter(ABC):
creation_date = dt.strftime("D:%Y%m%d%H%M%S")
return creation_date
def clean_pdf(self):
with pikepdf.open(self.output_path) as pdf:
pdf.Root.Metadata = None
def clean_pdf(self) -> Self:
"""Remove all non-deterministic metadata from PDF."""
with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
# Remove XMP metadata if it exists
if hasattr(pdf.Root, 'Metadata'):
del pdf.Root.Metadata
# Clear all document info by deleting each key
for key in list(pdf.docinfo.keys()):
del pdf.docinfo[key]
pdf.docinfo.clear()
# Set deterministic metadata
pdf.docinfo["/Producer"] = "MyConverter"
pdf.docinfo["/CreationDate"] = self.get_file_creation_date()
pdf.docinfo["/Title"] = os.path.basename(self.input_path)
pdf.docinfo["/Creator"] = "MyConverter"
pdf.docinfo["/CreationDate"] = self.get_deterministic_date()
pdf.docinfo["/ModDate"] = self.get_deterministic_date()
pdf.docinfo["/Title"] = self.input_path.name
pdf.save(self.output_path, fix_metadata_version=True, static_id=True)
# Save with deterministic IDs
# compress_streams=True ensures consistent compression
# deterministic_id=True (if available) or static_id=True
pdf.save(
self.output_path,
fix_metadata_version=True,
compress_streams=True,
stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
object_stream_mode=pikepdf.ObjectStreamMode.disable,
deterministic_id=True # Use this if pikepdf >= 8.0.0, otherwise use static_id=True
)
return self
class TextToPdfConverter(BaseConverter):
"""Converter for text files to PDF."""
def convert(self) -> str:
def convert(self) -> Self:
c = canvas.Canvas(str(self.output_path), pagesize=A4)
# Fix metadata with deterministic values
@@ -69,48 +124,48 @@ class TextToPdfConverter(BaseConverter):
y = height - 50
c.save()
return str(self.output_path)
return self
class ImageToPdfConverter(BaseConverter):
"""Converter for image files to PDF."""
def convert(self) -> str:
def convert(self) -> Self:
image = Image.open(self.input_path)
rgb_image = image.convert("RGB")
rgb_image.save(self.output_path)
return str(self.output_path)
return self
class WordToPdfConverter(BaseConverter):
"""Converter for Word files (.docx) to PDF using pypandoc."""
def convert(self) -> str:
def convert(self) -> Self:
pypandoc.convert_file(
str(self.input_path), "pdf", outputfile=str(self.output_path)
)
return str(self.output_path)
return self
# Placeholders for future extensions
class HtmlToPdfConverter(BaseConverter):
"""Placeholder for HTML to PDF converter."""
def convert(self) -> str:
def convert(self) -> Self:
raise NotImplementedError("HTML to PDF conversion not implemented.")
class ExcelToPdfConverter(BaseConverter):
"""Placeholder for Excel to PDF converter."""
def convert(self) -> str:
def convert(self) -> Self:
raise NotImplementedError("Excel to PDF conversion not implemented.")
class MarkdownToPdfConverter(BaseConverter):
"""Placeholder for Markdown to PDF converter."""
def convert(self) -> str:
def convert(self) -> Self:
raise NotImplementedError("Markdown to PDF conversion not implemented.")
@@ -139,4 +194,6 @@ def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
else:
raise ValueError(f"Unsupported file type: {file_type}")
return converter.convert()
converter.convert()
converter.clean_pdf()
return str(converter.output_path)