Working on PDF creation

This commit is contained in:
2025-09-30 22:58:51 +02:00
parent 06549c0d02
commit 62c7e46a88
10 changed files with 156 additions and 7 deletions

View File

@@ -1,12 +1,15 @@
import datetime
import os
from abc import ABC, abstractmethod
from pathlib import Path
import pikepdf
import pypandoc
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from tasks.common.converter_utils import generate_uuid_filename
from tasks.common.converter_utils import generate_uuid_filename, detect_file_type
class BaseConverter(ABC):
@@ -21,6 +24,26 @@ class BaseConverter(ABC):
def convert(self) -> str:
    """Convert the input file to PDF and return the output path.

    The base implementation does nothing and yields ``None``; the concrete
    converter classes in this file provide their own ``convert``.

    Returns:
        str: Path of the generated PDF (in implementing subclasses).
    """
def get_file_creation_date(self):
    """Return the input file's ctime formatted as a PDF date string.

    The timestamp is the file's ``st_ctime``. Per the Python docs this is
    the creation time on Windows but the last metadata change on Unix, so
    the name is only accurate on some platforms (``st_mtime`` would give the
    last modification instead).

    Returns:
        str: Date in the form ``D:YYYYMMDDHHmmss``, expressed in the local
        time zone of the running process, with no UTC offset suffix.
    """
    stamp = os.stat(self.input_path).st_ctime
    # PDF date strings start with the literal "D:" followed by the digits.
    return datetime.datetime.fromtimestamp(stamp).strftime("D:%Y%m%d%H%M%S")
def clean_pdf(self):
    """Replace the generated PDF's metadata with fixed, reproducible values.

    Opens the file at ``self.output_path``, drops the XMP metadata stream
    reference and all document-info entries, then writes back only a
    producer, a creation date derived from the input file, and a title set
    to the input file's base name. The file is rewritten in place.
    """
    # The PDF is saved onto the very path it was opened from. pikepdf
    # refuses to do that unless the file was opened with
    # allow_overwriting_input=True (which loads the input into memory).
    with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
        pdf.Root.Metadata = None
        pdf.docinfo.clear()
        pdf.docinfo["/Producer"] = "MyConverter"
        pdf.docinfo["/CreationDate"] = self.get_file_creation_date()
        pdf.docinfo["/Title"] = os.path.basename(self.input_path)
        # static_id requests a fixed document ID so repeated runs on the
        # same input do not differ in the trailer.
        pdf.save(self.output_path, fix_metadata_version=True, static_id=True)
class TextToPdfConverter(BaseConverter):
@@ -28,6 +51,13 @@ class TextToPdfConverter(BaseConverter):
def convert(self) -> str:
c = canvas.Canvas(str(self.output_path), pagesize=A4)
# Fix metadata with deterministic values
info = c._doc.info
info.producer = "MyConverter"
info.creationDate = self.get_file_creation_date()
info.title = os.path.basename(self.input_path)
width, height = A4
with open(self.input_path, "r", encoding="utf-8") as f:
y = height - 50
@@ -37,6 +67,7 @@ class TextToPdfConverter(BaseConverter):
if y < 50:
c.showPage()
y = height - 50
c.save()
return str(self.output_path)
@@ -81,3 +112,31 @@ class MarkdownToPdfConverter(BaseConverter):
def convert(self) -> str:
    """Placeholder for Markdown conversion, which is not available yet.

    Raises:
        NotImplementedError: Always.
    """
    message = "Markdown to PDF conversion not implemented."
    raise NotImplementedError(message)
def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    """Convert a supported file to PDF by dispatching on its detected type.

    The type label comes from ``detect_file_type(filepath)``; the labels
    ``"text"``, ``"image"`` and ``"word"`` each select a converter class,
    whose ``convert()`` result is returned.

    Args:
        filepath (str): Path to the input file.
        output_dir (str): Directory to save the output PDF.

    Returns:
        str: Path to the generated PDF.

    Raises:
        ValueError: If the detected type is none of the labels handled here.
            NOTE(review): ``detect_file_type`` may raise its own error for
            unknown files (possibly ``UnsupportedFileTypeError``) - confirm
            against its implementation.
    """
    kind = detect_file_type(filepath)

    # One guard per supported label; anything else falls through to the error.
    if kind == "text":
        return TextToPdfConverter(filepath, output_dir=output_dir).convert()
    if kind == "image":
        return ImageToPdfConverter(filepath, output_dir=output_dir).convert()
    if kind == "word":
        return WordToPdfConverter(filepath, output_dir=output_dir).convert()

    raise ValueError(f"Unsupported file type: {kind}")