Working on pdf creation

2025-09-30 22:58:51 +02:00
parent 06549c0d02
commit 62c7e46a88
10 changed files with 156 additions and 7 deletions
--- a/src/worker/tasks/common/pdf_converter.py
+++ b/src/worker/tasks/common/pdf_converter.py
@@ -1,12 +1,15 @@
+import datetime
+import os
 from abc import ABC, abstractmethod
 from pathlib import Path

+import pikepdf
 import pypandoc
 from PIL import Image
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas

-from tasks.common.converter_utils import generate_uuid_filename
+from tasks.common.converter_utils import generate_uuid_filename, detect_file_type


 class BaseConverter(ABC):
@@ -21,6 +24,26 @@ class BaseConverter(ABC):
  def convert(self) -> str:
    """Convert input file to PDF and return the output path."""
    pass
+  
+  def get_file_creation_date(self):
+    """Return the input file's ctime formatted as a PDF date string.
+
+    The timestamp is read with ``os.path.getctime``. Per the Python docs this
+    is the creation time on Windows but the last metadata change on Unix;
+    ``os.path.getmtime`` would give the last modification time instead.
+
+    Returns:
+        str: Local-time timestamp in the form ``D:YYYYMMDDHHmmss``.
+    """
+    ctime_seconds = os.path.getctime(self.input_path)
+    # PDF date strings use the D:YYYYMMDDHHmmss layout.
+    return datetime.datetime.fromtimestamp(ctime_seconds).strftime("D:%Y%m%d%H%M%S")
+  
+  def clean_pdf(self):
+    """Strip existing metadata from the output PDF and write fixed values.
+
+    Nulls the catalog's ``/Metadata`` entry, empties the document info
+    dictionary, then sets ``/Producer``, ``/CreationDate`` (taken from
+    ``get_file_creation_date``) and ``/Title`` (the input file's base name)
+    before saving the result back over ``self.output_path``.
+    """
+    # pikepdf refuses to save onto the file it currently has open unless that
+    # file was opened with allow_overwriting_input=True.
+    with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
+      pdf.Root.Metadata = None
+
+      pdf.docinfo.clear()
+      pdf.docinfo["/Producer"] = "MyConverter"
+      pdf.docinfo["/CreationDate"] = self.get_file_creation_date()
+      pdf.docinfo["/Title"] = os.path.basename(self.input_path)
+
+      # static_id=True requests a fixed document ID instead of a generated one.
+      pdf.save(self.output_path, fix_metadata_version=True, static_id=True)


 class TextToPdfConverter(BaseConverter):
@@ -28,6 +51,13 @@ class TextToPdfConverter(BaseConverter):
  
  def convert(self) -> str:
    c = canvas.Canvas(str(self.output_path), pagesize=A4)
+    
+    # Fix metadata with deterministic values
+    info = c._doc.info
+    info.producer = "MyConverter"
+    info.creationDate = self.get_file_creation_date()
+    info.title = os.path.basename(self.input_path)
+    
    width, height = A4
    with open(self.input_path, "r", encoding="utf-8") as f:
      y = height - 50
@@ -37,6 +67,7 @@ class TextToPdfConverter(BaseConverter):
        if y < 50:
          c.showPage()
          y = height - 50
+    
    c.save()
    return str(self.output_path)

@@ -81,3 +112,31 @@ class MarkdownToPdfConverter(BaseConverter):
  
  def convert(self) -> str:
    raise NotImplementedError("Markdown to PDF conversion not implemented.")
+
+
+def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
+  """Convert any supported file to PDF.
+
+  The file type reported by ``detect_file_type`` selects the converter class;
+  that converter is built with the given paths and run.
+
+  Args:
+      filepath (str): Path to the input file.
+      output_dir (str): Directory to save the output PDF.
+
+  Returns:
+      str: Path to the generated PDF.
+
+  Raises:
+      ValueError: If the detected file type is not "text", "image" or "word".
+  """
+  # Maps each detected file type to the converter class that handles it.
+  converters = {
+    "text": TextToPdfConverter,
+    "image": ImageToPdfConverter,
+    "word": WordToPdfConverter,
+  }
+  file_type = detect_file_type(filepath)
+  converter_cls = converters.get(file_type)
+  if converter_cls is None:
+    raise ValueError(f"Unsupported file type: {file_type}")
+
+  return converter_cls(filepath, output_dir=output_dir).convert()