I can put a new file and create the associated pdf

2025-10-05 23:54:59 +02:00
parent bd52f2d296
commit 8ae9754fde
14 changed files with 376 additions and 45 deletions
--- a/src/file-processor/app/utils/pdf_converter.py
+++ b/src/file-processor/app/utils/pdf_converter.py
@@ -0,0 +1,199 @@
+import datetime
+import hashlib
+import os
+import uuid
+from abc import ABC
+from pathlib import Path
+from typing import Self
+
+import pikepdf
+import pypandoc
+from PIL import Image
+from reportlab.lib.pagesizes import A4
+from reportlab.pdfgen import canvas
+
+from tasks.common.converter_utils import detect_file_type
+
+
+class BaseConverter(ABC):
+  """Base class for converters that turn an input file into a PDF.
+
+  Subclasses override ``convert`` to write the PDF to ``self.output_path``;
+  ``clean_pdf`` then rewrites that file with fixed metadata.
+
+  Instance attributes (all ``pathlib.Path``):
+    input_path: the file to convert.
+    output_dir: the directory that receives the generated PDF.
+    output_path: ``output_dir`` joined with a random UUID4 name plus ``.pdf``.
+  """
+
+  def __init__(self, input_path: str, output_dir: str = ".") -> None:
+    """Store the input/output locations and pick a unique output file name.
+
+    Args:
+      input_path: Path of the file to convert.
+      output_dir: Directory for the generated PDF. It is not created here.
+    """
+    self.input_path = Path(input_path)
+    self.output_dir = Path(output_dir)
+    self.output_path = self.output_dir / f"{self.generate_uuid_filename()}.pdf"
+
+  def convert(self) -> Self:
+    """Convert the input file to a PDF written at ``self.output_path``.
+
+    Returns:
+      The converter itself, so that calls can be chained.
+
+    Raises:
+      NotImplementedError: Always in this base class; subclasses override it.
+    """
+    # Fail loudly instead of silently returning None from a method that is
+    # annotated as returning Self.
+    raise NotImplementedError(
+      f"{type(self).__name__} does not implement convert()."
+    )
+
+  @staticmethod
+  def generate_uuid_filename() -> str:
+    """Return a random UUID4 rendered as a string (no file extension)."""
+    return str(uuid.uuid4())
+
+  def get_deterministic_date(self) -> str:
+    """Derive a PDF date string from the SHA-256 of the input file.
+
+    The same file content always yields the same string, so the metadata
+    written by ``clean_pdf`` does not depend on when the conversion ran.
+
+    Returns:
+      A string of the form ``D:YYYYMMDDHHmmss`` with the year in 2000-2099
+      and the day in 1-28 (so it is valid for every month).
+    """
+    # Hash the file in chunks rather than loading it whole into memory.
+    with open(self.input_path, "rb") as f:
+      content_hash = hashlib.file_digest(f, "sha256").hexdigest()
+
+    # The first 14 hex digits of the hash seed every date field.
+    hash_int = int(content_hash[:14], 16)
+
+    # Every field is a residue of the same integer, so the fields are
+    # correlated (minute and second are always equal). The formula is kept
+    # unchanged so a given input keeps mapping to the same date.
+    year = 2000 + (hash_int % 100)
+    month = 1 + (hash_int % 12)
+    day = 1 + (hash_int % 28)
+    hour = hash_int % 24
+    minute = hash_int % 60
+    second = hash_int % 60
+
+    return f"D:{year:04d}{month:02d}{day:02d}{hour:02d}{minute:02d}{second:02d}"
+
+  def get_file_creation_date(self) -> str:
+    """Return the input file's ctime formatted as ``D:YYYYMMDDHHmmss``.
+
+    The timestamp comes from ``os.path.getctime`` and is rendered in the
+    local timezone without an offset. NOTE(review): on Unix ``getctime`` is
+    the last metadata change, not the creation time; use ``getmtime`` if the
+    last modification time is what is wanted.
+    """
+    ts = os.path.getctime(self.input_path)
+    dt = datetime.datetime.fromtimestamp(ts)
+    return dt.strftime("D:%Y%m%d%H%M%S")
+
+  def clean_pdf(self) -> Self:
+    """Rewrite the generated PDF in place with fixed metadata.
+
+    Removes the XMP metadata stream if present, deletes every document-info
+    entry, then writes Producer, Creator, CreationDate, ModDate and Title
+    values that depend only on the input file.
+
+    Returns:
+      The converter itself, so that calls can be chained.
+    """
+    # Hash the input once; the same value is used for both date fields.
+    deterministic_date = self.get_deterministic_date()
+
+    with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
+      # Remove XMP metadata if it exists
+      if hasattr(pdf.Root, 'Metadata'):
+        del pdf.Root.Metadata
+
+      # Clear all document info, whether or not XMP metadata was present;
+      # otherwise keys written by the producing library would survive.
+      for key in list(pdf.docinfo.keys()):
+        del pdf.docinfo[key]
+
+      # Set deterministic metadata
+      pdf.docinfo["/Producer"] = "MyConverter"
+      pdf.docinfo["/Creator"] = "MyConverter"
+      pdf.docinfo["/CreationDate"] = deterministic_date
+      pdf.docinfo["/ModDate"] = deterministic_date
+      pdf.docinfo["/Title"] = self.input_path.name
+
+      # deterministic_id=True asks pikepdf to derive the document /ID from
+      # the content instead of generating a random one.
+      pdf.save(
+        self.output_path,
+        fix_metadata_version=True,
+        compress_streams=True,
+        stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
+        object_stream_mode=pikepdf.ObjectStreamMode.disable,
+        deterministic_id=True  # Use this if pikepdf >= 8.0.0, otherwise use static_id=True
+      )
+
+    return self
+
+
+class TextToPdfConverter(BaseConverter):
+  """Render a UTF-8 text file as an A4 PDF, one drawn string per input line."""
+
+  def convert(self) -> Self:
+    """Draw each line of the input file onto A4 pages and save the PDF.
+
+    Lines are stripped of surrounding whitespace, drawn 15 points apart
+    starting 50 points below the top edge, and a new page is started once
+    the cursor drops below 50 points from the bottom.
+
+    Returns:
+      The converter itself, so that calls can be chained.
+    """
+    left_margin = 50
+    vertical_margin = 50
+    line_step = 15
+    _page_width, page_height = A4
+    top_of_page = page_height - vertical_margin
+
+    pdf_canvas = canvas.Canvas(str(self.output_path), pagesize=A4)
+
+    # Pre-fill the document info on the canvas' underlying document.
+    doc_info = pdf_canvas._doc.info
+    doc_info.producer = "MyConverter"
+    doc_info.creationDate = self.get_file_creation_date()
+    doc_info.title = os.path.basename(self.input_path)
+
+    cursor_y = top_of_page
+    with open(self.input_path, "r", encoding="utf-8") as text_file:
+      for raw_line in text_file:
+        pdf_canvas.drawString(left_margin, cursor_y, raw_line.strip())
+        cursor_y -= line_step
+        if cursor_y < vertical_margin:
+          # Out of vertical space: finish this page and restart at the top.
+          pdf_canvas.showPage()
+          cursor_y = top_of_page
+
+    pdf_canvas.save()
+    return self
+
+
+class ImageToPdfConverter(BaseConverter):
+  """Converter for image files to PDF."""
+
+  def convert(self) -> Self:
+    """Convert the input image to RGB and save it at ``self.output_path``.
+
+    Returns:
+      The converter itself, so that calls can be chained.
+    """
+    # Open the source inside a context manager so its file handle is
+    # released even if the conversion fails; the RGB image returned by
+    # convert() is a separate in-memory image and stays usable afterwards.
+    with Image.open(self.input_path) as image:
+      rgb_image = image.convert("RGB")
+    rgb_image.save(self.output_path)
+    return self
+
+
+class WordToPdfConverter(BaseConverter):
+  """Turn a Word document (.docx) into a PDF by delegating to pypandoc."""
+
+  def convert(self) -> Self:
+    """Run ``pypandoc.convert_file`` on the input with ``pdf`` as the target.
+
+    Returns:
+      The converter itself, so that calls can be chained.
+    """
+    source_file = str(self.input_path)
+    target_file = str(self.output_path)
+    pypandoc.convert_file(source_file, "pdf", outputfile=target_file)
+    return self
+
+
+# Placeholders for future extensions
+class HtmlToPdfConverter(BaseConverter):
+  """Stub for a future HTML to PDF converter; ``convert`` always raises."""
+
+  def convert(self) -> Self:
+    """Raise ``NotImplementedError``: HTML conversion is not available yet."""
+    message = "HTML to PDF conversion not implemented."
+    raise NotImplementedError(message)
+
+
+class ExcelToPdfConverter(BaseConverter):
+  """Stub for a future Excel to PDF converter; ``convert`` always raises."""
+
+  def convert(self) -> Self:
+    """Raise ``NotImplementedError``: Excel conversion is not available yet."""
+    message = "Excel to PDF conversion not implemented."
+    raise NotImplementedError(message)
+
+
+class MarkdownToPdfConverter(BaseConverter):
+  """Stub for a future Markdown to PDF converter; ``convert`` always raises."""
+
+  def convert(self) -> Self:
+    """Raise ``NotImplementedError``: Markdown conversion is not available yet."""
+    message = "Markdown to PDF conversion not implemented."
+    raise NotImplementedError(message)
+
+
+def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
+  """Convert any supported file to PDF.
+
+  The file type reported by ``detect_file_type`` selects the converter; the
+  converter writes the PDF and then rewrites its metadata via ``clean_pdf``.
+
+  Args:
+    filepath (str): Path to the input file.
+    output_dir (str): Directory to save the output PDF.
+
+  Returns:
+    str: Path to the generated PDF.
+
+  Raises:
+    ValueError: If the detected file type is not "text", "image" or "word".
+  """
+  # Map each supported file type to the converter class that handles it.
+  converters: dict[str, type[BaseConverter]] = {
+    "text": TextToPdfConverter,
+    "image": ImageToPdfConverter,
+    "word": WordToPdfConverter,
+  }
+
+  file_type = detect_file_type(filepath)
+  converter_cls = converters.get(file_type)
+  if converter_cls is None:
+    raise ValueError(f"Unsupported file type: {file_type}")
+
+  converter = converter_cls(filepath, output_dir=output_dir)
+  converter.convert()
+  converter.clean_pdf()
+  return str(converter.output_path)