# MyDocManager/src/file-processor/app/utils/pdf_converter.py

import datetime
import hashlib
import os
import uuid
from abc import ABC
from pathlib import Path
from typing import Self

import pikepdf
import pypandoc
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

from tasks.common.converter_utils import detect_file_type


class BaseConverter(ABC):
  """Abstract base class for file converters to PDF."""

  def __init__(self, input_path: str, output_dir: str = ".") -> None:
    self.input_path = Path(input_path)
    self.output_dir = Path(output_dir)
    self.output_path = self.output_dir / f"{self.generate_uuid_filename()}.pdf"

  def convert(self) -> Self:
    """Convert input file to PDF and return the output path."""
    pass

  @staticmethod
  def generate_uuid_filename() -> str:
    """Generate a unique filename using UUID4."""
    return str(uuid.uuid4())

  def get_deterministic_date(self) -> str:
    """
    Generate a deterministic date based on file content.
    This ensures the same file always produces the same PDF.
    """
    # Option 1: Use a fixed date
    # return "D:20000101000000"

    # Option 2: Generate date from content hash (recommended)
    with open(self.input_path, 'rb') as f:
      content = f.read()
      content_hash = hashlib.sha256(content).hexdigest()

    # Use first 14 characters of hash to create a valid date
    # Format: D:YYYYMMDDHHmmss
    hash_int = int(content_hash[:14], 16)

    # Create a date between 2000-2099 to keep it reasonable
    year = 2000 + (hash_int % 100)
    month = 1 + (hash_int % 12)
    day = 1 + (hash_int % 28)  # Stay safe with 28 days
    hour = hash_int % 24
    minute = hash_int % 60
    second = hash_int % 60

    return f"D:{year:04d}{month:02d}{day:02d}{hour:02d}{minute:02d}{second:02d}"

  def get_file_creation_date(self):
    # Get file creation time (or modification time)
    ts = os.path.getctime(self.input_path)  # getmtime(self.input_path) for last modification
    dt = datetime.datetime.fromtimestamp(ts)

    # PDF expects format D:YYYYMMDDHHmmss
    creation_date = dt.strftime("D:%Y%m%d%H%M%S")
    return creation_date

  def clean_pdf(self) -> Self:
    """Remove all non-deterministic metadata from PDF."""
    with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
      # Remove XMP metadata if it exists
      if hasattr(pdf.Root, 'Metadata'):
        del pdf.Root.Metadata

        # Clear all document info by deleting each key
        for key in list(pdf.docinfo.keys()):
          del pdf.docinfo[key]

      # Set deterministic metadata
      pdf.docinfo["/Producer"] = "MyConverter"
      pdf.docinfo["/Creator"] = "MyConverter"
      pdf.docinfo["/CreationDate"] = self.get_deterministic_date()
      pdf.docinfo["/ModDate"] = self.get_deterministic_date()
      pdf.docinfo["/Title"] = self.input_path.name

      # Save with deterministic IDs
      # compress=True ensures consistent compression
      # deterministic_id=True (if available) or static_id=True
      pdf.save(
        self.output_path,
        fix_metadata_version=True,
        compress_streams=True,
        stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
        object_stream_mode=pikepdf.ObjectStreamMode.disable,
        deterministic_id=True  # Use this if pikepdf >= 8.0.0, otherwise use static_id=True
      )

    return self


class TextToPdfConverter(BaseConverter):
  """Render a UTF-8 text file, one line per row, onto A4 PDF pages."""

  def convert(self) -> Self:
    """Draw every line of the input file into a PDF at ``self.output_path``.

    Lines are stripped of surrounding whitespace and drawn 15 points apart,
    starting 50 points below the top of the page; a new page is started once
    the cursor drops below 50 points from the bottom.

    Returns:
        The converter itself.
    """
    pdf_canvas = canvas.Canvas(str(self.output_path), pagesize=A4)

    # Pre-fill the document info on reportlab's internal document object.
    doc_info = pdf_canvas._doc.info
    doc_info.producer = "MyConverter"
    doc_info.creationDate = self.get_file_creation_date()
    doc_info.title = os.path.basename(self.input_path)

    page_height = A4[1]
    top_of_page = page_height - 50
    cursor_y = top_of_page

    with open(self.input_path, "r", encoding="utf-8") as source:
      for raw_line in source:
        pdf_canvas.drawString(50, cursor_y, raw_line.strip())
        cursor_y -= 15
        if cursor_y < 50:
          # Out of vertical room: finish this page and restart at the top.
          pdf_canvas.showPage()
          cursor_y = top_of_page

    pdf_canvas.save()
    return self


class ImageToPdfConverter(BaseConverter):
  """Converter for image files to PDF."""

  def convert(self) -> Self:
    """Write the input image as a PDF at ``self.output_path``.

    The image is converted to RGB mode before it is saved; the output format
    is chosen by Pillow from the ``.pdf`` suffix of the output path.

    Returns:
        The converter itself, so calls can be chained.
    """
    # Image.open() keeps the underlying file open; the context manager
    # releases the handle once the PDF has been written.
    with Image.open(self.input_path) as image:
      rgb_image = image.convert("RGB")
      rgb_image.save(self.output_path)
    return self


class WordToPdfConverter(BaseConverter):
  """Turn a Word document (.docx) into a PDF by delegating to pypandoc."""

  def convert(self) -> Self:
    """Run pypandoc on the input file, writing to ``self.output_path``.

    Returns:
        The converter itself.
    """
    source = str(self.input_path)
    target = str(self.output_path)
    pypandoc.convert_file(source, "pdf", outputfile=target)
    return self


# Placeholders for future extensions
class HtmlToPdfConverter(BaseConverter):
  """Stub reserving the HTML-to-PDF slot; conversion is not available yet."""

  def convert(self) -> Self:
    """Fail on every call, because HTML conversion has not been written.

    Raises:
        NotImplementedError: Always.
    """
    message = "HTML to PDF conversion not implemented."
    raise NotImplementedError(message)


class ExcelToPdfConverter(BaseConverter):
  """Stub reserving the Excel-to-PDF slot; conversion is not available yet."""

  def convert(self) -> Self:
    """Fail on every call, because Excel conversion has not been written.

    Raises:
        NotImplementedError: Always.
    """
    message = "Excel to PDF conversion not implemented."
    raise NotImplementedError(message)


class MarkdownToPdfConverter(BaseConverter):
  """Stub reserving the Markdown-to-PDF slot; conversion is not available yet."""

  def convert(self) -> Self:
    """Fail on every call, because Markdown conversion has not been written.

    Raises:
        NotImplementedError: Always.
    """
    message = "Markdown to PDF conversion not implemented."
    raise NotImplementedError(message)


def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
  """
      Convert any supported file to PDF.

      Args:
          filepath (str): Path to the input file.
          output_dir (str): Directory to save the output PDF.

      Returns:
          str: Path to the generated PDF.

      Raises:
          UnsupportedFileTypeError: If the input file type is not supported.
      """
  file_type = detect_file_type(filepath)

  if file_type == "text":
    converter = TextToPdfConverter(filepath, output_dir=output_dir)
  elif file_type == "image":
    converter = ImageToPdfConverter(filepath, output_dir=output_dir)
  elif file_type == "word":
    converter = WordToPdfConverter(filepath, output_dir=output_dir)
  else:
    raise ValueError(f"Unsupported file type: {file_type}")

  converter.convert()
  converter.clean_pdf()
  return str(converter.output_path)