Working on pdf creation
This commit is contained in:
@@ -1,12 +1,15 @@
|
||||
import datetime
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
import pikepdf
|
||||
import pypandoc
|
||||
from PIL import Image
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
from tasks.common.converter_utils import generate_uuid_filename
|
||||
from tasks.common.converter_utils import generate_uuid_filename, detect_file_type
|
||||
|
||||
|
||||
class BaseConverter(ABC):
|
||||
@@ -21,6 +24,26 @@ class BaseConverter(ABC):
|
||||
def convert(self) -> str:
    """Convert the input file to PDF and return the path of the output.

    This base version has no conversion logic of its own; the converter
    subclasses in this module supply their own ``convert``.
    """
|
||||
|
||||
def get_file_creation_date(self):
    """Return the input file's ctime as a PDF date string.

    The timestamp comes from ``os.path.getctime(self.input_path)``. Per the
    Python documentation this is the creation time on Windows but the time
    of the last metadata change on Unix, so it is not a true creation date
    everywhere; ``os.path.getmtime`` would give the last modification time
    instead. The value is rendered as naive local time.

    Returns:
        str: The timestamp in the ``D:YYYYMMDDHHmmss`` layout used by PDF
        date fields (no timezone suffix).
    """
    stamp = datetime.datetime.fromtimestamp(os.path.getctime(self.input_path))
    # PDF date strings carry a literal "D:" prefix before the digits.
    return "D:" + stamp.strftime("%Y%m%d%H%M%S")
|
||||
|
||||
def clean_pdf(self):
    """Replace the generated PDF's metadata with a fixed set of fields.

    Re-opens ``self.output_path`` with pikepdf, nulls the ``/Metadata``
    entry of the document catalog, empties the document-info dictionary and
    refills it with producer, creation date and title, then saves the
    document over the same file with a static ID.
    """
    # The document is written back onto the file it was read from; pikepdf
    # refuses that unless the input is opened with allow_overwriting_input.
    with pikepdf.open(self.output_path, allow_overwriting_input=True) as pdf:
        pdf.Root.Metadata = None

        docinfo = pdf.docinfo
        # Remove entries key by key rather than relying on a dict-style
        # clear() helper being available on the pikepdf dictionary object.
        for key in list(docinfo.keys()):
            del docinfo[key]

        docinfo["/Producer"] = "MyConverter"
        docinfo["/CreationDate"] = self.get_file_creation_date()
        docinfo["/Title"] = os.path.basename(self.input_path)

        pdf.save(self.output_path, fix_metadata_version=True, static_id=True)
|
||||
|
||||
|
||||
class TextToPdfConverter(BaseConverter):
|
||||
@@ -28,6 +51,13 @@ class TextToPdfConverter(BaseConverter):
|
||||
|
||||
def convert(self) -> str:
|
||||
c = canvas.Canvas(str(self.output_path), pagesize=A4)
|
||||
|
||||
# Fix metadata with deterministic values
|
||||
info = c._doc.info
|
||||
info.producer = "MyConverter"
|
||||
info.creationDate = self.get_file_creation_date()
|
||||
info.title = os.path.basename(self.input_path)
|
||||
|
||||
width, height = A4
|
||||
with open(self.input_path, "r", encoding="utf-8") as f:
|
||||
y = height - 50
|
||||
@@ -37,6 +67,7 @@ class TextToPdfConverter(BaseConverter):
|
||||
if y < 50:
|
||||
c.showPage()
|
||||
y = height - 50
|
||||
|
||||
c.save()
|
||||
return str(self.output_path)
|
||||
|
||||
@@ -81,3 +112,31 @@ class MarkdownToPdfConverter(BaseConverter):
|
||||
|
||||
def convert(self) -> str:
    """Placeholder for Markdown conversion; always raises.

    Raises:
        NotImplementedError: On every call, since no Markdown conversion
            is implemented here.
    """
    message = "Markdown to PDF conversion not implemented."
    raise NotImplementedError(message)
|
||||
|
||||
|
||||
def convert_to_pdf(filepath: str, output_dir: str = ".") -> str:
    """
    Convert any supported file to PDF.

    The file type label is obtained from ``detect_file_type(filepath)`` and
    used to pick the converter class: ``"text"``, ``"image"`` and ``"word"``
    are the labels handled here.

    Args:
        filepath (str): Path to the input file.
        output_dir (str): Directory to save the output PDF.

    Returns:
        str: Path to the generated PDF, as returned by the selected
        converter's ``convert()``.

    Raises:
        ValueError: If the detected file type is not one of the supported
            labels.
    """
    file_type = detect_file_type(filepath)

    if file_type == "text":
        converter_cls = TextToPdfConverter
    elif file_type == "image":
        converter_cls = ImageToPdfConverter
    elif file_type == "word":
        converter_cls = WordToPdfConverter
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    return converter_cls(filepath, output_dir=output_dir).convert()
|
||||
|
||||
Reference in New Issue
Block a user