# skills/skills/pdf/scripts/fill_pdf_form_with_annotations.py

  1import json
  2import sys
  3
  4from pypdf import PdfReader, PdfWriter
  5from pypdf.annotations import FreeText
  6
  7
  8# Fills a PDF by adding text annotations defined in `fields.json`. See forms.md.
  9
 10
 11def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height):
 12    """Transform bounding box from image coordinates to PDF coordinates"""
 13    # Image coordinates: origin at top-left, y increases downward
 14    # PDF coordinates: origin at bottom-left, y increases upward
 15    x_scale = pdf_width / image_width
 16    y_scale = pdf_height / image_height
 17    
 18    left = bbox[0] * x_scale
 19    right = bbox[2] * x_scale
 20    
 21    # Flip Y coordinates for PDF
 22    top = pdf_height - (bbox[1] * y_scale)
 23    bottom = pdf_height - (bbox[3] * y_scale)
 24    
 25    return left, bottom, right, top
 26
 27
 28def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
 29    """Fill the PDF form with data from fields.json"""
 30    
 31    # `fields.json` format described in forms.md.
 32    with open(fields_json_path, "r") as f:
 33        fields_data = json.load(f)
 34    
 35    # Open the PDF
 36    reader = PdfReader(input_pdf_path)
 37    writer = PdfWriter()
 38    
 39    # Copy all pages to writer
 40    writer.append(reader)
 41    
 42    # Get PDF dimensions for each page
 43    pdf_dimensions = {}
 44    for i, page in enumerate(reader.pages):
 45        mediabox = page.mediabox
 46        pdf_dimensions[i + 1] = [mediabox.width, mediabox.height]
 47    
 48    # Process each form field
 49    annotations = []
 50    for field in fields_data["form_fields"]:
 51        page_num = field["page_number"]
 52        
 53        # Get page dimensions and transform coordinates.
 54        page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num)
 55        image_width = page_info["image_width"]
 56        image_height = page_info["image_height"]
 57        pdf_width, pdf_height = pdf_dimensions[page_num]
 58        
 59        transformed_entry_box = transform_coordinates(
 60            field["entry_bounding_box"],
 61            image_width, image_height,
 62            pdf_width, pdf_height
 63        )
 64        
 65        # Skip empty fields
 66        if "entry_text" not in field or "text" not in field["entry_text"]:
 67            continue
 68        entry_text = field["entry_text"]
 69        text = entry_text["text"]
 70        if not text:
 71            continue
 72        
 73        font_name = entry_text.get("font", "Arial")
 74        font_size = str(entry_text.get("font_size", 14)) + "pt"
 75        font_color = entry_text.get("font_color", "000000")
 76
 77        # Font size/color seems to not work reliably across viewers:
 78        # https://github.com/py-pdf/pypdf/issues/2084
 79        annotation = FreeText(
 80            text=text,
 81            rect=transformed_entry_box,
 82            font=font_name,
 83            font_size=font_size,
 84            font_color=font_color,
 85            border_color=None,
 86            background_color=None,
 87        )
 88        annotations.append(annotation)
 89        # page_number is 0-based for pypdf
 90        writer.add_annotation(page_number=page_num - 1, annotation=annotation)
 91        
 92    # Save the filled PDF
 93    with open(output_pdf_path, "wb") as output:
 94        writer.write(output)
 95    
 96    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
 97    print(f"Added {len(annotations)} text annotations")
 98
 99
if __name__ == "__main__":
    # The script name plus exactly three positional arguments are required.
    if len(sys.argv) != 4:
        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
        sys.exit(1)
    input_pdf, fields_json, output_pdf = sys.argv[1:]

    fill_pdf_form(input_pdf, fields_json, output_pdf)