skills/skills/pdf/scripts/extract_form_field

  1import json
  2import sys
  3
  4from pypdf import PdfReader
  5
  6
  7# Extracts data for the fillable form fields in a PDF and outputs JSON that
  8# Claude uses to fill the fields. See forms.md.
  9
 10
# This matches the field-name format used by PdfReader's `get_fields` and
# PdfWriter's `update_page_form_field_values` methods.
 12def get_full_annotation_field_id(annotation):
 13    components = []
 14    while annotation:
 15        field_name = annotation.get('/T')
 16        if field_name:
 17            components.append(field_name)
 18        annotation = annotation.get('/Parent')
 19    return ".".join(reversed(components)) if components else None
 20
 21
 22def make_field_dict(field, field_id):
 23    field_dict = {"field_id": field_id}
 24    ft = field.get('/FT')
 25    if ft == "/Tx":
 26        field_dict["type"] = "text"
 27    elif ft == "/Btn":
 28        field_dict["type"] = "checkbox"  # radio groups handled separately
 29        states = field.get("/_States_", [])
 30        if len(states) == 2:
 31            # "/Off" seems to always be the unchecked value, as suggested by
 32            # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
 33            # It can be either first or second in the "/_States_" list.
 34            if "/Off" in states:
 35                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
 36                field_dict["unchecked_value"] = "/Off"
 37            else:
 38                print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
 39                field_dict["checked_value"] = states[0]
 40                field_dict["unchecked_value"] = states[1]
 41    elif ft == "/Ch":
 42        field_dict["type"] = "choice"
 43        states = field.get("/_States_", [])
 44        field_dict["choice_options"] = [{
 45            "value": state[0],
 46            "text": state[1],
 47        } for state in states]
 48    else:
 49        field_dict["type"] = f"unknown ({ft})"
 50    return field_dict
 51
 52
 53# Returns a list of fillable PDF fields:
 54# [
 55#   {
 56#     "field_id": "name",
 57#     "page": 1,
 58#     "type": ("text", "checkbox", "radio_group", or "choice")
 59#     // Per-type additional fields described in forms.md
 60#   },
 61# ]
 62def get_field_info(reader: PdfReader):
 63    fields = reader.get_fields()
 64
 65    field_info_by_id = {}
 66    possible_radio_names = set()
 67
 68    for field_id, field in fields.items():
 69        # Skip if this is a container field with children, except that it might be
 70        # a parent group for radio button options.
 71        if field.get("/Kids"):
 72            if field.get("/FT") == "/Btn":
 73                possible_radio_names.add(field_id)
 74            continue
 75        field_info_by_id[field_id] = make_field_dict(field, field_id)
 76
 77    # Bounding rects are stored in annotations in page objects.
 78
 79    # Radio button options have a separate annotation for each choice;
 80    # all choices have the same field name.
 81    # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
 82    radio_fields_by_id = {}
 83
 84    for page_index, page in enumerate(reader.pages):
 85        annotations = page.get('/Annots', [])
 86        for ann in annotations:
 87            field_id = get_full_annotation_field_id(ann)
 88            if field_id in field_info_by_id:
 89                field_info_by_id[field_id]["page"] = page_index + 1
 90                field_info_by_id[field_id]["rect"] = ann.get('/Rect')
 91            elif field_id in possible_radio_names:
 92                try:
 93                    # ann['/AP']['/N'] should have two items. One of them is '/Off',
 94                    # the other is the active value.
 95                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
 96                except KeyError:
 97                    continue
 98                if len(on_values) == 1:
 99                    rect = ann.get("/Rect")
100                    if field_id not in radio_fields_by_id:
101                        radio_fields_by_id[field_id] = {
102                            "field_id": field_id,
103                            "type": "radio_group",
104                            "page": page_index + 1,
105                            "radio_options": [],
106                        }
107                    # Note: at least on macOS 15.7, Preview.app doesn't show selected
108                    # radio buttons correctly. (It does if you remove the leading slash
109                    # from the value, but that causes them not to appear correctly in
110                    # Chrome/Firefox/Acrobat/etc).
111                    radio_fields_by_id[field_id]["radio_options"].append({
112                        "value": on_values[0],
113                        "rect": rect,
114                    })
115
116    # Some PDFs have form field definitions without corresponding annotations,
117    # so we can't tell where they are. Ignore these fields for now.
118    fields_with_location = []
119    for field_info in field_info_by_id.values():
120        if "page" in field_info:
121            fields_with_location.append(field_info)
122        else:
123            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
124
125    # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
126    def sort_key(f):
127        if "radio_options" in f:
128            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
129        else:
130            rect = f.get("rect") or [0, 0, 0, 0]
131        adjusted_position = [-rect[1], rect[0]]
132        return [f.get("page"), adjusted_position]
133    
134    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
135    sorted_fields.sort(key=sort_key)
136
137    return sorted_fields
138
139
def write_field_info(pdf_path: str, json_output_path: str):
    """Extract form-field info from the PDF at `pdf_path` and save it as JSON.

    The field list produced by `get_field_info` is written to
    `json_output_path` with 2-space indentation, and a one-line summary is
    printed afterwards.
    """
    extracted_fields = get_field_info(PdfReader(pdf_path))

    with open(json_output_path, "w") as out_file:
        json.dump(extracted_fields, out_file, indent=2)

    field_count = len(extracted_fields)
    print(f"Wrote {field_count} fields to {json_output_path}")
146
147
if __name__ == "__main__":
    # Expect exactly two command-line arguments: the input PDF and the output JSON path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
    input_pdf, output_json = cli_args
    write_field_info(input_pdf, output_json)