main
1import json
2import sys
3
4from pypdf import PdfReader
5
6
7# Extracts data for the fillable form fields in a PDF and outputs JSON that
8# Claude uses to fill the fields. See forms.md.
9
10
# This matches the field-name format used by pypdf's `PdfReader.get_fields` and `PdfWriter.update_page_form_field_values` methods.
def get_full_annotation_field_id(annotation):
    """Return the dotted, fully qualified field name for an annotation.

    Starting at ``annotation``, follows the ``/Parent`` links upward and
    collects every non-empty ``/T`` (partial field name) found on the way.
    The names are joined root-first with ``.``.

    Returns ``None`` when no node in the chain carries a ``/T`` entry (this
    includes a falsy ``annotation`` such as ``None`` or an empty dict).
    """
    names_leaf_to_root = []
    node = annotation
    while node:
        partial_name = node.get("/T")
        if partial_name:
            names_leaf_to_root.append(partial_name)
        node = node.get("/Parent")

    if not names_leaf_to_root:
        return None
    # Names were gathered leaf-first; the full id reads root-first.
    return ".".join(names_leaf_to_root[::-1])
20
21
def make_field_dict(field, field_id):
    """Build the JSON-serializable description of a single form field.

    Args:
        field: Mapping-like field object; its ``/FT`` entry selects the type
            and its ``/_States_`` entry (if any) supplies checkbox states or
            choice options.
        field_id: Fully qualified field name, stored under ``"field_id"``.

    Returns:
        A dict with ``"field_id"`` and ``"type"`` keys. ``"type"`` is
        ``"text"``, ``"checkbox"``, ``"choice"``, or ``"unknown (<ft>)"``.
        Checkboxes with exactly two states also get ``"checked_value"`` and
        ``"unchecked_value"``; choice fields get ``"choice_options"``, a list
        of ``{"value": ..., "text": ...}`` dicts.
    """
    field_dict = {"field_id": field_id}
    ft = field.get("/FT")
    if ft == "/Tx":
        field_dict["type"] = "text"
    elif ft == "/Btn":
        field_dict["type"] = "checkbox"  # radio groups handled separately
        states = field.get("/_States_", [])
        if len(states) == 2:
            # "/Off" seems to always be the unchecked value, as suggested by
            # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
            # It can be either first or second in the "/_States_" list.
            if "/Off" in states:
                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                field_dict["unchecked_value"] = "/Off"
            else:
                # Fixed: the message used to contain a stray "$" before the
                # field id (a template-literal leftover).
                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
                field_dict["checked_value"] = states[0]
                field_dict["unchecked_value"] = states[1]
    elif ft == "/Ch":
        field_dict["type"] = "choice"
        states = field.get("/_States_", [])
        choice_options = []
        for state in states:
            if isinstance(state, str):
                # An option may be a bare string rather than an
                # [export value, display text] pair; indexing it would yield
                # single characters. Use it for both value and text.
                value = text = state
            else:
                value, text = state[0], state[1]
            choice_options.append({"value": value, "text": text})
        field_dict["choice_options"] = choice_options
    else:
        field_dict["type"] = f"unknown ({ft})"
    return field_dict
51
52
53# Returns a list of fillable PDF fields:
54# [
55# {
56# "field_id": "name",
57# "page": 1,
58# "type": ("text", "checkbox", "radio_group", or "choice")
59# // Per-type additional fields described in forms.md
60# },
61# ]
def get_field_info(reader: PdfReader):
    """Collect type, page, and position info for the fillable fields of a PDF.

    Args:
        reader: Reader for the PDF; its ``get_fields()`` result and ``pages``
            are inspected.

    Returns:
        A list of dicts sorted by page, then top-to-bottom, then
        left-to-right. Non-radio fields are the dicts built by
        ``make_field_dict`` plus ``"page"`` (1-based) and ``"rect"``. Radio
        groups have ``"field_id"``, ``"type": "radio_group"``, ``"page"``
        (page of the first option found), and ``"radio_options"``, a list of
        ``{"value": ..., "rect": ...}`` dicts. Fields for which no page
        annotation is found are reported on stdout and omitted. A PDF with
        no form fields yields an empty list.
    """
    # get_fields() returns None (not an empty dict) when the document has no
    # form data; treat that as "no fields" instead of crashing on .items().
    fields = reader.get_fields() or {}

    field_info_by_id = {}
    possible_radio_names = set()

    for field_id, field in fields.items():
        # Skip if this is a container field with children, except that it might be
        # a parent group for radio button options.
        if field.get("/Kids"):
            if field.get("/FT") == "/Btn":
                possible_radio_names.add(field_id)
            continue
        field_info_by_id[field_id] = make_field_dict(field, field_id)

    # Bounding rects are stored in annotations in page objects.

    # Radio button options have a separate annotation for each choice;
    # all choices have the same field name.
    # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
    radio_fields_by_id = {}

    for page_index, page in enumerate(reader.pages):
        annotations = page.get("/Annots", [])
        for ann in annotations:
            field_id = get_full_annotation_field_id(ann)
            if field_id in field_info_by_id:
                field_info_by_id[field_id]["page"] = page_index + 1
                field_info_by_id[field_id]["rect"] = ann.get("/Rect")
            elif field_id in possible_radio_names:
                try:
                    # ann['/AP']['/N'] should have two items. One of them is '/Off',
                    # the other is the active value.
                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                except KeyError:
                    # No appearance entry to read the option value from.
                    continue
                if len(on_values) == 1:
                    rect = ann.get("/Rect")
                    if field_id not in radio_fields_by_id:
                        radio_fields_by_id[field_id] = {
                            "field_id": field_id,
                            "type": "radio_group",
                            "page": page_index + 1,
                            "radio_options": [],
                        }
                    # Note: at least on macOS 15.7, Preview.app doesn't show selected
                    # radio buttons correctly. (It does if you remove the leading slash
                    # from the value, but that causes them not to appear correctly in
                    # Chrome/Firefox/Acrobat/etc).
                    radio_fields_by_id[field_id]["radio_options"].append({
                        "value": on_values[0],
                        "rect": rect,
                    })

    # Some PDFs have form field definitions without corresponding annotations,
    # so we can't tell where they are. Ignore these fields for now.
    fields_with_location = []
    for field_info in field_info_by_id.values():
        if "page" in field_info:
            fields_with_location.append(field_info)
        else:
            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")

    # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
    def sort_key(f):
        if "radio_options" in f:
            # A radio group is positioned by its first recorded option.
            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
        else:
            rect = f.get("rect") or [0, 0, 0, 0]
        adjusted_position = [-rect[1], rect[0]]
        return [f.get("page"), adjusted_position]

    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
    sorted_fields.sort(key=sort_key)

    return sorted_fields
138
139
def write_field_info(pdf_path: str, json_output_path: str):
    """Extract form-field info from a PDF and save it as indented JSON.

    Opens ``pdf_path`` with ``PdfReader``, passes the reader to
    ``get_field_info``, writes the resulting list to ``json_output_path``
    with two-space indentation, and prints a one-line summary.
    """
    extracted = get_field_info(PdfReader(pdf_path))
    with open(json_output_path, "w") as out_file:
        json.dump(extracted, out_file, indent=2)
    field_count = len(extracted)
    print(f"Wrote {field_count} fields to {json_output_path}")
146
147
if __name__ == "__main__":
    # Command-line entry point: requires exactly two arguments, the input PDF
    # path and the output JSON path; otherwise prints usage and exits with 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
    input_pdf, output_json = cli_args
    write_field_info(input_pdf, output_json)