main
1from dataclasses import dataclass
2import json
3import sys
4
5
# Script to check that the `fields.json` file that Claude creates when analyzing PDFs
# does not have overlapping bounding boxes, and that every entry box with text is at
# least as tall as that text's font size. See forms.md.
8
9
@dataclass
class RectAndField:
    """One bounding box paired with the form field it was read from."""

    # Box coordinates; the checker reads indices 0/2 as the horizontal extent
    # and indices 1/3 as the vertical extent.
    rect: list[float]
    # Which box of the field this is: "label" or "entry".
    rect_type: str
    # The field's dict from `fields.json`, kept by reference so two boxes of
    # the same field can be recognised with an identity check.
    field: dict
15
16
def get_bounding_box_messages(fields_json_stream) -> list[str]:
    """Check the bounding boxes in a `fields.json` stream and report problems.

    The stream must contain a JSON object with a "form_fields" list. Each
    field supplies "label_bounding_box", "entry_bounding_box", "page_number"
    and "description"; an optional "entry_text" dict may carry a "font_size"
    (14 is assumed when it is absent).

    Two checks are made:
      * no two boxes on the same page may overlap (boxes that only share an
        edge are not treated as overlapping);
      * an entry box that has "entry_text" must be at least as tall as the
        font size.

    Returns the list of messages that are printed to stdout for Claude to
    read: a "Read N fields" line, then one "FAILURE: ..." line per problem, or
    a single "SUCCESS: ..." line when nothing is wrong. Checking stops early,
    with an "Aborting ..." line, once the list holds 20 messages.
    """
    form_fields = json.load(fields_json_stream)["form_fields"]
    report = [f"Read {len(form_fields)} fields"]
    abort_notice = "Aborting further checks; fix bounding boxes and try again"

    def rects_overlap(a, b):
        # Boxes overlap only if they are separated on neither axis.
        separated_x = a[0] >= b[2] or a[2] <= b[0]
        separated_y = a[1] >= b[3] or a[3] <= b[1]
        return not separated_x and not separated_y

    # Each field contributes its label box first, then its entry box.
    boxes = [
        RectAndField(field[f"{kind}_bounding_box"], kind, field)
        for field in form_fields
        for kind in ("label", "entry")
    ]

    found_problem = False
    for idx, current in enumerate(boxes):
        # This is O(N^2); we can optimize if it becomes a problem.
        for other in boxes[idx + 1:]:
            if current.field["page_number"] != other.field["page_number"]:
                continue
            if not rects_overlap(current.rect, other.rect):
                continue
            found_problem = True
            if current.field is other.field:
                # Same field: its own label and entry boxes collide.
                report.append(
                    "FAILURE: intersection between label and entry bounding boxes for "
                    f"`{current.field['description']}` ({current.rect}, {other.rect})"
                )
            else:
                report.append(
                    f"FAILURE: intersection between {current.rect_type} bounding box for "
                    f"`{current.field['description']}` ({current.rect}) and "
                    f"{other.rect_type} bounding box for "
                    f"`{other.field['description']}` ({other.rect})"
                )
            if len(report) >= 20:
                report.append(abort_notice)
                return report

        # The height check only applies to entry boxes that will hold text.
        if current.rect_type != "entry" or "entry_text" not in current.field:
            continue
        text_size = current.field["entry_text"].get("font_size", 14)
        box_height = current.rect[3] - current.rect[1]
        if box_height < text_size:
            found_problem = True
            report.append(
                f"FAILURE: entry bounding box height ({box_height}) for "
                f"`{current.field['description']}` is too short for the text content "
                f"(font size: {text_size}). "
                "Increase the box height or decrease the font size."
            )
            if len(report) >= 20:
                report.append(abort_notice)
                return report

    if not found_problem:
        report.append("SUCCESS: All bounding boxes are valid")
    return report
61
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the path to the
    # fields file, and prints every resulting message on its own line.
    if len(sys.argv) != 2:
        print("Usage: check_bounding_boxes.py [fields.json]")
        sys.exit(1)
    # Input file should be in the `fields.json` format described in forms.md.
    # JSON text is UTF-8; name the encoding explicitly so decoding does not
    # fall back to the platform's locale encoding, which would garble or
    # reject non-ASCII field descriptions on some systems.
    with open(sys.argv[1], encoding="utf-8") as f:
        messages = get_bounding_box_messages(f)
    for msg in messages:
        print(msg)
70 print(msg)