main
1#!/usr/bin/env python3
2"""
3Library for working with Word documents: comments, tracked changes, and editing.
4
5Usage:
6 from skills.docx.scripts.document import Document
7
8 # Initialize
9 doc = Document('workspace/unpacked')
10 doc = Document('workspace/unpacked', author="John Doe", initials="JD")
11
12 # Find nodes
13 node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
14 node = doc["word/document.xml"].get_node(tag="w:p", line_number=10)
15
16 # Add comments
17 doc.add_comment(start=node, end=node, text="Comment text")
18 doc.reply_to_comment(parent_comment_id=0, text="Reply text")
19
20 # Suggest tracked changes
21 doc["word/document.xml"].suggest_deletion(node) # Delete content
22 doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion
23 doc["word/document.xml"].revert_deletion(del_node) # Reject deletion
24
25 # Save
26 doc.save()
27"""
28
29import html
30import random
31import shutil
32import tempfile
33from datetime import datetime, timezone
34from pathlib import Path
35
36from defusedxml import minidom
37from ooxml.scripts.pack import pack_document
38from ooxml.scripts.validation.docx import DOCXSchemaValidator
39from ooxml.scripts.validation.redlining import RedliningValidator
40
41from .utilities import XMLEditor
42
43# Path to template files
44TEMPLATE_DIR = Path(__file__).parent / "templates"
45
46
47class DocxXMLEditor(XMLEditor):
48 """XMLEditor that automatically applies RSID, author, and date to new elements.
49
50 Automatically adds attributes to elements that support them when inserting new content:
51 - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements)
52 - w:author and w:date (for w:ins, w:del, w:comment elements)
53 - w:id (for w:ins and w:del elements)
54
55 Attributes:
56 dom (defusedxml.minidom.Document): The DOM document for direct manipulation
57 """
58
59 def __init__(
60 self, xml_path, rsid: str, author: str = "Claude", initials: str = "C"
61 ):
62 """Initialize with required RSID and optional author.
63
64 Args:
65 xml_path: Path to XML file to edit
66 rsid: RSID to automatically apply to new elements
67 author: Author name for tracked changes and comments (default: "Claude")
68 initials: Author initials (default: "C")
69 """
70 super().__init__(xml_path)
71 self.rsid = rsid
72 self.author = author
73 self.initials = initials
74
75 def _get_next_change_id(self):
76 """Get the next available change ID by checking all tracked change elements."""
77 max_id = -1
78 for tag in ("w:ins", "w:del"):
79 elements = self.dom.getElementsByTagName(tag)
80 for elem in elements:
81 change_id = elem.getAttribute("w:id")
82 if change_id:
83 try:
84 max_id = max(max_id, int(change_id))
85 except ValueError:
86 pass
87 return max_id + 1
88
89 def _ensure_w16du_namespace(self):
90 """Ensure w16du namespace is declared on the root element."""
91 root = self.dom.documentElement
92 if not root.hasAttribute("xmlns:w16du"): # type: ignore
93 root.setAttribute( # type: ignore
94 "xmlns:w16du",
95 "http://schemas.microsoft.com/office/word/2023/wordml/word16du",
96 )
97
98 def _ensure_w16cex_namespace(self):
99 """Ensure w16cex namespace is declared on the root element."""
100 root = self.dom.documentElement
101 if not root.hasAttribute("xmlns:w16cex"): # type: ignore
102 root.setAttribute( # type: ignore
103 "xmlns:w16cex",
104 "http://schemas.microsoft.com/office/word/2018/wordml/cex",
105 )
106
107 def _ensure_w14_namespace(self):
108 """Ensure w14 namespace is declared on the root element."""
109 root = self.dom.documentElement
110 if not root.hasAttribute("xmlns:w14"): # type: ignore
111 root.setAttribute( # type: ignore
112 "xmlns:w14",
113 "http://schemas.microsoft.com/office/word/2010/wordml",
114 )
115
116 def _inject_attributes_to_nodes(self, nodes):
117 """Inject RSID, author, and date attributes into DOM nodes where applicable.
118
119 Adds attributes to elements that support them:
120 - w:r: gets w:rsidR (or w:rsidDel if inside w:del)
121 - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId
122 - w:t: gets xml:space="preserve" if text has leading/trailing whitespace
123 - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc
124 - w:comment: gets w:author, w:date, w:initials
125 - w16cex:commentExtensible: gets w16cex:dateUtc
126
127 Args:
128 nodes: List of DOM nodes to process
129 """
130 from datetime import datetime, timezone
131
132 timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
133
134 def is_inside_deletion(elem):
135 """Check if element is inside a w:del element."""
136 parent = elem.parentNode
137 while parent:
138 if parent.nodeType == parent.ELEMENT_NODE and parent.tagName == "w:del":
139 return True
140 parent = parent.parentNode
141 return False
142
143 def add_rsid_to_p(elem):
144 if not elem.hasAttribute("w:rsidR"):
145 elem.setAttribute("w:rsidR", self.rsid)
146 if not elem.hasAttribute("w:rsidRDefault"):
147 elem.setAttribute("w:rsidRDefault", self.rsid)
148 if not elem.hasAttribute("w:rsidP"):
149 elem.setAttribute("w:rsidP", self.rsid)
150 # Add w14:paraId and w14:textId if not present
151 if not elem.hasAttribute("w14:paraId"):
152 self._ensure_w14_namespace()
153 elem.setAttribute("w14:paraId", _generate_hex_id())
154 if not elem.hasAttribute("w14:textId"):
155 self._ensure_w14_namespace()
156 elem.setAttribute("w14:textId", _generate_hex_id())
157
158 def add_rsid_to_r(elem):
159 # Use w:rsidDel for <w:r> inside <w:del>, otherwise w:rsidR
160 if is_inside_deletion(elem):
161 if not elem.hasAttribute("w:rsidDel"):
162 elem.setAttribute("w:rsidDel", self.rsid)
163 else:
164 if not elem.hasAttribute("w:rsidR"):
165 elem.setAttribute("w:rsidR", self.rsid)
166
167 def add_tracked_change_attrs(elem):
168 # Auto-assign w:id if not present
169 if not elem.hasAttribute("w:id"):
170 elem.setAttribute("w:id", str(self._get_next_change_id()))
171 if not elem.hasAttribute("w:author"):
172 elem.setAttribute("w:author", self.author)
173 if not elem.hasAttribute("w:date"):
174 elem.setAttribute("w:date", timestamp)
175 # Add w16du:dateUtc for tracked changes (same as w:date since we generate UTC timestamps)
176 if elem.tagName in ("w:ins", "w:del") and not elem.hasAttribute(
177 "w16du:dateUtc"
178 ):
179 self._ensure_w16du_namespace()
180 elem.setAttribute("w16du:dateUtc", timestamp)
181
182 def add_comment_attrs(elem):
183 if not elem.hasAttribute("w:author"):
184 elem.setAttribute("w:author", self.author)
185 if not elem.hasAttribute("w:date"):
186 elem.setAttribute("w:date", timestamp)
187 if not elem.hasAttribute("w:initials"):
188 elem.setAttribute("w:initials", self.initials)
189
190 def add_comment_extensible_date(elem):
191 # Add w16cex:dateUtc for comment extensible elements
192 if not elem.hasAttribute("w16cex:dateUtc"):
193 self._ensure_w16cex_namespace()
194 elem.setAttribute("w16cex:dateUtc", timestamp)
195
196 def add_xml_space_to_t(elem):
197 # Add xml:space="preserve" to w:t if text has leading/trailing whitespace
198 if (
199 elem.firstChild
200 and elem.firstChild.nodeType == elem.firstChild.TEXT_NODE
201 ):
202 text = elem.firstChild.data
203 if text and (text[0].isspace() or text[-1].isspace()):
204 if not elem.hasAttribute("xml:space"):
205 elem.setAttribute("xml:space", "preserve")
206
207 for node in nodes:
208 if node.nodeType != node.ELEMENT_NODE:
209 continue
210
211 # Handle the node itself
212 if node.tagName == "w:p":
213 add_rsid_to_p(node)
214 elif node.tagName == "w:r":
215 add_rsid_to_r(node)
216 elif node.tagName == "w:t":
217 add_xml_space_to_t(node)
218 elif node.tagName in ("w:ins", "w:del"):
219 add_tracked_change_attrs(node)
220 elif node.tagName == "w:comment":
221 add_comment_attrs(node)
222 elif node.tagName == "w16cex:commentExtensible":
223 add_comment_extensible_date(node)
224
225 # Process descendants (getElementsByTagName doesn't return the element itself)
226 for elem in node.getElementsByTagName("w:p"):
227 add_rsid_to_p(elem)
228 for elem in node.getElementsByTagName("w:r"):
229 add_rsid_to_r(elem)
230 for elem in node.getElementsByTagName("w:t"):
231 add_xml_space_to_t(elem)
232 for tag in ("w:ins", "w:del"):
233 for elem in node.getElementsByTagName(tag):
234 add_tracked_change_attrs(elem)
235 for elem in node.getElementsByTagName("w:comment"):
236 add_comment_attrs(elem)
237 for elem in node.getElementsByTagName("w16cex:commentExtensible"):
238 add_comment_extensible_date(elem)
239
240 def replace_node(self, elem, new_content):
241 """Replace node with automatic attribute injection."""
242 nodes = super().replace_node(elem, new_content)
243 self._inject_attributes_to_nodes(nodes)
244 return nodes
245
246 def insert_after(self, elem, xml_content):
247 """Insert after with automatic attribute injection."""
248 nodes = super().insert_after(elem, xml_content)
249 self._inject_attributes_to_nodes(nodes)
250 return nodes
251
252 def insert_before(self, elem, xml_content):
253 """Insert before with automatic attribute injection."""
254 nodes = super().insert_before(elem, xml_content)
255 self._inject_attributes_to_nodes(nodes)
256 return nodes
257
258 def append_to(self, elem, xml_content):
259 """Append to with automatic attribute injection."""
260 nodes = super().append_to(elem, xml_content)
261 self._inject_attributes_to_nodes(nodes)
262 return nodes
263
264 def revert_insertion(self, elem):
265 """Reject an insertion by wrapping its content in a deletion.
266
267 Wraps all runs inside w:ins in w:del, converting w:t to w:delText.
268 Can process a single w:ins element or a container element with multiple w:ins.
269
270 Args:
271 elem: Element to process (w:ins, w:p, w:body, etc.)
272
273 Returns:
274 list: List containing the processed element(s)
275
276 Raises:
277 ValueError: If the element contains no w:ins elements
278
279 Example:
280 # Reject a single insertion
281 ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
282 doc["word/document.xml"].revert_insertion(ins)
283
284 # Reject all insertions in a paragraph
285 para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
286 doc["word/document.xml"].revert_insertion(para)
287 """
288 # Collect insertions
289 ins_elements = []
290 if elem.tagName == "w:ins":
291 ins_elements.append(elem)
292 else:
293 ins_elements.extend(elem.getElementsByTagName("w:ins"))
294
295 # Validate that there are insertions to reject
296 if not ins_elements:
297 raise ValueError(
298 f"revert_insertion requires w:ins elements. "
299 f"The provided element <{elem.tagName}> contains no insertions. "
300 )
301
302 # Process all insertions - wrap all children in w:del
303 for ins_elem in ins_elements:
304 runs = list(ins_elem.getElementsByTagName("w:r"))
305 if not runs:
306 continue
307
308 # Create deletion wrapper
309 del_wrapper = self.dom.createElement("w:del")
310
311 # Process each run
312 for run in runs:
313 # Convert w:t → w:delText and w:rsidR → w:rsidDel
314 if run.hasAttribute("w:rsidR"):
315 run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
316 run.removeAttribute("w:rsidR")
317 elif not run.hasAttribute("w:rsidDel"):
318 run.setAttribute("w:rsidDel", self.rsid)
319
320 for t_elem in list(run.getElementsByTagName("w:t")):
321 del_text = self.dom.createElement("w:delText")
322 # Copy ALL child nodes (not just firstChild) to handle entities
323 while t_elem.firstChild:
324 del_text.appendChild(t_elem.firstChild)
325 for i in range(t_elem.attributes.length):
326 attr = t_elem.attributes.item(i)
327 del_text.setAttribute(attr.name, attr.value)
328 t_elem.parentNode.replaceChild(del_text, t_elem)
329
330 # Move all children from ins to del wrapper
331 while ins_elem.firstChild:
332 del_wrapper.appendChild(ins_elem.firstChild)
333
334 # Add del wrapper back to ins
335 ins_elem.appendChild(del_wrapper)
336
337 # Inject attributes to the deletion wrapper
338 self._inject_attributes_to_nodes([del_wrapper])
339
340 return [elem]
341
342 def revert_deletion(self, elem):
343 """Reject a deletion by re-inserting the deleted content.
344
345 Creates w:ins elements after each w:del, copying deleted content and
346 converting w:delText back to w:t.
347 Can process a single w:del element or a container element with multiple w:del.
348
349 Args:
350 elem: Element to process (w:del, w:p, w:body, etc.)
351
352 Returns:
353 list: If elem is w:del, returns [elem, new_ins]. Otherwise returns [elem].
354
355 Raises:
356 ValueError: If the element contains no w:del elements
357
358 Example:
359 # Reject a single deletion - returns [w:del, w:ins]
360 del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"})
361 nodes = doc["word/document.xml"].revert_deletion(del_elem)
362
363 # Reject all deletions in a paragraph - returns [para]
364 para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
365 nodes = doc["word/document.xml"].revert_deletion(para)
366 """
367 # Collect deletions FIRST - before we modify the DOM
368 del_elements = []
369 is_single_del = elem.tagName == "w:del"
370
371 if is_single_del:
372 del_elements.append(elem)
373 else:
374 del_elements.extend(elem.getElementsByTagName("w:del"))
375
376 # Validate that there are deletions to reject
377 if not del_elements:
378 raise ValueError(
379 f"revert_deletion requires w:del elements. "
380 f"The provided element <{elem.tagName}> contains no deletions. "
381 )
382
383 # Track created insertion (only relevant if elem is a single w:del)
384 created_insertion = None
385
386 # Process all deletions - create insertions that copy the deleted content
387 for del_elem in del_elements:
388 # Clone the deleted runs and convert them to insertions
389 runs = list(del_elem.getElementsByTagName("w:r"))
390 if not runs:
391 continue
392
393 # Create insertion wrapper
394 ins_elem = self.dom.createElement("w:ins")
395
396 for run in runs:
397 # Clone the run
398 new_run = run.cloneNode(True)
399
400 # Convert w:delText → w:t
401 for del_text in list(new_run.getElementsByTagName("w:delText")):
402 t_elem = self.dom.createElement("w:t")
403 # Copy ALL child nodes (not just firstChild) to handle entities
404 while del_text.firstChild:
405 t_elem.appendChild(del_text.firstChild)
406 for i in range(del_text.attributes.length):
407 attr = del_text.attributes.item(i)
408 t_elem.setAttribute(attr.name, attr.value)
409 del_text.parentNode.replaceChild(t_elem, del_text)
410
411 # Update run attributes: w:rsidDel → w:rsidR
412 if new_run.hasAttribute("w:rsidDel"):
413 new_run.setAttribute("w:rsidR", new_run.getAttribute("w:rsidDel"))
414 new_run.removeAttribute("w:rsidDel")
415 elif not new_run.hasAttribute("w:rsidR"):
416 new_run.setAttribute("w:rsidR", self.rsid)
417
418 ins_elem.appendChild(new_run)
419
420 # Insert the new insertion after the deletion
421 nodes = self.insert_after(del_elem, ins_elem.toxml())
422
423 # If processing a single w:del, track the created insertion
424 if is_single_del and nodes:
425 created_insertion = nodes[0]
426
427 # Return based on input type
428 if is_single_del and created_insertion:
429 return [elem, created_insertion]
430 else:
431 return [elem]
432
433 @staticmethod
434 def suggest_paragraph(xml_content: str) -> str:
435 """Transform paragraph XML to add tracked change wrapping for insertion.
436
437 Wraps runs in <w:ins> and adds <w:ins/> to w:rPr in w:pPr for numbered lists.
438
439 Args:
440 xml_content: XML string containing a <w:p> element
441
442 Returns:
443 str: Transformed XML with tracked change wrapping
444 """
445 wrapper = f'<root xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">{xml_content}</root>'
446 doc = minidom.parseString(wrapper)
447 para = doc.getElementsByTagName("w:p")[0]
448
449 # Ensure w:pPr exists
450 pPr_list = para.getElementsByTagName("w:pPr")
451 if not pPr_list:
452 pPr = doc.createElement("w:pPr")
453 para.insertBefore(
454 pPr, para.firstChild
455 ) if para.firstChild else para.appendChild(pPr)
456 else:
457 pPr = pPr_list[0]
458
459 # Ensure w:rPr exists in w:pPr
460 rPr_list = pPr.getElementsByTagName("w:rPr")
461 if not rPr_list:
462 rPr = doc.createElement("w:rPr")
463 pPr.appendChild(rPr)
464 else:
465 rPr = rPr_list[0]
466
467 # Add <w:ins/> to w:rPr
468 ins_marker = doc.createElement("w:ins")
469 rPr.insertBefore(
470 ins_marker, rPr.firstChild
471 ) if rPr.firstChild else rPr.appendChild(ins_marker)
472
473 # Wrap all non-pPr children in <w:ins>
474 ins_wrapper = doc.createElement("w:ins")
475 for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]:
476 para.removeChild(child)
477 ins_wrapper.appendChild(child)
478 para.appendChild(ins_wrapper)
479
480 return para.toxml()
481
482 def suggest_deletion(self, elem):
483 """Mark a w:r or w:p element as deleted with tracked changes (in-place DOM manipulation).
484
485 For w:r: wraps in <w:del>, converts <w:t> to <w:delText>, preserves w:rPr
486 For w:p (regular): wraps content in <w:del>, converts <w:t> to <w:delText>
487 For w:p (numbered list): adds <w:del/> to w:rPr in w:pPr, wraps content in <w:del>
488
489 Args:
490 elem: A w:r or w:p DOM element without existing tracked changes
491
492 Returns:
493 Element: The modified element
494
495 Raises:
496 ValueError: If element has existing tracked changes or invalid structure
497 """
498 if elem.nodeName == "w:r":
499 # Check for existing w:delText
500 if elem.getElementsByTagName("w:delText"):
501 raise ValueError("w:r element already contains w:delText")
502
503 # Convert w:t → w:delText
504 for t_elem in list(elem.getElementsByTagName("w:t")):
505 del_text = self.dom.createElement("w:delText")
506 # Copy ALL child nodes (not just firstChild) to handle entities
507 while t_elem.firstChild:
508 del_text.appendChild(t_elem.firstChild)
509 # Preserve attributes like xml:space
510 for i in range(t_elem.attributes.length):
511 attr = t_elem.attributes.item(i)
512 del_text.setAttribute(attr.name, attr.value)
513 t_elem.parentNode.replaceChild(del_text, t_elem)
514
515 # Update run attributes: w:rsidR → w:rsidDel
516 if elem.hasAttribute("w:rsidR"):
517 elem.setAttribute("w:rsidDel", elem.getAttribute("w:rsidR"))
518 elem.removeAttribute("w:rsidR")
519 elif not elem.hasAttribute("w:rsidDel"):
520 elem.setAttribute("w:rsidDel", self.rsid)
521
522 # Wrap in w:del
523 del_wrapper = self.dom.createElement("w:del")
524 parent = elem.parentNode
525 parent.insertBefore(del_wrapper, elem)
526 parent.removeChild(elem)
527 del_wrapper.appendChild(elem)
528
529 # Inject attributes to the deletion wrapper
530 self._inject_attributes_to_nodes([del_wrapper])
531
532 return del_wrapper
533
534 elif elem.nodeName == "w:p":
535 # Check for existing tracked changes
536 if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"):
537 raise ValueError("w:p element already contains tracked changes")
538
539 # Check if it's a numbered list item
540 pPr_list = elem.getElementsByTagName("w:pPr")
541 is_numbered = pPr_list and pPr_list[0].getElementsByTagName("w:numPr")
542
543 if is_numbered:
544 # Add <w:del/> to w:rPr in w:pPr
545 pPr = pPr_list[0]
546 rPr_list = pPr.getElementsByTagName("w:rPr")
547
548 if not rPr_list:
549 rPr = self.dom.createElement("w:rPr")
550 pPr.appendChild(rPr)
551 else:
552 rPr = rPr_list[0]
553
554 # Add <w:del/> marker
555 del_marker = self.dom.createElement("w:del")
556 rPr.insertBefore(
557 del_marker, rPr.firstChild
558 ) if rPr.firstChild else rPr.appendChild(del_marker)
559
560 # Convert w:t → w:delText in all runs
561 for t_elem in list(elem.getElementsByTagName("w:t")):
562 del_text = self.dom.createElement("w:delText")
563 # Copy ALL child nodes (not just firstChild) to handle entities
564 while t_elem.firstChild:
565 del_text.appendChild(t_elem.firstChild)
566 # Preserve attributes like xml:space
567 for i in range(t_elem.attributes.length):
568 attr = t_elem.attributes.item(i)
569 del_text.setAttribute(attr.name, attr.value)
570 t_elem.parentNode.replaceChild(del_text, t_elem)
571
572 # Update run attributes: w:rsidR → w:rsidDel
573 for run in elem.getElementsByTagName("w:r"):
574 if run.hasAttribute("w:rsidR"):
575 run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
576 run.removeAttribute("w:rsidR")
577 elif not run.hasAttribute("w:rsidDel"):
578 run.setAttribute("w:rsidDel", self.rsid)
579
580 # Wrap all non-pPr children in <w:del>
581 del_wrapper = self.dom.createElement("w:del")
582 for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]:
583 elem.removeChild(child)
584 del_wrapper.appendChild(child)
585 elem.appendChild(del_wrapper)
586
587 # Inject attributes to the deletion wrapper
588 self._inject_attributes_to_nodes([del_wrapper])
589
590 return elem
591
592 else:
593 raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}")
594
595
596def _generate_hex_id() -> str:
597 """Generate random 8-character hex ID for para/durable IDs.
598
599 Values are constrained to be less than 0x7FFFFFFF per OOXML spec:
600 - paraId must be < 0x80000000
601 - durableId must be < 0x7FFFFFFF
602 We use the stricter constraint (0x7FFFFFFF) for both.
603 """
604 return f"{random.randint(1, 0x7FFFFFFE):08X}"
605
606
607def _generate_rsid() -> str:
608 """Generate random 8-character hex RSID."""
609 return "".join(random.choices("0123456789ABCDEF", k=8))
610
611
612class Document:
613 """Manages comments in unpacked Word documents."""
614
615 def __init__(
616 self,
617 unpacked_dir,
618 rsid=None,
619 track_revisions=False,
620 author="Claude",
621 initials="C",
622 ):
623 """
624 Initialize with path to unpacked Word document directory.
625 Automatically sets up comment infrastructure (people.xml, RSIDs).
626
627 Args:
628 unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory)
629 rsid: Optional RSID to use for all comment elements. If not provided, one will be generated.
630 track_revisions: If True, enables track revisions in settings.xml (default: False)
631 author: Default author name for comments (default: "Claude")
632 initials: Default author initials for comments (default: "C")
633 """
634 self.original_path = Path(unpacked_dir)
635
636 if not self.original_path.exists() or not self.original_path.is_dir():
637 raise ValueError(f"Directory not found: {unpacked_dir}")
638
639 # Create temporary directory with subdirectories for unpacked content and baseline
640 self.temp_dir = tempfile.mkdtemp(prefix="docx_")
641 self.unpacked_path = Path(self.temp_dir) / "unpacked"
642 shutil.copytree(self.original_path, self.unpacked_path)
643
644 # Pack original directory into temporary .docx for validation baseline (outside unpacked dir)
645 self.original_docx = Path(self.temp_dir) / "original.docx"
646 pack_document(self.original_path, self.original_docx, validate=False)
647
648 self.word_path = self.unpacked_path / "word"
649
650 # Generate RSID if not provided
651 self.rsid = rsid if rsid else _generate_rsid()
652 print(f"Using RSID: {self.rsid}")
653
654 # Set default author and initials
655 self.author = author
656 self.initials = initials
657
658 # Cache for lazy-loaded editors
659 self._editors = {}
660
661 # Comment file paths
662 self.comments_path = self.word_path / "comments.xml"
663 self.comments_extended_path = self.word_path / "commentsExtended.xml"
664 self.comments_ids_path = self.word_path / "commentsIds.xml"
665 self.comments_extensible_path = self.word_path / "commentsExtensible.xml"
666
667 # Load existing comments and determine next ID (before setup modifies files)
668 self.existing_comments = self._load_existing_comments()
669 self.next_comment_id = self._get_next_comment_id()
670
671 # Convenient access to document.xml editor (semi-private)
672 self._document = self["word/document.xml"]
673
674 # Setup tracked changes infrastructure
675 self._setup_tracking(track_revisions=track_revisions)
676
677 # Add author to people.xml
678 self._add_author_to_people(author)
679
680 def __getitem__(self, xml_path: str) -> DocxXMLEditor:
681 """
682 Get or create a DocxXMLEditor for the specified XML file.
683
684 Enables lazy-loaded editors with bracket notation:
685 node = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
686
687 Args:
688 xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml")
689
690 Returns:
691 DocxXMLEditor instance for the specified file
692
693 Raises:
694 ValueError: If the file does not exist
695
696 Example:
697 # Get node from document.xml
698 node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
699
700 # Get node from comments.xml
701 comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"})
702 """
703 if xml_path not in self._editors:
704 file_path = self.unpacked_path / xml_path
705 if not file_path.exists():
706 raise ValueError(f"XML file not found: {xml_path}")
707 # Use DocxXMLEditor with RSID, author, and initials for all editors
708 self._editors[xml_path] = DocxXMLEditor(
709 file_path, rsid=self.rsid, author=self.author, initials=self.initials
710 )
711 return self._editors[xml_path]
712
713 def add_comment(self, start, end, text: str) -> int:
714 """
715 Add a comment spanning from one element to another.
716
717 Args:
718 start: DOM element for the starting point
719 end: DOM element for the ending point
720 text: Comment content
721
722 Returns:
723 The comment ID that was created
724
725 Example:
726 start_node = cm.get_document_node(tag="w:del", id="1")
727 end_node = cm.get_document_node(tag="w:ins", id="2")
728 cm.add_comment(start=start_node, end=end_node, text="Explanation")
729 """
730 comment_id = self.next_comment_id
731 para_id = _generate_hex_id()
732 durable_id = _generate_hex_id()
733 timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
734
735 # Add comment ranges to document.xml immediately
736 self._document.insert_before(start, self._comment_range_start_xml(comment_id))
737
738 # If end node is a paragraph, append comment markup inside it
739 # Otherwise insert after it (for run-level anchors)
740 if end.tagName == "w:p":
741 self._document.append_to(end, self._comment_range_end_xml(comment_id))
742 else:
743 self._document.insert_after(end, self._comment_range_end_xml(comment_id))
744
745 # Add to comments.xml immediately
746 self._add_to_comments_xml(
747 comment_id, para_id, text, self.author, self.initials, timestamp
748 )
749
750 # Add to commentsExtended.xml immediately
751 self._add_to_comments_extended_xml(para_id, parent_para_id=None)
752
753 # Add to commentsIds.xml immediately
754 self._add_to_comments_ids_xml(para_id, durable_id)
755
756 # Add to commentsExtensible.xml immediately
757 self._add_to_comments_extensible_xml(durable_id)
758
759 # Update existing_comments so replies work
760 self.existing_comments[comment_id] = {"para_id": para_id}
761
762 self.next_comment_id += 1
763 return comment_id
764
765 def reply_to_comment(
766 self,
767 parent_comment_id: int,
768 text: str,
769 ) -> int:
770 """
771 Add a reply to an existing comment.
772
773 Args:
774 parent_comment_id: The w:id of the parent comment to reply to
775 text: Reply text
776
777 Returns:
778 The comment ID that was created for the reply
779
780 Example:
781 cm.reply_to_comment(parent_comment_id=0, text="I agree with this change")
782 """
783 if parent_comment_id not in self.existing_comments:
784 raise ValueError(f"Parent comment with id={parent_comment_id} not found")
785
786 parent_info = self.existing_comments[parent_comment_id]
787 comment_id = self.next_comment_id
788 para_id = _generate_hex_id()
789 durable_id = _generate_hex_id()
790 timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
791
792 # Add comment ranges to document.xml immediately
793 parent_start_elem = self._document.get_node(
794 tag="w:commentRangeStart", attrs={"w:id": str(parent_comment_id)}
795 )
796 parent_ref_elem = self._document.get_node(
797 tag="w:commentReference", attrs={"w:id": str(parent_comment_id)}
798 )
799
800 self._document.insert_after(
801 parent_start_elem, self._comment_range_start_xml(comment_id)
802 )
803 parent_ref_run = parent_ref_elem.parentNode
804 self._document.insert_after(
805 parent_ref_run, f'<w:commentRangeEnd w:id="{comment_id}"/>'
806 )
807 self._document.insert_after(
808 parent_ref_run, self._comment_ref_run_xml(comment_id)
809 )
810
811 # Add to comments.xml immediately
812 self._add_to_comments_xml(
813 comment_id, para_id, text, self.author, self.initials, timestamp
814 )
815
816 # Add to commentsExtended.xml immediately (with parent)
817 self._add_to_comments_extended_xml(
818 para_id, parent_para_id=parent_info["para_id"]
819 )
820
821 # Add to commentsIds.xml immediately
822 self._add_to_comments_ids_xml(para_id, durable_id)
823
824 # Add to commentsExtensible.xml immediately
825 self._add_to_comments_extensible_xml(durable_id)
826
827 # Update existing_comments so replies work
828 self.existing_comments[comment_id] = {"para_id": para_id}
829
830 self.next_comment_id += 1
831 return comment_id
832
833 def __del__(self):
834 """Clean up temporary directory on deletion."""
835 if hasattr(self, "temp_dir") and Path(self.temp_dir).exists():
836 shutil.rmtree(self.temp_dir)
837
838 def validate(self) -> None:
839 """
840 Validate the document against XSD schema and redlining rules.
841
842 Raises:
843 ValueError: If validation fails.
844 """
845 # Create validators with current state
846 schema_validator = DOCXSchemaValidator(
847 self.unpacked_path, self.original_docx, verbose=False
848 )
849 redlining_validator = RedliningValidator(
850 self.unpacked_path, self.original_docx, verbose=False
851 )
852
853 # Run validations
854 if not schema_validator.validate():
855 raise ValueError("Schema validation failed")
856 if not redlining_validator.validate():
857 raise ValueError("Redlining validation failed")
858
859 def save(self, destination=None, validate=True) -> None:
860 """
861 Save all modified XML files to disk and copy to destination directory.
862
863 This persists all changes made via add_comment() and reply_to_comment().
864
865 Args:
866 destination: Optional path to save to. If None, saves back to original directory.
867 validate: If True, validates document before saving (default: True).
868 """
869 # Only ensure comment relationships and content types if comment files exist
870 if self.comments_path.exists():
871 self._ensure_comment_relationships()
872 self._ensure_comment_content_types()
873
874 # Save all modified XML files in temp directory
875 for editor in self._editors.values():
876 editor.save()
877
878 # Validate by default
879 if validate:
880 self.validate()
881
882 # Copy contents from temp directory to destination (or original directory)
883 target_path = Path(destination) if destination else self.original_path
884 shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True)
885
886 # ==================== Private: Initialization ====================
887
888 def _get_next_comment_id(self):
889 """Get the next available comment ID."""
890 if not self.comments_path.exists():
891 return 0
892
893 editor = self["word/comments.xml"]
894 max_id = -1
895 for comment_elem in editor.dom.getElementsByTagName("w:comment"):
896 comment_id = comment_elem.getAttribute("w:id")
897 if comment_id:
898 try:
899 max_id = max(max_id, int(comment_id))
900 except ValueError:
901 pass
902 return max_id + 1
903
904 def _load_existing_comments(self):
905 """Load existing comments from files to enable replies."""
906 if not self.comments_path.exists():
907 return {}
908
909 editor = self["word/comments.xml"]
910 existing = {}
911
912 for comment_elem in editor.dom.getElementsByTagName("w:comment"):
913 comment_id = comment_elem.getAttribute("w:id")
914 if not comment_id:
915 continue
916
917 # Find para_id from the w:p element within the comment
918 para_id = None
919 for p_elem in comment_elem.getElementsByTagName("w:p"):
920 para_id = p_elem.getAttribute("w14:paraId")
921 if para_id:
922 break
923
924 if not para_id:
925 continue
926
927 existing[int(comment_id)] = {"para_id": para_id}
928
929 return existing
930
931 # ==================== Private: Setup Methods ====================
932
def _setup_tracking(self, track_revisions=False):
    """Set up comment/tracking infrastructure in the unpacked directory.

    Runs, in order: people.xml creation, its content-type registration,
    its document relationship, and the settings.xml update (which always
    adds the RSID).

    Args:
        track_revisions: If True, enables track revisions in settings.xml
    """
    word_dir = self.word_path
    content_types_file = self.unpacked_path / "[Content_Types].xml"
    rels_file = word_dir / "_rels" / "document.xml.rels"
    settings_file = word_dir / "settings.xml"

    # Create word/people.xml when missing, then register it in the package.
    self._update_people_xml(word_dir / "people.xml")
    self._add_content_type_for_people(content_types_file)
    self._add_relationship_for_people(rels_file)

    # RSID is always written; trackRevisions only on request.
    self._update_settings(settings_file, track_revisions=track_revisions)
953
def _update_people_xml(self, path):
    """Create people.xml at *path* from the bundled template if it is missing.

    An existing file is left untouched.
    """
    if path.exists():
        return
    template = TEMPLATE_DIR / "people.xml"
    shutil.copy(template, path)
959
def _add_content_type_for_people(self, path):
    """Register /word/people.xml in [Content_Types].xml unless already present.

    Args:
        path: Not used by this method; the editor is obtained by its
            package-relative name instead.
    """
    content_types = self["[Content_Types].xml"]
    if not self._has_override(content_types, "/word/people.xml"):
        # Append the Override entry directly under the document root.
        content_types.append_to(
            content_types.dom.documentElement,
            '<Override PartName="/word/people.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.people+xml"/>',
        )
971
def _add_relationship_for_people(self, path):
    """Add the people.xml relationship to document.xml.rels if it is absent.

    Args:
        path: Not used by this method; the editor is obtained by its
            package-relative name instead.
    """
    rels = self["word/_rels/document.xml.rels"]
    if self._has_relationship(rels, "people.xml"):
        return

    relationships = rels.dom.documentElement
    tag = relationships.tagName  # type: ignore
    # Reuse the root element's namespace prefix (if any) for the new entry.
    ns_prefix = tag.split(":")[0] + ":" if ":" in tag else ""
    rid = rels.get_next_rid()

    rels.append_to(
        relationships,
        f'<{ns_prefix}Relationship Id="{rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>',
    )
987
def _update_settings(self, path, track_revisions=False):
    """Record this session's RSID in settings.xml, optionally enabling tracking.

    Args:
        path: Path to settings.xml (not used by this method; the editor is
            obtained by its package-relative name)
        track_revisions: If True, adds a trackRevisions element when none exists

    Placement follows OOXML schema order:
    - trackRevisions: early (before documentProtection or defaultTabStop,
      otherwise as the first child of settings)
    - rsids: late (after compat, otherwise before clrSchemeMapping,
      otherwise appended to settings)
    """
    editor = self["word/settings.xml"]
    settings = editor.get_node(tag="w:settings")
    ns = settings.tagName.split(":")[0] if ":" in settings.tagName else "w"

    def find(local_name):
        # All elements in the document with the given prefixed tag name.
        return editor.dom.getElementsByTagName(f"{ns}:{local_name}")

    # --- trackRevisions (only on request, only when not already present) ---
    if track_revisions and not find("trackRevisions"):
        track_rev_xml = f"<{ns}:trackRevisions/>"
        anchors = find("documentProtection") or find("defaultTabStop")
        if anchors:
            editor.insert_before(anchors[0], track_rev_xml)
        elif settings.firstChild:
            editor.insert_before(settings.firstChild, track_rev_xml)
        else:
            editor.append_to(settings, track_rev_xml)

    # --- rsids ---
    existing_sections = find("rsids")
    if existing_sections:
        # Section exists: add our RSID only if it is not listed yet.
        section = existing_sections[0]
        for entry in section.getElementsByTagName(f"{ns}:rsid"):
            if entry.getAttribute(f"{ns}:val") == self.rsid:
                break
        else:
            editor.append_to(section, f'<{ns}:rsid {ns}:val="{self.rsid}"/>')
        return

    # No section yet: create one with this RSID as both root and entry.
    rsids_xml = f'''<{ns}:rsids>
  <{ns}:rsidRoot {ns}:val="{self.rsid}"/>
  <{ns}:rsid {ns}:val="{self.rsid}"/>
</{ns}:rsids>'''

    compat = find("compat")
    if compat:
        editor.insert_after(compat[0], rsids_xml)
        return

    color_scheme = find("clrSchemeMapping")
    if color_scheme:
        editor.insert_before(color_scheme[0], rsids_xml)
        return

    editor.append_to(settings, rsids_xml)
1065
1066 # ==================== Private: XML File Creation ====================
1067
def _add_to_comments_xml(
    self, comment_id, para_id, text, author, initials, timestamp
):
    """Add a single comment to comments.xml.

    Creates comments.xml from the bundled template when it does not exist.

    Args:
        comment_id: Value for the comment's ``w:id`` attribute
        para_id: Value for ``w14:paraId`` on the comment's paragraph
        text: Plain comment text; XML-escaped before insertion
        author: Not used here (see note in the body)
        initials: Not used here (see note in the body)
        timestamp: Not used here (see note in the body)
    """
    if not self.comments_path.exists():
        shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path)

    editor = self["word/comments.xml"]
    root = editor.get_node(tag="w:comments")

    # Escape &, < and > so arbitrary text cannot break the XML fragment.
    # The text is element content, so quotes need no escaping.
    escaped_text = html.escape(text, quote=False)
    # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r,
    # and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor
    comment_xml = f'''<w:comment w:id="{comment_id}">
  <w:p w14:paraId="{para_id}" w14:textId="77777777">
    <w:r><w:rPr><w:rStyle w:val="CommentReference"/></w:rPr><w:annotationRef/></w:r>
    <w:r><w:rPr><w:color w:val="000000"/><w:sz w:val="20"/><w:szCs w:val="20"/></w:rPr><w:t>{escaped_text}</w:t></w:r>
  </w:p>
</w:comment>'''
    editor.append_to(root, comment_xml)
1090
def _add_to_comments_extended_xml(self, para_id, parent_para_id):
    """Add a single comment entry to commentsExtended.xml.

    Creates the file from the bundled template when it does not exist.

    Args:
        para_id: Paragraph ID of the comment being registered
        parent_para_id: Paragraph ID of the parent comment; when falsy the
            entry is written without a ``w15:paraIdParent`` attribute
    """
    target = self.comments_extended_path
    if not target.exists():
        shutil.copy(TEMPLATE_DIR / "commentsExtended.xml", target)

    editor = self["word/commentsExtended.xml"]
    container = editor.get_node(tag="w15:commentsEx")

    # The parent attribute is the only difference between the two forms.
    parent_attr = f' w15:paraIdParent="{parent_para_id}"' if parent_para_id else ""
    editor.append_to(
        container,
        f'<w15:commentEx w15:paraId="{para_id}"{parent_attr} w15:done="0"/>',
    )
1106
def _add_to_comments_ids_xml(self, para_id, durable_id):
    """Add a single comment entry to commentsIds.xml.

    Creates the file from the bundled template when it does not exist.

    Args:
        para_id: Value for the entry's ``w16cid:paraId`` attribute
        durable_id: Value for the entry's ``w16cid:durableId`` attribute
    """
    ids_file = self.comments_ids_path
    if not ids_file.exists():
        shutil.copy(TEMPLATE_DIR / "commentsIds.xml", ids_file)

    editor = self["word/commentsIds.xml"]
    container = editor.get_node(tag="w16cid:commentsIds")
    entry = f'<w16cid:commentId w16cid:paraId="{para_id}" w16cid:durableId="{durable_id}"/>'
    editor.append_to(container, entry)
1117
def _add_to_comments_extensible_xml(self, durable_id):
    """Add a single comment entry to commentsExtensible.xml.

    Creates the file from the bundled template when it does not exist.

    Args:
        durable_id: Value for the entry's ``w16cex:durableId`` attribute
    """
    extensible_file = self.comments_extensible_path
    if not extensible_file.exists():
        shutil.copy(TEMPLATE_DIR / "commentsExtensible.xml", extensible_file)

    editor = self["word/commentsExtensible.xml"]
    container = editor.get_node(tag="w16cex:commentsExtensible")
    entry = f'<w16cex:commentExtensible w16cex:durableId="{durable_id}"/>'
    editor.append_to(container, entry)
1130
1131 # ==================== Private: XML Fragments ====================
1132
def _comment_range_start_xml(self, comment_id):
    """Build the ``w:commentRangeStart`` marker for the given comment ID."""
    template = '<w:commentRangeStart w:id="{}"/>'
    return template.format(comment_id)
1136
def _comment_range_end_xml(self, comment_id):
    """Build the ``w:commentRangeEnd`` marker followed by its reference run.

    Note: w:rsidR is automatically added by DocxXMLEditor.
    """
    lines = (
        f'<w:commentRangeEnd w:id="{comment_id}"/>',
        "<w:r>",
        '  <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>',
        f'  <w:commentReference w:id="{comment_id}"/>',
        "</w:r>",
    )
    return "\n".join(lines)
1147
def _comment_ref_run_xml(self, comment_id):
    """Build the run that holds the ``w:commentReference`` for a comment.

    Note: w:rsidR is automatically added by DocxXMLEditor.
    """
    lines = (
        "<w:r>",
        '  <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>',
        f'  <w:commentReference w:id="{comment_id}"/>',
        "</w:r>",
    )
    return "\n".join(lines)
1157
1158 # ==================== Private: Metadata Updates ====================
1159
def _has_relationship(self, editor, target):
    """Check if a relationship with the given target exists.

    Elements are matched on their local name, so both ``<Relationship>``
    and namespace-prefixed forms such as ``<r:Relationship>`` are found.
    The callers that add relationships support a prefixed root element,
    so the existence check has to recognize prefixed entries as well;
    otherwise duplicates would be appended.

    Args:
        editor: Editor whose ``dom`` is the parsed .rels document
        target: Value to compare against each element's ``Target`` attribute

    Returns:
        True if any Relationship element has ``Target == target``.
    """
    return any(
        elem.tagName.rpartition(":")[2] == "Relationship"
        and elem.getAttribute("Target") == target
        for elem in editor.dom.getElementsByTagName("*")
    )
1166
def _has_override(self, editor, part_name):
    """Return True if an ``Override`` element with this ``PartName`` exists."""
    overrides = editor.dom.getElementsByTagName("Override")
    return any(node.getAttribute("PartName") == part_name for node in overrides)
1173
def _has_author(self, editor, author):
    """Return True if people.xml already lists a ``w15:person`` for *author*."""
    people = editor.dom.getElementsByTagName("w15:person")
    return any(person.getAttribute("w15:author") == author for person in people)
1180
def _add_author_to_people(self, author):
    """Add author to people.xml (called during initialization).

    Does nothing when the author is already listed.

    Args:
        author: Author name; XML-escaped before being written

    Raises:
        ValueError: If word/people.xml does not exist.
    """
    # people.xml should already exist from _setup_tracking
    if not (self.word_path / "people.xml").exists():
        raise ValueError("people.xml should exist after _setup_tracking")

    editor = self["word/people.xml"]
    people = editor.get_node(tag="w15:people")

    if not self._has_author(editor, author):
        # Escape (including quotes) because the name lands in attribute values.
        safe_author = html.escape(author, quote=True)
        entry = "\n".join(
            (
                f'<w15:person w15:author="{safe_author}">',
                f'  <w15:presenceInfo w15:providerId="None" w15:userId="{safe_author}"/>',
                "</w15:person>",
            )
        )
        editor.append_to(people, entry)
1202
def _ensure_comment_relationships(self):
    """Ensure word/_rels/document.xml.rels has the comment relationships.

    If no relationship targeting comments.xml exists, appends relationships
    for comments.xml, commentsExtended.xml, commentsIds.xml and
    commentsExtensible.xml with consecutive IDs.
    """
    rels = self["word/_rels/document.xml.rels"]
    if self._has_relationship(rels, "comments.xml"):
        return

    container = rels.dom.documentElement
    tag = container.tagName  # type: ignore
    ns_prefix = tag.split(":")[0] + ":" if ":" in tag else ""
    # Drops the first three characters of the next rId and parses the rest
    # as the numeric part -- assumes the "rId<N>" form; TODO confirm.
    first_id = int(rels.get_next_rid()[3:])

    # (relationship type, target) in the order they are appended.
    comment_parts = (
        (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
            "comments.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2011/relationships/commentsExtended",
            "commentsExtended.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds",
            "commentsIds.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible",
            "commentsExtensible.xml",
        ),
    )

    for rel_id, (rel_type, part) in enumerate(comment_parts, start=first_id):
        rels.append_to(
            container,
            f'<{ns_prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{part}"/>',
        )
1242
def _ensure_comment_content_types(self):
    """Ensure [Content_Types].xml declares the comment parts.

    If no Override for /word/comments.xml exists, appends Override elements
    for comments, commentsExtended, commentsIds and commentsExtensible.
    """
    content_types = self["[Content_Types].xml"]
    if self._has_override(content_types, "/word/comments.xml"):
        return

    container = content_types.dom.documentElement

    # Each part lives at /word/<name>.xml and its content type ends in
    # "<name>+xml", so both strings are derived from the part name.
    part_names = (
        "comments",
        "commentsExtended",
        "commentsIds",
        "commentsExtensible",
    )
    for name in part_names:
        part_name = f"/word/{name}.xml"
        content_type = (
            "application/vnd.openxmlformats-officedocument"
            f".wordprocessingml.{name}+xml"
        )
        content_types.append_to(
            container,
            f'<Override PartName="{part_name}" ContentType="{content_type}"/>',
        )
1276 editor.append_to(root, override_xml)