main
1"""
2Validator for tracked changes in Word documents.
3"""
4
5import subprocess
6import tempfile
7import zipfile
8from pathlib import Path
9
10
class RedliningValidator:
    """Validator for tracked changes in Word documents.

    The validator compares the text of an unpacked (edited) document with the
    text of the original ``.docx`` after stripping every tracked change that
    is attributed to ``author``. If the two texts differ, some edit was made
    outside of that author's tracked changes.

    Args:
        unpacked_dir: Directory holding the unpacked, edited document. It must
            contain ``word/document.xml``.
        original_docx: Path to the original ``.docx`` archive.
        verbose: When true, ``PASSED`` messages are printed as well.
        author: Value of the ``w:author`` attribute that identifies the
            tracked changes to validate. Defaults to ``"Claude"``.
    """

    # Class-level default so that instances created without __init__ still
    # have a usable author.
    author = "Claude"

    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def validate(self):
        """Main validation method that returns True if valid, False otherwise.

        Failure reasons are printed with a ``FAILED - `` prefix; success
        messages are printed only when ``verbose`` is set.

        Returns:
            bool: True when the document contains no tracked changes by the
            author, or when the text of both documents is identical once the
            author's tracked changes are removed. False otherwise.
        """
        # Function-scope import: ElementTree is only needed by this method.
        import xml.etree.ElementTree as ET

        # Verify unpacked directory exists and has correct structure
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # Parse the modified document once; the same tree serves both the
        # quick "anything to validate?" check and the full comparison.
        try:
            modified_root = ET.parse(modified_file).getroot()
        except ET.ParseError as e:
            print(f"FAILED - Error parsing XML files: {e}")
            return False

        # Redlining validation is only needed if tracked changes by the
        # author have been used.
        if not self._has_author_tracked_changes(modified_root):
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        # Unpack the original docx into a temporary directory and parse its
        # document.xml; the directory is not needed once the tree is loaded.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                # Any unpacking problem is reported as a validation failure.
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            try:
                original_root = ET.parse(original_file).getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

        # Remove the author's tracked changes from both documents
        self._remove_claude_tracked_changes(original_root)
        self._remove_claude_tracked_changes(modified_root)

        # Extract and compare text content
        modified_text = self._extract_text_content(modified_root)
        original_text = self._extract_text_content(original_root)

        if modified_text != original_text:
            print(self._generate_detailed_diff(original_text, modified_text))
            return False

        if self.verbose:
            print(f"PASSED - All changes by {self.author} are properly tracked")
        return True

    def _has_author_tracked_changes(self, root):
        """Return True if any w:ins or w:del below ``root`` is by the author."""
        author_attr = f"{{{self.namespaces['w']}}}author"
        return any(
            elem.get(author_attr) == self.author
            for xpath in (".//w:del", ".//w:ins")
            for elem in root.findall(xpath, self.namespaces)
        )

    def _generate_detailed_diff(self, original_text, modified_text):
        """Build the failure message, including a git word diff if available.

        Args:
            original_text: Text extracted from the original document.
            modified_text: Text extracted from the modified document.

        Returns:
            str: Multi-line message that explains likely causes and, when git
            produced output, the differences between both texts.
        """
        error_parts = [
            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
            "",
            "Likely causes:",
            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
            "  2. Made edits without proper tracked changes",
            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        # Show git word diff
        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    def _get_git_word_diff(self, original_text, modified_text):
        """Generate a diff of both texts using ``git diff --word-diff``.

        A character-level diff is attempted first; if it yields no content, a
        regular word-level diff is tried.

        Args:
            original_text: Text extracted from the original document.
            modified_text: Text extracted from the modified document.

        Returns:
            str | None: The diff without git's header lines, or None when git
            could not be run or produced no usable output.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                # First pass: character-by-character; second pass: git's
                # default word splitting.
                for granularity in (["--word-diff-regex=."], []):
                    result = subprocess.run(
                        [
                            "git",
                            "diff",
                            "--no-color",  # keep "@@" detectable even if colour is forced
                            "--word-diff=plain",
                            *granularity,
                            "-U0",  # Zero lines of context - show only changed lines
                            "--no-index",
                            str(original_file),
                            str(modified_file),
                        ],
                        capture_output=True,
                        # The files are written as UTF-8, so decode git's
                        # output as UTF-8 instead of the locale encoding.
                        encoding="utf-8",
                        errors="replace",
                    )
                    diff_body = self._strip_diff_header(result.stdout)
                    if diff_body:
                        return diff_body

        except (OSError, subprocess.SubprocessError, UnicodeError):
            # The diff is diagnostic output only: if git is missing or the
            # temporary files cannot be handled, the caller falls back to a
            # generic message.
            pass

        return None

    @staticmethod
    def _strip_diff_header(diff_output):
        """Return the non-blank hunk lines of ``diff_output``.

        Everything before the first ``@@`` line (``diff --git``, ``index``,
        ``---``, ``+++``) and the ``@@`` lines themselves are dropped.
        """
        content_lines = []
        in_content = False
        for line in diff_output.split("\n"):
            if line.startswith("@@"):
                in_content = True
                continue
            if in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _remove_claude_tracked_changes(self, root):
        """Remove tracked changes authored by ``self.author`` from ``root``.

        The tree is modified in place: the author's ``w:ins`` elements are
        removed together with their content, and the author's ``w:del``
        elements are replaced by their children, with ``w:delText`` renamed
        to ``w:t`` so the deleted text counts as regular text again.

        Args:
            root: Root element of a parsed ``document.xml``.
        """
        w_ns = self.namespaces["w"]
        ins_tag = f"{{{w_ns}}}ins"
        del_tag = f"{{{w_ns}}}del"
        deltext_tag = f"{{{w_ns}}}delText"
        t_tag = f"{{{w_ns}}}t"
        author_attr = f"{{{w_ns}}}author"

        def is_own(elem, tag):
            return elem.tag == tag and elem.get(author_attr) == self.author

        # Both passes walk a snapshot of the tree: changing the structure
        # while Element.iter() is still running is undefined behaviour.

        # Remove w:ins elements
        for parent in list(root.iter()):
            for ins_elem in [child for child in parent if is_own(child, ins_tag)]:
                parent.remove(ins_elem)

        # Unwrap content in w:del elements
        for parent in list(root.iter()):
            own_deletions = [
                (index, child)
                for index, child in enumerate(parent)
                if is_own(child, del_tag)
            ]

            # Process in reverse order so earlier indices stay valid
            for index, del_elem in reversed(own_deletions):
                # Convert w:delText to w:t before moving
                for text_elem in list(del_elem.iter(deltext_tag)):
                    text_elem.tag = t_tag

                # Move all children of w:del to its parent, keeping their
                # order, then drop the now redundant w:del wrapper
                for child in reversed(list(del_elem)):
                    parent.insert(index, child)
                parent.remove(del_elem)

    def _extract_text_content(self, root):
        """Extract text content from Word XML, preserving paragraph structure.

        Empty paragraphs are skipped to avoid false positives when tracked
        insertions add only structural elements without text content.

        Args:
            root: Root element of a parsed ``document.xml``.

        Returns:
            str: The text of every non-empty ``w:p`` below ``root`` (the
            concatenation of its ``w:t`` descendants), joined by newlines.
        """
        p_tag = f"{{{self.namespaces['w']}}}p"
        t_tag = f"{{{self.namespaces['w']}}}t"

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            paragraph_text = "".join(
                t_elem.text for t_elem in p_elem.findall(f".//{t_tag}") if t_elem.text
            )
            # Skip empty paragraphs - they don't affect content validation
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)
277
# Running this file as a script is rejected explicitly with an error instead
# of silently doing nothing.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")