# skills/skills/document-skills/docx/ooxml/scripts/validation/redlining.py at main

"""
Validator for tracked changes in Word documents.
"""
  4
import subprocess
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
  9
 10
 11class RedliningValidator:
 12    """Validator for tracked changes in Word documents."""
 13
 14    def __init__(self, unpacked_dir, original_docx, verbose=False):
 15        self.unpacked_dir = Path(unpacked_dir)
 16        self.original_docx = Path(original_docx)
 17        self.verbose = verbose
 18        self.namespaces = {
 19            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 20        }
 21
 22    def validate(self):
 23        """Main validation method that returns True if valid, False otherwise."""
 24        # Verify unpacked directory exists and has correct structure
 25        modified_file = self.unpacked_dir / "word" / "document.xml"
 26        if not modified_file.exists():
 27            print(f"FAILED - Modified document.xml not found at {modified_file}")
 28            return False
 29
 30        # First, check if there are any tracked changes by Claude to validate
 31        try:
 32            import xml.etree.ElementTree as ET
 33
 34            tree = ET.parse(modified_file)
 35            root = tree.getroot()
 36
 37            # Check for w:del or w:ins tags authored by Claude
 38            del_elements = root.findall(".//w:del", self.namespaces)
 39            ins_elements = root.findall(".//w:ins", self.namespaces)
 40
 41            # Filter to only include changes by Claude
 42            claude_del_elements = [
 43                elem
 44                for elem in del_elements
 45                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
 46            ]
 47            claude_ins_elements = [
 48                elem
 49                for elem in ins_elements
 50                if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
 51            ]
 52
 53            # Redlining validation is only needed if tracked changes by Claude have been used.
 54            if not claude_del_elements and not claude_ins_elements:
 55                if self.verbose:
 56                    print("PASSED - No tracked changes by Claude found.")
 57                return True
 58
 59        except Exception:
 60            # If we can't parse the XML, continue with full validation
 61            pass
 62
 63        # Create temporary directory for unpacking original docx
 64        with tempfile.TemporaryDirectory() as temp_dir:
 65            temp_path = Path(temp_dir)
 66
 67            # Unpack original docx
 68            try:
 69                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
 70                    zip_ref.extractall(temp_path)
 71            except Exception as e:
 72                print(f"FAILED - Error unpacking original docx: {e}")
 73                return False
 74
 75            original_file = temp_path / "word" / "document.xml"
 76            if not original_file.exists():
 77                print(
 78                    f"FAILED - Original document.xml not found in {self.original_docx}"
 79                )
 80                return False
 81
 82            # Parse both XML files using xml.etree.ElementTree for redlining validation
 83            try:
 84                import xml.etree.ElementTree as ET
 85
 86                modified_tree = ET.parse(modified_file)
 87                modified_root = modified_tree.getroot()
 88                original_tree = ET.parse(original_file)
 89                original_root = original_tree.getroot()
 90            except ET.ParseError as e:
 91                print(f"FAILED - Error parsing XML files: {e}")
 92                return False
 93
 94            # Remove Claude's tracked changes from both documents
 95            self._remove_claude_tracked_changes(original_root)
 96            self._remove_claude_tracked_changes(modified_root)
 97
 98            # Extract and compare text content
 99            modified_text = self._extract_text_content(modified_root)
100            original_text = self._extract_text_content(original_root)
101
102            if modified_text != original_text:
103                # Show detailed character-level differences for each paragraph
104                error_message = self._generate_detailed_diff(
105                    original_text, modified_text
106                )
107                print(error_message)
108                return False
109
110            if self.verbose:
111                print("PASSED - All changes by Claude are properly tracked")
112            return True
113
114    def _generate_detailed_diff(self, original_text, modified_text):
115        """Generate detailed word-level differences using git word diff."""
116        error_parts = [
117            "FAILED - Document text doesn't match after removing Claude's tracked changes",
118            "",
119            "Likely causes:",
120            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
121            "  2. Made edits without proper tracked changes",
122            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
123            "",
124            "For pre-redlined documents, use correct patterns:",
125            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
126            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
127            "",
128        ]
129
130        # Show git word diff
131        git_diff = self._get_git_word_diff(original_text, modified_text)
132        if git_diff:
133            error_parts.extend(["Differences:", "============", git_diff])
134        else:
135            error_parts.append("Unable to generate word diff (git not available)")
136
137        return "\n".join(error_parts)
138
139    def _get_git_word_diff(self, original_text, modified_text):
140        """Generate word diff using git with character-level precision."""
141        try:
142            with tempfile.TemporaryDirectory() as temp_dir:
143                temp_path = Path(temp_dir)
144
145                # Create two files
146                original_file = temp_path / "original.txt"
147                modified_file = temp_path / "modified.txt"
148
149                original_file.write_text(original_text, encoding="utf-8")
150                modified_file.write_text(modified_text, encoding="utf-8")
151
152                # Try character-level diff first for precise differences
153                result = subprocess.run(
154                    [
155                        "git",
156                        "diff",
157                        "--word-diff=plain",
158                        "--word-diff-regex=.",  # Character-by-character diff
159                        "-U0",  # Zero lines of context - show only changed lines
160                        "--no-index",
161                        str(original_file),
162                        str(modified_file),
163                    ],
164                    capture_output=True,
165                    text=True,
166                )
167
168                if result.stdout.strip():
169                    # Clean up the output - remove git diff header lines
170                    lines = result.stdout.split("\n")
171                    # Skip the header lines (diff --git, index, +++, ---, @@)
172                    content_lines = []
173                    in_content = False
174                    for line in lines:
175                        if line.startswith("@@"):
176                            in_content = True
177                            continue
178                        if in_content and line.strip():
179                            content_lines.append(line)
180
181                    if content_lines:
182                        return "\n".join(content_lines)
183
184                # Fallback to word-level diff if character-level is too verbose
185                result = subprocess.run(
186                    [
187                        "git",
188                        "diff",
189                        "--word-diff=plain",
190                        "-U0",  # Zero lines of context
191                        "--no-index",
192                        str(original_file),
193                        str(modified_file),
194                    ],
195                    capture_output=True,
196                    text=True,
197                )
198
199                if result.stdout.strip():
200                    lines = result.stdout.split("\n")
201                    content_lines = []
202                    in_content = False
203                    for line in lines:
204                        if line.startswith("@@"):
205                            in_content = True
206                            continue
207                        if in_content and line.strip():
208                            content_lines.append(line)
209                    return "\n".join(content_lines)
210
211        except (subprocess.CalledProcessError, FileNotFoundError, Exception):
212            # Git not available or other error, return None to use fallback
213            pass
214
215        return None
216
217    def _remove_claude_tracked_changes(self, root):
218        """Remove tracked changes authored by Claude from the XML root."""
219        ins_tag = f"{{{self.namespaces['w']}}}ins"
220        del_tag = f"{{{self.namespaces['w']}}}del"
221        author_attr = f"{{{self.namespaces['w']}}}author"
222
223        # Remove w:ins elements
224        for parent in root.iter():
225            to_remove = []
226            for child in parent:
227                if child.tag == ins_tag and child.get(author_attr) == "Claude":
228                    to_remove.append(child)
229            for elem in to_remove:
230                parent.remove(elem)
231
232        # Unwrap content in w:del elements where author is "Claude"
233        deltext_tag = f"{{{self.namespaces['w']}}}delText"
234        t_tag = f"{{{self.namespaces['w']}}}t"
235
236        for parent in root.iter():
237            to_process = []
238            for child in parent:
239                if child.tag == del_tag and child.get(author_attr) == "Claude":
240                    to_process.append((child, list(parent).index(child)))
241
242            # Process in reverse order to maintain indices
243            for del_elem, del_index in reversed(to_process):
244                # Convert w:delText to w:t before moving
245                for elem in del_elem.iter():
246                    if elem.tag == deltext_tag:
247                        elem.tag = t_tag
248
249                # Move all children of w:del to its parent before removing w:del
250                for child in reversed(list(del_elem)):
251                    parent.insert(del_index, child)
252                parent.remove(del_elem)
253
254    def _extract_text_content(self, root):
255        """Extract text content from Word XML, preserving paragraph structure.
256
257        Empty paragraphs are skipped to avoid false positives when tracked
258        insertions add only structural elements without text content.
259        """
260        p_tag = f"{{{self.namespaces['w']}}}p"
261        t_tag = f"{{{self.namespaces['w']}}}t"
262
263        paragraphs = []
264        for p_elem in root.findall(f".//{p_tag}"):
265            # Get all text elements within this paragraph
266            text_parts = []
267            for t_elem in p_elem.findall(f".//{t_tag}"):
268                if t_elem.text:
269                    text_parts.append(t_elem.text)
270            paragraph_text = "".join(text_parts)
271            # Skip empty paragraphs - they don't affect content validation
272            if paragraph_text:
273                paragraphs.append(paragraph_text)
274
275        return "\n".join(paragraphs)
276
277
if __name__ == "__main__":
    # This file only provides RedliningValidator for import; executing it as
    # a script is rejected outright.
    direct_run_message = "This module should not be run directly."
    raise RuntimeError(direct_run_message)