main
   1#!/usr/bin/env python3
   2"""
   3Extract structured text content from PowerPoint presentations.
   4
   5This module provides functionality to:
   6- Extract all text content from PowerPoint shapes
   7- Preserve paragraph formatting (alignment, bullets, fonts, spacing)
   8- Handle nested GroupShapes recursively with correct absolute positions
   9- Sort shapes by visual position on slides
  10- Filter out slide numbers and non-content placeholders
  11- Export to JSON with clean, structured data
  12
  13Classes:
  14    ParagraphData: Represents a text paragraph with formatting
  15    ShapeData: Represents a shape with position and text content
  16
  17Main Functions:
  18    extract_text_inventory: Extract all text from a presentation
  19    save_inventory: Save extracted data to JSON
  20
  21Usage:
  22    python inventory.py input.pptx output.json
  23"""
  24
  25import argparse
  26import json
  27import platform
  28import sys
  29from dataclasses import dataclass
  30from pathlib import Path
  31from typing import Any, Dict, List, Optional, Tuple, Union
  32
  33from PIL import Image, ImageDraw, ImageFont
  34from pptx import Presentation
  35from pptx.enum.text import PP_ALIGN
  36from pptx.shapes.base import BaseShape
  37
  38# Type aliases for cleaner signatures
  39JsonValue = Union[str, int, float, bool, None]
  40ParagraphDict = Dict[str, JsonValue]
  41ShapeDict = Dict[
  42    str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None]
  43]
  44InventoryData = Dict[
  45    str, Dict[str, "ShapeData"]
  46]  # Dict of slide_id -> {shape_id -> ShapeData}
  47InventoryDict = Dict[str, Dict[str, ShapeDict]]  # JSON-serializable inventory
  48
  49
  50def main():
  51    """Main entry point for command-line usage."""
  52    parser = argparse.ArgumentParser(
  53        description="Extract text inventory from PowerPoint with proper GroupShape support.",
  54        formatter_class=argparse.RawDescriptionHelpFormatter,
  55        epilog="""
  56Examples:
  57  python inventory.py presentation.pptx inventory.json
  58    Extracts text inventory with correct absolute positions for grouped shapes
  59
  60  python inventory.py presentation.pptx inventory.json --issues-only
  61    Extracts only text shapes that have overflow or overlap issues
  62
  63The output JSON includes:
  64  - All text content organized by slide and shape
  65  - Correct absolute positions for shapes in groups
  66  - Visual position and size in inches
  67  - Paragraph properties and formatting
  68  - Issue detection: text overflow and shape overlaps
  69        """,
  70    )
  71
  72    parser.add_argument("input", help="Input PowerPoint file (.pptx)")
  73    parser.add_argument("output", help="Output JSON file for inventory")
  74    parser.add_argument(
  75        "--issues-only",
  76        action="store_true",
  77        help="Include only text shapes that have overflow or overlap issues",
  78    )
  79
  80    args = parser.parse_args()
  81
  82    input_path = Path(args.input)
  83    if not input_path.exists():
  84        print(f"Error: Input file not found: {args.input}")
  85        sys.exit(1)
  86
  87    if not input_path.suffix.lower() == ".pptx":
  88        print("Error: Input must be a PowerPoint file (.pptx)")
  89        sys.exit(1)
  90
  91    try:
  92        print(f"Extracting text inventory from: {args.input}")
  93        if args.issues_only:
  94            print(
  95                "Filtering to include only text shapes with issues (overflow/overlap)"
  96            )
  97        inventory = extract_text_inventory(input_path, issues_only=args.issues_only)
  98
  99        output_path = Path(args.output)
 100        output_path.parent.mkdir(parents=True, exist_ok=True)
 101        save_inventory(inventory, output_path)
 102
 103        print(f"Output saved to: {args.output}")
 104
 105        # Report statistics
 106        total_slides = len(inventory)
 107        total_shapes = sum(len(shapes) for shapes in inventory.values())
 108        if args.issues_only:
 109            if total_shapes > 0:
 110                print(
 111                    f"Found {total_shapes} text elements with issues in {total_slides} slides"
 112                )
 113            else:
 114                print("No issues discovered")
 115        else:
 116            print(
 117                f"Found text in {total_slides} slides with {total_shapes} text elements"
 118            )
 119
 120    except Exception as e:
 121        print(f"Error processing presentation: {e}")
 122        import traceback
 123
 124        traceback.print_exc()
 125        sys.exit(1)
 126
 127
 128@dataclass
 129class ShapeWithPosition:
 130    """A shape with its absolute position on the slide."""
 131
 132    shape: BaseShape
 133    absolute_left: int  # in EMUs
 134    absolute_top: int  # in EMUs
 135
 136
 137class ParagraphData:
 138    """Data structure for paragraph properties extracted from a PowerPoint paragraph."""
 139
 140    def __init__(self, paragraph: Any):
 141        """Initialize from a PowerPoint paragraph object.
 142
 143        Args:
 144            paragraph: The PowerPoint paragraph object
 145        """
 146        self.text: str = paragraph.text.strip()
 147        self.bullet: bool = False
 148        self.level: Optional[int] = None
 149        self.alignment: Optional[str] = None
 150        self.space_before: Optional[float] = None
 151        self.space_after: Optional[float] = None
 152        self.font_name: Optional[str] = None
 153        self.font_size: Optional[float] = None
 154        self.bold: Optional[bool] = None
 155        self.italic: Optional[bool] = None
 156        self.underline: Optional[bool] = None
 157        self.color: Optional[str] = None
 158        self.theme_color: Optional[str] = None
 159        self.line_spacing: Optional[float] = None
 160
 161        # Check for bullet formatting
 162        if (
 163            hasattr(paragraph, "_p")
 164            and paragraph._p is not None
 165            and paragraph._p.pPr is not None
 166        ):
 167            pPr = paragraph._p.pPr
 168            ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
 169            if (
 170                pPr.find(f"{ns}buChar") is not None
 171                or pPr.find(f"{ns}buAutoNum") is not None
 172            ):
 173                self.bullet = True
 174                if hasattr(paragraph, "level"):
 175                    self.level = paragraph.level
 176
 177        # Add alignment if not LEFT (default)
 178        if hasattr(paragraph, "alignment") and paragraph.alignment is not None:
 179            alignment_map = {
 180                PP_ALIGN.CENTER: "CENTER",
 181                PP_ALIGN.RIGHT: "RIGHT",
 182                PP_ALIGN.JUSTIFY: "JUSTIFY",
 183            }
 184            if paragraph.alignment in alignment_map:
 185                self.alignment = alignment_map[paragraph.alignment]
 186
 187        # Add spacing properties if set
 188        if hasattr(paragraph, "space_before") and paragraph.space_before:
 189            self.space_before = paragraph.space_before.pt
 190        if hasattr(paragraph, "space_after") and paragraph.space_after:
 191            self.space_after = paragraph.space_after.pt
 192
 193        # Extract font properties from first run
 194        if paragraph.runs:
 195            first_run = paragraph.runs[0]
 196            if hasattr(first_run, "font"):
 197                font = first_run.font
 198                if font.name:
 199                    self.font_name = font.name
 200                if font.size:
 201                    self.font_size = font.size.pt
 202                if font.bold is not None:
 203                    self.bold = font.bold
 204                if font.italic is not None:
 205                    self.italic = font.italic
 206                if font.underline is not None:
 207                    self.underline = font.underline
 208
 209                # Handle color - both RGB and theme colors
 210                try:
 211                    # Try RGB color first
 212                    if font.color.rgb:
 213                        self.color = str(font.color.rgb)
 214                except (AttributeError, TypeError):
 215                    # Fall back to theme color
 216                    try:
 217                        if font.color.theme_color:
 218                            self.theme_color = font.color.theme_color.name
 219                    except (AttributeError, TypeError):
 220                        pass
 221
 222        # Add line spacing if set
 223        if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None:
 224            if hasattr(paragraph.line_spacing, "pt"):
 225                self.line_spacing = round(paragraph.line_spacing.pt, 2)
 226            else:
 227                # Multiplier - convert to points
 228                font_size = self.font_size if self.font_size else 12.0
 229                self.line_spacing = round(paragraph.line_spacing * font_size, 2)
 230
 231    def to_dict(self) -> ParagraphDict:
 232        """Convert to dictionary for JSON serialization, excluding None values."""
 233        result: ParagraphDict = {"text": self.text}
 234
 235        # Add optional fields only if they have values
 236        if self.bullet:
 237            result["bullet"] = self.bullet
 238        if self.level is not None:
 239            result["level"] = self.level
 240        if self.alignment:
 241            result["alignment"] = self.alignment
 242        if self.space_before is not None:
 243            result["space_before"] = self.space_before
 244        if self.space_after is not None:
 245            result["space_after"] = self.space_after
 246        if self.font_name:
 247            result["font_name"] = self.font_name
 248        if self.font_size is not None:
 249            result["font_size"] = self.font_size
 250        if self.bold is not None:
 251            result["bold"] = self.bold
 252        if self.italic is not None:
 253            result["italic"] = self.italic
 254        if self.underline is not None:
 255            result["underline"] = self.underline
 256        if self.color:
 257            result["color"] = self.color
 258        if self.theme_color:
 259            result["theme_color"] = self.theme_color
 260        if self.line_spacing is not None:
 261            result["line_spacing"] = self.line_spacing
 262
 263        return result
 264
 265
 266class ShapeData:
 267    """Data structure for shape properties extracted from a PowerPoint shape."""
 268
 269    @staticmethod
 270    def emu_to_inches(emu: int) -> float:
 271        """Convert EMUs (English Metric Units) to inches."""
 272        return emu / 914400.0
 273
 274    @staticmethod
 275    def inches_to_pixels(inches: float, dpi: int = 96) -> int:
 276        """Convert inches to pixels at given DPI."""
 277        return int(inches * dpi)
 278
 279    @staticmethod
 280    def get_font_path(font_name: str) -> Optional[str]:
 281        """Get the font file path for a given font name.
 282
 283        Args:
 284            font_name: Name of the font (e.g., 'Arial', 'Calibri')
 285
 286        Returns:
 287            Path to the font file, or None if not found
 288        """
 289        system = platform.system()
 290
 291        # Common font file variations to try
 292        font_variations = [
 293            font_name,
 294            font_name.lower(),
 295            font_name.replace(" ", ""),
 296            font_name.replace(" ", "-"),
 297        ]
 298
 299        # Define font directories and extensions by platform
 300        if system == "Darwin":  # macOS
 301            font_dirs = [
 302                "/System/Library/Fonts/",
 303                "/Library/Fonts/",
 304                "~/Library/Fonts/",
 305            ]
 306            extensions = [".ttf", ".otf", ".ttc", ".dfont"]
 307        else:  # Linux
 308            font_dirs = [
 309                "/usr/share/fonts/truetype/",
 310                "/usr/local/share/fonts/",
 311                "~/.fonts/",
 312            ]
 313            extensions = [".ttf", ".otf"]
 314
 315        # Try to find the font file
 316        from pathlib import Path
 317
 318        for font_dir in font_dirs:
 319            font_dir_path = Path(font_dir).expanduser()
 320            if not font_dir_path.exists():
 321                continue
 322
 323            # First try exact matches
 324            for variant in font_variations:
 325                for ext in extensions:
 326                    font_path = font_dir_path / f"{variant}{ext}"
 327                    if font_path.exists():
 328                        return str(font_path)
 329
 330            # Then try fuzzy matching - find files containing the font name
 331            try:
 332                for file_path in font_dir_path.iterdir():
 333                    if file_path.is_file():
 334                        file_name_lower = file_path.name.lower()
 335                        font_name_lower = font_name.lower().replace(" ", "")
 336                        if font_name_lower in file_name_lower and any(
 337                            file_name_lower.endswith(ext) for ext in extensions
 338                        ):
 339                            return str(file_path)
 340            except (OSError, PermissionError):
 341                continue
 342
 343        return None
 344
 345    @staticmethod
 346    def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]:
 347        """Get slide dimensions from slide object.
 348
 349        Args:
 350            slide: Slide object
 351
 352        Returns:
 353            Tuple of (width_emu, height_emu) or (None, None) if not found
 354        """
 355        try:
 356            prs = slide.part.package.presentation_part.presentation
 357            return prs.slide_width, prs.slide_height
 358        except (AttributeError, TypeError):
 359            return None, None
 360
 361    @staticmethod
 362    def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]:
 363        """Extract default font size from slide layout for a placeholder shape.
 364
 365        Args:
 366            shape: Placeholder shape
 367            slide_layout: Slide layout containing the placeholder definition
 368
 369        Returns:
 370            Default font size in points, or None if not found
 371        """
 372        try:
 373            if not hasattr(shape, "placeholder_format"):
 374                return None
 375
 376            shape_type = shape.placeholder_format.type  # type: ignore
 377            for layout_placeholder in slide_layout.placeholders:
 378                if layout_placeholder.placeholder_format.type == shape_type:
 379                    # Find first defRPr element with sz (size) attribute
 380                    for elem in layout_placeholder.element.iter():
 381                        if "defRPr" in elem.tag and (sz := elem.get("sz")):
 382                            return float(sz) / 100.0  # Convert EMUs to points
 383                    break
 384        except Exception:
 385            pass
 386        return None
 387
 388    def __init__(
 389        self,
 390        shape: BaseShape,
 391        absolute_left: Optional[int] = None,
 392        absolute_top: Optional[int] = None,
 393        slide: Optional[Any] = None,
 394    ):
 395        """Initialize from a PowerPoint shape object.
 396
 397        Args:
 398            shape: The PowerPoint shape object (should be pre-validated)
 399            absolute_left: Absolute left position in EMUs (for shapes in groups)
 400            absolute_top: Absolute top position in EMUs (for shapes in groups)
 401            slide: Optional slide object to get dimensions and layout information
 402        """
 403        self.shape = shape  # Store reference to original shape
 404        self.shape_id: str = ""  # Will be set after sorting
 405
 406        # Get slide dimensions from slide object
 407        self.slide_width_emu, self.slide_height_emu = (
 408            self.get_slide_dimensions(slide) if slide else (None, None)
 409        )
 410
 411        # Get placeholder type if applicable
 412        self.placeholder_type: Optional[str] = None
 413        self.default_font_size: Optional[float] = None
 414        if hasattr(shape, "is_placeholder") and shape.is_placeholder:  # type: ignore
 415            if shape.placeholder_format and shape.placeholder_format.type:  # type: ignore
 416                self.placeholder_type = (
 417                    str(shape.placeholder_format.type).split(".")[-1].split(" ")[0]  # type: ignore
 418                )
 419
 420                # Get default font size from layout
 421                if slide and hasattr(slide, "slide_layout"):
 422                    self.default_font_size = self.get_default_font_size(
 423                        shape, slide.slide_layout
 424                    )
 425
 426        # Get position information
 427        # Use absolute positions if provided (for shapes in groups), otherwise use shape's position
 428        left_emu = (
 429            absolute_left
 430            if absolute_left is not None
 431            else (shape.left if hasattr(shape, "left") else 0)
 432        )
 433        top_emu = (
 434            absolute_top
 435            if absolute_top is not None
 436            else (shape.top if hasattr(shape, "top") else 0)
 437        )
 438
 439        self.left: float = round(self.emu_to_inches(left_emu), 2)  # type: ignore
 440        self.top: float = round(self.emu_to_inches(top_emu), 2)  # type: ignore
 441        self.width: float = round(
 442            self.emu_to_inches(shape.width if hasattr(shape, "width") else 0),
 443            2,  # type: ignore
 444        )
 445        self.height: float = round(
 446            self.emu_to_inches(shape.height if hasattr(shape, "height") else 0),
 447            2,  # type: ignore
 448        )
 449
 450        # Store EMU positions for overflow calculations
 451        self.left_emu = left_emu
 452        self.top_emu = top_emu
 453        self.width_emu = shape.width if hasattr(shape, "width") else 0
 454        self.height_emu = shape.height if hasattr(shape, "height") else 0
 455
 456        # Calculate overflow status
 457        self.frame_overflow_bottom: Optional[float] = None
 458        self.slide_overflow_right: Optional[float] = None
 459        self.slide_overflow_bottom: Optional[float] = None
 460        self.overlapping_shapes: Dict[
 461            str, float
 462        ] = {}  # Dict of shape_id -> overlap area in sq inches
 463        self.warnings: List[str] = []
 464        self._estimate_frame_overflow()
 465        self._calculate_slide_overflow()
 466        self._detect_bullet_issues()
 467
 468    @property
 469    def paragraphs(self) -> List[ParagraphData]:
 470        """Calculate paragraphs from the shape's text frame."""
 471        if not self.shape or not hasattr(self.shape, "text_frame"):
 472            return []
 473
 474        paragraphs = []
 475        for paragraph in self.shape.text_frame.paragraphs:  # type: ignore
 476            if paragraph.text.strip():
 477                paragraphs.append(ParagraphData(paragraph))
 478        return paragraphs
 479
 480    def _get_default_font_size(self) -> int:
 481        """Get default font size from theme text styles or use conservative default."""
 482        try:
 483            if not (
 484                hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout")
 485            ):
 486                return 14
 487
 488            slide_master = self.shape.part.slide_layout.slide_master  # type: ignore
 489            if not hasattr(slide_master, "element"):
 490                return 14
 491
 492            # Determine theme style based on placeholder type
 493            style_name = "bodyStyle"  # Default
 494            if self.placeholder_type and "TITLE" in self.placeholder_type:
 495                style_name = "titleStyle"
 496
 497            # Find font size in theme styles
 498            for child in slide_master.element.iter():
 499                tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
 500                if tag == style_name:
 501                    for elem in child.iter():
 502                        if "sz" in elem.attrib:
 503                            return int(elem.attrib["sz"]) // 100
 504        except Exception:
 505            pass
 506
 507        return 14  # Conservative default for body text
 508
 509    def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]:
 510        """Get usable width and height in pixels after accounting for margins."""
 511        # Default PowerPoint margins in inches
 512        margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1}
 513
 514        # Override with actual margins if set
 515        if hasattr(text_frame, "margin_top") and text_frame.margin_top:
 516            margins["top"] = self.emu_to_inches(text_frame.margin_top)
 517        if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom:
 518            margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom)
 519        if hasattr(text_frame, "margin_left") and text_frame.margin_left:
 520            margins["left"] = self.emu_to_inches(text_frame.margin_left)
 521        if hasattr(text_frame, "margin_right") and text_frame.margin_right:
 522            margins["right"] = self.emu_to_inches(text_frame.margin_right)
 523
 524        # Calculate usable area
 525        usable_width = self.width - margins["left"] - margins["right"]
 526        usable_height = self.height - margins["top"] - margins["bottom"]
 527
 528        # Convert to pixels
 529        return (
 530            self.inches_to_pixels(usable_width),
 531            self.inches_to_pixels(usable_height),
 532        )
 533
 534    def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]:
 535        """Wrap a single line of text to fit within max_width_px."""
 536        if not line:
 537            return [""]
 538
 539        # Use textlength for efficient width calculation
 540        if draw.textlength(line, font=font) <= max_width_px:
 541            return [line]
 542
 543        # Need to wrap - split into words
 544        wrapped = []
 545        words = line.split(" ")
 546        current_line = ""
 547
 548        for word in words:
 549            test_line = current_line + (" " if current_line else "") + word
 550            if draw.textlength(test_line, font=font) <= max_width_px:
 551                current_line = test_line
 552            else:
 553                if current_line:
 554                    wrapped.append(current_line)
 555                current_line = word
 556
 557        if current_line:
 558            wrapped.append(current_line)
 559
 560        return wrapped
 561
 562    def _estimate_frame_overflow(self) -> None:
 563        """Estimate if text overflows the shape bounds using PIL text measurement."""
 564        if not self.shape or not hasattr(self.shape, "text_frame"):
 565            return
 566
 567        text_frame = self.shape.text_frame  # type: ignore
 568        if not text_frame or not text_frame.paragraphs:
 569            return
 570
 571        # Get usable dimensions after accounting for margins
 572        usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame)
 573        if usable_width_px <= 0 or usable_height_px <= 0:
 574            return
 575
 576        # Set up PIL for text measurement
 577        dummy_img = Image.new("RGB", (1, 1))
 578        draw = ImageDraw.Draw(dummy_img)
 579
 580        # Get default font size from placeholder or use conservative estimate
 581        default_font_size = self._get_default_font_size()
 582
 583        # Calculate total height of all paragraphs
 584        total_height_px = 0
 585
 586        for para_idx, paragraph in enumerate(text_frame.paragraphs):
 587            if not paragraph.text.strip():
 588                continue
 589
 590            para_data = ParagraphData(paragraph)
 591
 592            # Load font for this paragraph
 593            font_name = para_data.font_name or "Arial"
 594            font_size = int(para_data.font_size or default_font_size)
 595
 596            font = None
 597            font_path = self.get_font_path(font_name)
 598            if font_path:
 599                try:
 600                    font = ImageFont.truetype(font_path, size=font_size)
 601                except Exception:
 602                    font = ImageFont.load_default()
 603            else:
 604                font = ImageFont.load_default()
 605
 606            # Wrap all lines in this paragraph
 607            all_wrapped_lines = []
 608            for line in paragraph.text.split("\n"):
 609                wrapped = self._wrap_text_line(line, usable_width_px, draw, font)
 610                all_wrapped_lines.extend(wrapped)
 611
 612            if all_wrapped_lines:
 613                # Calculate line height
 614                if para_data.line_spacing:
 615                    # Custom line spacing explicitly set
 616                    line_height_px = para_data.line_spacing * 96 / 72
 617                else:
 618                    # PowerPoint default single spacing (1.0x font size)
 619                    line_height_px = font_size * 96 / 72
 620
 621                # Add space_before (except first paragraph)
 622                if para_idx > 0 and para_data.space_before:
 623                    total_height_px += para_data.space_before * 96 / 72
 624
 625                # Add paragraph text height
 626                total_height_px += len(all_wrapped_lines) * line_height_px
 627
 628                # Add space_after
 629                if para_data.space_after:
 630                    total_height_px += para_data.space_after * 96 / 72
 631
 632        # Check for overflow (ignore negligible overflows <= 0.05")
 633        if total_height_px > usable_height_px:
 634            overflow_px = total_height_px - usable_height_px
 635            overflow_inches = round(overflow_px / 96.0, 2)
 636            if overflow_inches > 0.05:  # Only report significant overflows
 637                self.frame_overflow_bottom = overflow_inches
 638
 639    def _calculate_slide_overflow(self) -> None:
 640        """Calculate if shape overflows the slide boundaries."""
 641        if self.slide_width_emu is None or self.slide_height_emu is None:
 642            return
 643
 644        # Check right overflow (ignore negligible overflows <= 0.01")
 645        right_edge_emu = self.left_emu + self.width_emu
 646        if right_edge_emu > self.slide_width_emu:
 647            overflow_emu = right_edge_emu - self.slide_width_emu
 648            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
 649            if overflow_inches > 0.01:  # Only report significant overflows
 650                self.slide_overflow_right = overflow_inches
 651
 652        # Check bottom overflow (ignore negligible overflows <= 0.01")
 653        bottom_edge_emu = self.top_emu + self.height_emu
 654        if bottom_edge_emu > self.slide_height_emu:
 655            overflow_emu = bottom_edge_emu - self.slide_height_emu
 656            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
 657            if overflow_inches > 0.01:  # Only report significant overflows
 658                self.slide_overflow_bottom = overflow_inches
 659
 660    def _detect_bullet_issues(self) -> None:
 661        """Detect bullet point formatting issues in paragraphs."""
 662        if not self.shape or not hasattr(self.shape, "text_frame"):
 663            return
 664
 665        text_frame = self.shape.text_frame  # type: ignore
 666        if not text_frame or not text_frame.paragraphs:
 667            return
 668
 669        # Common bullet symbols that indicate manual bullets
 670        bullet_symbols = ["", "", ""]
 671
 672        for paragraph in text_frame.paragraphs:
 673            text = paragraph.text.strip()
 674            # Check for manual bullet symbols
 675            if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols):
 676                self.warnings.append(
 677                    "manual_bullet_symbol: use proper bullet formatting"
 678                )
 679                break
 680
 681    @property
 682    def has_any_issues(self) -> bool:
 683        """Check if shape has any issues (overflow, overlap, or warnings)."""
 684        return (
 685            self.frame_overflow_bottom is not None
 686            or self.slide_overflow_right is not None
 687            or self.slide_overflow_bottom is not None
 688            or len(self.overlapping_shapes) > 0
 689            or len(self.warnings) > 0
 690        )
 691
 692    def to_dict(self) -> ShapeDict:
 693        """Convert to dictionary for JSON serialization."""
 694        result: ShapeDict = {
 695            "left": self.left,
 696            "top": self.top,
 697            "width": self.width,
 698            "height": self.height,
 699        }
 700
 701        # Add optional fields if present
 702        if self.placeholder_type:
 703            result["placeholder_type"] = self.placeholder_type
 704
 705        if self.default_font_size:
 706            result["default_font_size"] = self.default_font_size
 707
 708        # Add overflow information only if there is overflow
 709        overflow_data = {}
 710
 711        # Add frame overflow if present
 712        if self.frame_overflow_bottom is not None:
 713            overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom}
 714
 715        # Add slide overflow if present
 716        slide_overflow = {}
 717        if self.slide_overflow_right is not None:
 718            slide_overflow["overflow_right"] = self.slide_overflow_right
 719        if self.slide_overflow_bottom is not None:
 720            slide_overflow["overflow_bottom"] = self.slide_overflow_bottom
 721        if slide_overflow:
 722            overflow_data["slide"] = slide_overflow
 723
 724        # Only add overflow field if there is overflow
 725        if overflow_data:
 726            result["overflow"] = overflow_data
 727
 728        # Add overlap field if there are overlapping shapes
 729        if self.overlapping_shapes:
 730            result["overlap"] = {"overlapping_shapes": self.overlapping_shapes}
 731
 732        # Add warnings field if there are warnings
 733        if self.warnings:
 734            result["warnings"] = self.warnings
 735
 736        # Add paragraphs after placeholder_type
 737        result["paragraphs"] = [para.to_dict() for para in self.paragraphs]
 738
 739        return result
 740
 741
 742def is_valid_shape(shape: BaseShape) -> bool:
 743    """Check if a shape contains meaningful text content."""
 744    # Must have a text frame with content
 745    if not hasattr(shape, "text_frame") or not shape.text_frame:  # type: ignore
 746        return False
 747
 748    text = shape.text_frame.text.strip()  # type: ignore
 749    if not text:
 750        return False
 751
 752    # Skip slide numbers and numeric footers
 753    if hasattr(shape, "is_placeholder") and shape.is_placeholder:  # type: ignore
 754        if shape.placeholder_format and shape.placeholder_format.type:  # type: ignore
 755            placeholder_type = (
 756                str(shape.placeholder_format.type).split(".")[-1].split(" ")[0]  # type: ignore
 757            )
 758            if placeholder_type == "SLIDE_NUMBER":
 759                return False
 760            if placeholder_type == "FOOTER" and text.isdigit():
 761                return False
 762
 763    return True
 764
 765
 766def collect_shapes_with_absolute_positions(
 767    shape: BaseShape, parent_left: int = 0, parent_top: int = 0
 768) -> List[ShapeWithPosition]:
 769    """Recursively collect all shapes with valid text, calculating absolute positions.
 770
 771    For shapes within groups, their positions are relative to the group.
 772    This function calculates the absolute position on the slide by accumulating
 773    parent group offsets.
 774
 775    Args:
 776        shape: The shape to process
 777        parent_left: Accumulated left offset from parent groups (in EMUs)
 778        parent_top: Accumulated top offset from parent groups (in EMUs)
 779
 780    Returns:
 781        List of ShapeWithPosition objects with absolute positions
 782    """
 783    if hasattr(shape, "shapes"):  # GroupShape
 784        result = []
 785        # Get this group's position
 786        group_left = shape.left if hasattr(shape, "left") else 0
 787        group_top = shape.top if hasattr(shape, "top") else 0
 788
 789        # Calculate absolute position for this group
 790        abs_group_left = parent_left + group_left
 791        abs_group_top = parent_top + group_top
 792
 793        # Process children with accumulated offsets
 794        for child in shape.shapes:  # type: ignore
 795            result.extend(
 796                collect_shapes_with_absolute_positions(
 797                    child, abs_group_left, abs_group_top
 798                )
 799            )
 800        return result
 801
 802    # Regular shape - check if it has valid text
 803    if is_valid_shape(shape):
 804        # Calculate absolute position
 805        shape_left = shape.left if hasattr(shape, "left") else 0
 806        shape_top = shape.top if hasattr(shape, "top") else 0
 807
 808        return [
 809            ShapeWithPosition(
 810                shape=shape,
 811                absolute_left=parent_left + shape_left,
 812                absolute_top=parent_top + shape_top,
 813            )
 814        ]
 815
 816    return []
 817
 818
 819def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]:
 820    """Sort shapes by visual position (top-to-bottom, left-to-right).
 821
 822    Shapes within 0.5 inches vertically are considered on the same row.
 823    """
 824    if not shapes:
 825        return shapes
 826
 827    # Sort by top position first
 828    shapes = sorted(shapes, key=lambda s: (s.top, s.left))
 829
 830    # Group shapes by row (within 0.5 inches vertically)
 831    result = []
 832    row = [shapes[0]]
 833    row_top = shapes[0].top
 834
 835    for shape in shapes[1:]:
 836        if abs(shape.top - row_top) <= 0.5:
 837            row.append(shape)
 838        else:
 839            # Sort current row by left position and add to result
 840            result.extend(sorted(row, key=lambda s: s.left))
 841            row = [shape]
 842            row_top = shape.top
 843
 844    # Don't forget the last row
 845    result.extend(sorted(row, key=lambda s: s.left))
 846    return result
 847
 848
 849def calculate_overlap(
 850    rect1: Tuple[float, float, float, float],
 851    rect2: Tuple[float, float, float, float],
 852    tolerance: float = 0.05,
 853) -> Tuple[bool, float]:
 854    """Calculate if and how much two rectangles overlap.
 855
 856    Args:
 857        rect1: (left, top, width, height) of first rectangle in inches
 858        rect2: (left, top, width, height) of second rectangle in inches
 859        tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05")
 860
 861    Returns:
 862        Tuple of (overlaps, overlap_area) where:
 863        - overlaps: True if rectangles overlap by more than tolerance
 864        - overlap_area: Area of overlap in square inches
 865    """
 866    left1, top1, w1, h1 = rect1
 867    left2, top2, w2, h2 = rect2
 868
 869    # Calculate overlap dimensions
 870    overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2)
 871    overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2)
 872
 873    # Check if there's meaningful overlap (more than tolerance)
 874    if overlap_width > tolerance and overlap_height > tolerance:
 875        # Calculate overlap area in square inches
 876        overlap_area = overlap_width * overlap_height
 877        return True, round(overlap_area, 2)
 878
 879    return False, 0
 880
 881
 882def detect_overlaps(shapes: List[ShapeData]) -> None:
 883    """Detect overlapping shapes and update their overlapping_shapes dictionaries.
 884
 885    This function requires each ShapeData to have its shape_id already set.
 886    It modifies the shapes in-place, adding shape IDs with overlap areas in square inches.
 887
 888    Args:
 889        shapes: List of ShapeData objects with shape_id attributes set
 890    """
 891    n = len(shapes)
 892
 893    # Compare each pair of shapes
 894    for i in range(n):
 895        for j in range(i + 1, n):
 896            shape1 = shapes[i]
 897            shape2 = shapes[j]
 898
 899            # Ensure shape IDs are set
 900            assert shape1.shape_id, f"Shape at index {i} has no shape_id"
 901            assert shape2.shape_id, f"Shape at index {j} has no shape_id"
 902
 903            rect1 = (shape1.left, shape1.top, shape1.width, shape1.height)
 904            rect2 = (shape2.left, shape2.top, shape2.width, shape2.height)
 905
 906            overlaps, overlap_area = calculate_overlap(rect1, rect2)
 907
 908            if overlaps:
 909                # Add shape IDs with overlap area in square inches
 910                shape1.overlapping_shapes[shape2.shape_id] = overlap_area
 911                shape2.overlapping_shapes[shape1.shape_id] = overlap_area
 912
 913
 914def extract_text_inventory(
 915    pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False
 916) -> InventoryData:
 917    """Extract text content from all slides in a PowerPoint presentation.
 918
 919    Args:
 920        pptx_path: Path to the PowerPoint file
 921        prs: Optional Presentation object to use. If not provided, will load from pptx_path.
 922        issues_only: If True, only include shapes that have overflow or overlap issues
 923
 924    Returns a nested dictionary: {slide-N: {shape-N: ShapeData}}
 925    Shapes are sorted by visual position (top-to-bottom, left-to-right).
 926    The ShapeData objects contain the full shape information and can be
 927    converted to dictionaries for JSON serialization using to_dict().
 928    """
 929    if prs is None:
 930        prs = Presentation(str(pptx_path))
 931    inventory: InventoryData = {}
 932
 933    for slide_idx, slide in enumerate(prs.slides):
 934        # Collect all valid shapes from this slide with absolute positions
 935        shapes_with_positions = []
 936        for shape in slide.shapes:  # type: ignore
 937            shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape))
 938
 939        if not shapes_with_positions:
 940            continue
 941
 942        # Convert to ShapeData with absolute positions and slide reference
 943        shape_data_list = [
 944            ShapeData(
 945                swp.shape,
 946                swp.absolute_left,
 947                swp.absolute_top,
 948                slide,
 949            )
 950            for swp in shapes_with_positions
 951        ]
 952
 953        # Sort by visual position and assign stable IDs in one step
 954        sorted_shapes = sort_shapes_by_position(shape_data_list)
 955        for idx, shape_data in enumerate(sorted_shapes):
 956            shape_data.shape_id = f"shape-{idx}"
 957
 958        # Detect overlaps using the stable shape IDs
 959        if len(sorted_shapes) > 1:
 960            detect_overlaps(sorted_shapes)
 961
 962        # Filter for issues only if requested (after overlap detection)
 963        if issues_only:
 964            sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues]
 965
 966        if not sorted_shapes:
 967            continue
 968
 969        # Create slide inventory using the stable shape IDs
 970        inventory[f"slide-{slide_idx}"] = {
 971            shape_data.shape_id: shape_data for shape_data in sorted_shapes
 972        }
 973
 974    return inventory
 975
 976
 977def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict:
 978    """Extract text inventory and return as JSON-serializable dictionaries.
 979
 980    This is a convenience wrapper around extract_text_inventory that returns
 981    dictionaries instead of ShapeData objects, useful for testing and direct
 982    JSON serialization.
 983
 984    Args:
 985        pptx_path: Path to the PowerPoint file
 986        issues_only: If True, only include shapes that have overflow or overlap issues
 987
 988    Returns:
 989        Nested dictionary with all data serialized for JSON
 990    """
 991    inventory = extract_text_inventory(pptx_path, issues_only=issues_only)
 992
 993    # Convert ShapeData objects to dictionaries
 994    dict_inventory: InventoryDict = {}
 995    for slide_key, shapes in inventory.items():
 996        dict_inventory[slide_key] = {
 997            shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items()
 998        }
 999
1000    return dict_inventory
1001
1002
def save_inventory(inventory: InventoryData, output_path: Path) -> None:
    """Write an inventory to a JSON file.

    Each ShapeData object is converted with ``to_dict()`` and the result is
    written as UTF-8 JSON, indented by two spaces, with non-ASCII characters
    written as-is rather than escaped.

    Args:
        inventory: Mapping of slide key to {shape key: ShapeData}
        output_path: Destination file; it is opened in write mode
    """
    serializable: InventoryDict = {
        slide_key: {shape_key: shape.to_dict() for shape_key, shape in slide_shapes.items()}
        for slide_key, slide_shapes in inventory.items()
    }

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(serializable, out_file, indent=2, ensure_ascii=False)
1017
1018
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()