bash0 · vincedarley · Dec 10, 2025
diff --git a/cewe2pdf.py b/cewe2pdf.py
@@ -81,6 +81,7 @@
 from reportlab.lib.utils import ImageReader
 from reportlab.pdfgen import canvas
 from reportlab.platypus import Paragraph, Table
+from reportlab.lib.styles import ParagraphStyle
 # from reportlab.lib.styles import getSampleStyleSheet
 
 import numpy as np
@@ -600,7 +601,140 @@ def warnAndIgnoreEnabledDecorationShadow(decoration):
 
 def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pgno): # noqa: C901 (too complex)
     # note: it would be better to use proper html processing here
-    htmlxml = etree.XML(textTag.text)
+
+    # Preprocess text to fix CEWE bugs: merge duplicate style attributes
+    # CEWE sometimes generates invalid XML like: <li style="..." style="...">
+    # We need to merge these into a single style attribute
+    import re
+
+    def merge_duplicate_styles(match):
+        """Merge duplicate style attributes in a single tag."""
+        full_tag = match.group(0)  # e.g., '<li style="..." style="...">'
+
+        # Find all style="..." attributes in this specific tag
+        style_pattern = r'style="([^"]*)"'
+        styles = re.findall(style_pattern, full_tag)
+
+        if len(styles) <= 1:
+            # No duplicates, return unchanged
+            return full_tag
+
+        # Log warning about duplicate styles with context
+        # Extract tag name for context
+        tag_name_match = re.match(r'<(\w+)', full_tag)
+        tag_name = tag_name_match.group(1) if tag_name_match else 'unknown'
+
+        # Get position of this tag in the original text to show nearby text content
+        tag_pos = textTag.text.find(full_tag)
+        if tag_pos >= 0:
+            # Find some actual text content near this tag (not HTML tags)
+            # Look ahead after this tag for text content
+            search_start = tag_pos + len(full_tag)
+            search_end = min(len(textTag.text), search_start + 200)
+            nearby = textTag.text[search_start:search_end]
+            # Extract text between tags
+            text_content = re.sub(r'<[^>]*>', '', nearby)[:20].strip()
+            context = f"near text: '{text_content}'" if text_content else "at start/end"
+        else:
+            context = ""
+
+        logging.warning(f"Merging duplicate 'style' attributes in <{tag_name}> tag ({len(styles)} instances) {context}")
+        logging.warning(f"  Styles: {styles}")
+
+        # Merge all style values
+        merged_parts = []
+        for s in styles:
+            s = s.strip()
+            if s:
+                # Ensure ends with semicolon for proper CSS
+                if not s.endswith(';'):
+                    s += ';'
+                merged_parts.append(s)
+        merged_style = ' '.join(merged_parts).strip()
+
+        # Replace: keep first style="..." and remove all subsequent ones
+        # First, remove ALL style attributes
+        tag_without_styles = re.sub(style_pattern, '', full_tag)
+
+        # Then add the merged style back as the first attribute
+        # Find position after tag name to insert style
+        tag_name_match = re.match(r'(<\w+)(\s|>)', tag_without_styles)
+        if tag_name_match:
+            prefix = tag_name_match.group(1)  # e.g., '<li'
+            rest = tag_without_styles[len(prefix):]  # everything after tag name
+            return f'{prefix} style="{merged_style}"{rest}'
+
+        # Fallback: shouldn't reach here, but return original if parsing fails
+        return full_tag
+
+    # Process each opening tag, merging duplicate style attributes
+    text_content = re.sub(r'<\w+[^>]*>', merge_duplicate_styles, textTag.text)
+
+    # Validate that we haven't lost any actual text content (only fixed attributes)
+    # Strip all HTML tags and compare character counts
+    original_text_only = re.sub(r'<[^>]*>', '', textTag.text)
+    processed_text_only = re.sub(r'<[^>]*>', '', text_content)
+
+    if len(original_text_only) != len(processed_text_only):
+        logging.error("=" * 80)
+        logging.error("PREPROCESSING VALIDATION FAILED: Text content length changed!")
+        logging.error(f"Original text-only length: {len(original_text_only)}")
+        logging.error(f"Processed text-only length: {len(processed_text_only)}")
+        logging.error(f"Difference: {len(processed_text_only) - len(original_text_only)} characters")
+        logging.error("-" * 80)
+        logging.error("Original text-only content:")
+        logging.error(original_text_only)
+        logging.error("-" * 80)
+        logging.error("Processed text-only content:")
+        logging.error(processed_text_only)
+        logging.error("=" * 80)
+        raise ValueError("Text preprocessing corrupted content - text length mismatch")
+
+    try:
+        htmlxml = etree.XML(text_content)
+        # Log what we successfully parsed
+        body = htmlxml.find('.//body')
+        if body is not None:
+            # Log all direct children of body to see structure
+            body_children = list(body)
+        else:
+            logging.warning("No <body> tag found in parsed HTML!")
+    except etree.XMLSyntaxError as e:
+        # Log detailed error information for debugging XML parsing issues
+        logging.error("=" * 80)
+        logging.error("XML PARSING ERROR in text area")
+        logging.error(f"Error: {e}")
+        logging.error(f"Original text content ({len(textTag.text)} characters):")
+        logging.error(textTag.text)
+        logging.error("-" * 80)
+        logging.error(f"Preprocessed text content ({len(text_content)} characters):")
+        logging.error(text_content)
+        logging.error("-" * 80)
+
+        # Try to highlight the problematic portion based on column number
+        if hasattr(e, 'position') and e.position:
+            col = e.position[1] if len(e.position) > 1 else None
+        else:
+            # Try to extract column from error message (e.g., "column 3838")
+            import re
+            match = re.search(r'column (\d+)', str(e))
+            col = int(match.group(1)) if match else None
+
+        if col is not None:
+            # Show context around the error (30 chars before and after)
+            start = max(0, col - 30)
+            end = min(len(text_content), col + 30)
+            context = text_content[start:end]
+            marker_pos = min(30, col - start)
+
+            logging.error(f"Context around column {col} in preprocessed text:")
+            logging.error(f"  {context}")
+            logging.error(f"  {' ' * marker_pos}^ (error position)")
+
+        logging.error("=" * 80)
+        # Re-throw the error for now
+        raise
+
     body = htmlxml.find('.//body')
     bstyle = dict([kv.split(':') for kv in body.get('style').lstrip(' ').rstrip(';').split('; ')])
     try:
@@ -676,8 +810,15 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
     # Concatenating them to just one index entry seems to work in practice
     indexEntryText = None
 
+    # Track all direct children of body to validate we process everything
+    all_body_children = list(body)
+    unprocessed_children = set(all_body_children)  # Will remove elements as we process them
+
     htmlparas = body.findall(".//p")
+
     for p in htmlparas:
+        # Mark this paragraph as processed
+        unprocessed_children.discard(p)
         maxfs = 0  # cannot use the bodyfs as a default, there may not actually be any text at body size
         if p.get('align') == 'center':
             pdf_styleN.alignment = reportlab.lib.enums.TA_CENTER
@@ -703,6 +844,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
         finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight
 
         htmlspans = p.findall(".*")
+        logging.debug(f"Paragraph has {len(htmlspans)} child elements")
+        for child in htmlspans[:3]:  # Log first 3 to see what they are
+            logging.debug(f"  Child tag: {child.tag}")
         if len(htmlspans) < 1: # i.e. there are no spans, just a paragraph
             paragraphText = '<para autoLeading="max">'
             paragraphText, maxfs = AppendItemTextInStyle(paragraphText, p.text, p, pdf,
@@ -779,7 +923,7 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
                         paragraphText = AppendText(paragraphText, html.escape(span.tail))
 
                 else:
-                    logging.warning(f"Ignoring unhandled tag {item.tag}")
+                    logging.warning(f"Ignoring unhandled tag {item.tag} in text area (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")
 
             # try to create a paragraph with the current text and style. Catch errors.
             try:
@@ -790,6 +934,143 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
             except Exception:
                 logging.exception('Exception')
 
+    # Process <ul> (unordered list) elements - bulleted lists
+    htmllists = body.findall("ul")
+
+    for ul in htmllists:
+        # Mark this list as processed
+        unprocessed_children.discard(ul)
+
+        listitems = ul.findall("li")
+
+        for li in listitems:
+            maxfs = 0
+
+            # Create a copy of the style for this list item with hanging indent
+            list_styleN = ParagraphStyle('list_item', parent=pdf_styleN)
+            # Hanging indent: first line at 0, subsequent lines indented
+            # Calculate indent based on font size - approximately 2x the font size 
+            # accounts for bullet width + space
+            bullet_indent = bodyfs * 1.65  # Adjust multiplier if needed (1.5 - 2.5 range)
+            list_styleN.leftIndent = bullet_indent  # Where wrapped lines start
+            list_styleN.firstLineIndent = -bullet_indent/2  # Pull first line (with bullet) back halfway position 0
+            bullet_txt = '• '
+
+            # Check alignment (though lists are typically left-aligned)
+            if li.get('align') == 'center':
+                list_styleN.alignment = reportlab.lib.enums.TA_CENTER
+            elif li.get('align') == 'right':
+                list_styleN.alignment = reportlab.lib.enums.TA_RIGHT
+            elif li.get('align') == 'justify':
+                list_styleN.alignment = reportlab.lib.enums.TA_JUSTIFY
+            else:
+                list_styleN.alignment = reportlab.lib.enums.TA_LEFT
+
+            # Get line height from <li> style if present
+            pLineHeight = 1.0
+            liStyleAttribute = li.get('style')
+            if liStyleAttribute is not None:
+                liStyle = dict([kv.split(':') for kv in
+                    li.get('style').lstrip(' ').rstrip(';').split('; ')])
+                if 'line-height' in liStyle.keys():
+                    try:
+                        pLineHeight = floor(float(liStyle['line-height'].strip("%")))/100.0
+                    except: # noqa: E722
+                        logging.warning(f"Ignoring invalid list item line-height setting {liStyleAttribute}")
+            finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight
+
+            # Start paragraph - we'll add bullet inside the styled text
+            paragraphText = '<para autoLeading="max">'
+
+            # Check if there are child elements (spans, br, etc.)
+            lispans = li.findall(".*")
+
+            if len(lispans) < 1:
+                # Simple list item with just text, no spans
+                # Prepend bullet to the text so it gets styled
+                bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
+                paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+                    additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+                paragraphText += '</para>'
+                usefs = maxfs if maxfs > 0 else bodyfs
+                list_styleN.leading = usefs * finalLeadingFactor
+                pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
+            else:
+                # List item with spans and other formatting
+                bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
+                paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+                    additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+                paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+                    additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+                usefs = maxfs if maxfs > 0 else bodyfs
+                list_styleN.leading = usefs * finalLeadingFactor
+
+                # Process child elements (spans, br, etc.)
+                for item in lispans:
+                    if item.tag == 'br':
+                        br = item
+                        # For lists, we don't typically break into multiple paragraphs on <br>
+                        # Instead, insert a line break within the same paragraph
+                        paragraphText += '<br/>'
+                        if br.tail:
+                            paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, li, pdf,
+                                additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+
+                    elif item.tag == 'span':
+                        span = item
+                        spanfont, spanfs, spanweight, spanstyle = CollectFontInfo(span, pdf, additional_fonts, bodyfont, bodyfs, bweight)
+
+                        maxfs = max(maxfs, spanfs)
+
+                        paragraphText = AppendSpanStart(paragraphText, spanfont, spanfs, spanweight, spanstyle, bstyle)
+
+                        if span.text is not None:
+                            paragraphText = AppendText(paragraphText, html.escape(span.text))
+
+                        # Handle line breaks within spans
+                        brs = span.findall(".//br")
+                        if len(brs) > 0:
+                            paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)
+                            for br in brs:
+                                paragraphText += '<br/>'
+                                if br.tail:
+                                    paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, span, pdf,
+                                        additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+                        else:
+                            paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)
+
+                        if span.tail is not None:
+                            paragraphText = AppendText(paragraphText, html.escape(span.tail))
+
+                    else:
+                        logging.warning(f"Ignoring unhandled tag {item.tag} in list item (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")
+
+                # Finalize the list item paragraph
+                try:
+                    paragraphText += '</para>'
+                    usefs = maxfs if maxfs > 0 else bodyfs
+                    list_styleN.leading = usefs * finalLeadingFactor
+                    pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
+                except Exception:
+                    logging.exception('Exception')
+
+    # The <table> tag contains margin info, not actual content - mark it as processed
+    table = body.find('table')
+    if table is not None:
+        unprocessed_children.discard(table)
+
+    # Validate: warn about any body children that we didn't process
+    if unprocessed_children:
+        logging.warning("=" * 80)
+        logging.warning("TEXT CONTENT BEING SILENTLY IGNORED!")
+        logging.warning(f"Found {len(unprocessed_children)} unprocessed elements as direct children of <body>:")
+        for child in unprocessed_children:
+            child_text = ''.join(child.itertext())[:100]  # Get text content, first 100 chars
+            logging.warning(f"  Ignoring <{child.tag}> with {len(list(child))} children")
+            logging.warning(f"    Text content preview: {child_text}")
+            logging.warning(f"    XML: {etree.tostring(child, encoding='unicode')[:200]}...")
+        logging.warning("=" * 80)
+
     if indexEntryText:
         albumIndex.AddIndexEntry(pgno, indexEntryText)