diff --git a/cewe2pdf.py b/cewe2pdf.py
index bef8683..0f06301 100755
--- a/cewe2pdf.py
+++ b/cewe2pdf.py
@@ -81,6 +81,7 @@
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from reportlab.platypus import Paragraph, Table
+from reportlab.lib.styles import ParagraphStyle
# from reportlab.lib.styles import getSampleStyleSheet
import numpy as np
@@ -600,7 +601,140 @@ def warnAndIgnoreEnabledDecorationShadow(decoration):
def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pgno): # noqa: C901 (too complex)
# note: it would be better to use proper html processing here
- htmlxml = etree.XML(textTag.text)
+
+ # Preprocess text to fix CEWE bugs: merge duplicate style attributes
+ # CEWE sometimes generates invalid XML like: <span style="..." style="...">
+ # We need to merge these into a single style attribute
+ import re
+
+ def merge_duplicate_styles(match):
+ """Merge duplicate style attributes in a single tag."""
+ full_tag = match.group(0) # e.g., ''
+
+ # Find all style="..." attributes in this specific tag
+ style_pattern = r'style="([^"]*)"'
+ styles = re.findall(style_pattern, full_tag)
+
+ if len(styles) <= 1:
+ # No duplicates, return unchanged
+ return full_tag
+
+ # Log warning about duplicate styles with context
+ # Extract tag name for context
+ tag_name_match = re.match(r'<(\w+)', full_tag)
+ tag_name = tag_name_match.group(1) if tag_name_match else 'unknown'
+
+ # Get position of this tag in the original text to show nearby text content
+ tag_pos = textTag.text.find(full_tag)
+ if tag_pos >= 0:
+ # Find some actual text content near this tag (not HTML tags)
+ # Look ahead after this tag for text content
+ search_start = tag_pos + len(full_tag)
+ search_end = min(len(textTag.text), search_start + 200)
+ nearby = textTag.text[search_start:search_end]
+ # Extract text between tags
+ text_content = re.sub(r'<[^>]*>', '', nearby)[:20].strip()
+ context = f"near text: '{text_content}'" if text_content else "at start/end"
+ else:
+ context = ""
+
+ logging.warning(f"Merging duplicate 'style' attributes in <{tag_name}> tag ({len(styles)} instances) {context}")
+ logging.warning(f" Styles: {styles}")
+
+ # Merge all style values
+ merged_parts = []
+ for s in styles:
+ s = s.strip()
+ if s:
+ # Ensure ends with semicolon for proper CSS
+ if not s.endswith(';'):
+ s += ';'
+ merged_parts.append(s)
+ merged_style = ' '.join(merged_parts).strip()
+
+ # Replace: keep first style="..." and remove all subsequent ones
+ # First, remove ALL style attributes
+ tag_without_styles = re.sub(style_pattern, '', full_tag)
+
+ # Then add the merged style back as the first attribute
+ # Find position after tag name to insert style
+ tag_name_match = re.match(r'(<\w+)(\s|>)', tag_without_styles)
+ if tag_name_match:
+ prefix = tag_name_match.group(1) # e.g., ']*>', merge_duplicate_styles, textTag.text)
+
+ # Validate that we haven't lost any actual text content (only fixed attributes)
+ # Strip all HTML tags and compare character counts
+ original_text_only = re.sub(r'<[^>]*>', '', textTag.text)
+ processed_text_only = re.sub(r'<[^>]*>', '', text_content)
+
+ if len(original_text_only) != len(processed_text_only):
+ logging.error("=" * 80)
+ logging.error("PREPROCESSING VALIDATION FAILED: Text content length changed!")
+ logging.error(f"Original text-only length: {len(original_text_only)}")
+ logging.error(f"Processed text-only length: {len(processed_text_only)}")
+ logging.error(f"Difference: {len(processed_text_only) - len(original_text_only)} characters")
+ logging.error("-" * 80)
+ logging.error("Original text-only content:")
+ logging.error(original_text_only)
+ logging.error("-" * 80)
+ logging.error("Processed text-only content:")
+ logging.error(processed_text_only)
+ logging.error("=" * 80)
+ raise ValueError("Text preprocessing corrupted content - text length mismatch")
+
+ try:
+ htmlxml = etree.XML(text_content)
+ # Log what we successfully parsed
+ body = htmlxml.find('.//body')
+ if body is not None:
+ # Log all direct children of body to see structure
+ body_children = list(body)
+ else:
+ logging.warning("No tag found in parsed HTML!")
+ except etree.XMLSyntaxError as e:
+ # Log detailed error information for debugging XML parsing issues
+ logging.error("=" * 80)
+ logging.error("XML PARSING ERROR in text area")
+ logging.error(f"Error: {e}")
+ logging.error(f"Original text content ({len(textTag.text)} characters):")
+ logging.error(textTag.text)
+ logging.error("-" * 80)
+ logging.error(f"Preprocessed text content ({len(text_content)} characters):")
+ logging.error(text_content)
+ logging.error("-" * 80)
+
+ # Try to highlight the problematic portion based on column number
+ if hasattr(e, 'position') and e.position:
+ col = e.position[1] if len(e.position) > 1 else None
+ else:
+ # Try to extract column from error message (e.g., "column 3838")
+ import re
+ match = re.search(r'column (\d+)', str(e))
+ col = int(match.group(1)) if match else None
+
+ if col is not None:
+ # Show context around the error (30 chars before and after)
+ start = max(0, col - 30)
+ end = min(len(text_content), col + 30)
+ context = text_content[start:end]
+ marker_pos = min(30, col - start)
+
+ logging.error(f"Context around column {col} in preprocessed text:")
+ logging.error(f" {context}")
+ logging.error(f" {' ' * marker_pos}^ (error position)")
+
+ logging.error("=" * 80)
+ # Re-throw the error for now
+ raise
+
body = htmlxml.find('.//body')
bstyle = dict([kv.split(':') for kv in body.get('style').lstrip(' ').rstrip(';').split('; ')])
try:
@@ -676,8 +810,15 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
# Concatenating them to just one index entry seems to work in practice
indexEntryText = None
+ # Track all direct children of body to validate we process everything
+ all_body_children = list(body)
+ unprocessed_children = set(all_body_children) # Will remove elements as we process them
+
htmlparas = body.findall(".//p")
+
for p in htmlparas:
+ # Mark this paragraph as processed
+ unprocessed_children.discard(p)
maxfs = 0 # cannot use the bodyfs as a default, there may not actually be any text at body size
if p.get('align') == 'center':
pdf_styleN.alignment = reportlab.lib.enums.TA_CENTER
@@ -703,6 +844,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight
htmlspans = p.findall(".*")
+ logging.debug(f"Paragraph has {len(htmlspans)} child elements")
+ for child in htmlspans[:3]: # Log first 3 to see what they are
+ logging.debug(f" Child tag: {child.tag}")
if len(htmlspans) < 1: # i.e. there are no spans, just a paragraph
paragraphText = ''
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, p.text, p, pdf,
@@ -779,7 +923,7 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
paragraphText = AppendText(paragraphText, html.escape(span.tail))
else:
- logging.warning(f"Ignoring unhandled tag {item.tag}")
+ logging.warning(f"Ignoring unhandled tag {item.tag} in text area (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")
# try to create a paragraph with the current text and style. Catch errors.
try:
@@ -790,6 +934,143 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
except Exception:
logging.exception('Exception')
+ # Process <ul> (unordered list) elements - bulleted lists
+ htmllists = body.findall("ul")
+
+ for ul in htmllists:
+ # Mark this list as processed
+ unprocessed_children.discard(ul)
+
+ listitems = ul.findall("li")
+
+ for li in listitems:
+ maxfs = 0
+
+ # Create a copy of the style for this list item with hanging indent
+ list_styleN = ParagraphStyle('list_item', parent=pdf_styleN)
+ # Hanging indent: first line (with bullet) at half the indent, wrapped lines at the full indent
+ # Calculate indent based on font size - roughly 1.65x the font size,
+ # which accounts for bullet width + space
+ bullet_indent = bodyfs * 1.65 # Adjust multiplier if needed (1.5 - 2.5 range)
+ list_styleN.leftIndent = bullet_indent # Where wrapped lines start
+ list_styleN.firstLineIndent = -bullet_indent/2 # Pull first line (with bullet) back halfway position 0
+ bullet_txt = '• '
+
+ # Check alignment (though lists are typically left-aligned)
+ if li.get('align') == 'center':
+ list_styleN.alignment = reportlab.lib.enums.TA_CENTER
+ elif li.get('align') == 'right':
+ list_styleN.alignment = reportlab.lib.enums.TA_RIGHT
+ elif li.get('align') == 'justify':
+ list_styleN.alignment = reportlab.lib.enums.TA_JUSTIFY
+ else:
+ list_styleN.alignment = reportlab.lib.enums.TA_LEFT
+
+ # Get line height from <li> style if present
+ pLineHeight = 1.0
+ liStyleAttribute = li.get('style')
+ if liStyleAttribute is not None:
+ liStyle = dict([kv.split(':') for kv in
+ li.get('style').lstrip(' ').rstrip(';').split('; ')])
+ if 'line-height' in liStyle.keys():
+ try:
+ pLineHeight = floor(float(liStyle['line-height'].strip("%")))/100.0
+ except: # noqa: E722
+ logging.warning(f"Ignoring invalid list item line-height setting {liStyleAttribute}")
+ finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight
+
+ # Start paragraph - we'll add bullet inside the styled text
+ paragraphText = ''
+
+ # Check if there are child elements (spans, br, etc.)
+ lispans = li.findall(".*")
+
+ if len(lispans) < 1:
+ # Simple list item with just text, no spans
+ # Prepend bullet to the text so it gets styled
+ bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
+ paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+ additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+ paragraphText += ''
+ usefs = maxfs if maxfs > 0 else bodyfs
+ list_styleN.leading = usefs * finalLeadingFactor
+ pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
+ else:
+ # List item with spans and other formatting
+ bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
+ paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+ additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+ paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
+ additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+ usefs = maxfs if maxfs > 0 else bodyfs
+ list_styleN.leading = usefs * finalLeadingFactor
+
+ # Process child elements (spans, br, etc.)
+ for item in lispans:
+ if item.tag == 'br':
+ br = item
+ # For lists, we don't typically break into multiple paragraphs on <br>
+ # Instead, insert a line break within the same paragraph
+ paragraphText += '
'
+ if br.tail:
+ paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, li, pdf,
+ additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+
+ elif item.tag == 'span':
+ span = item
+ spanfont, spanfs, spanweight, spanstyle = CollectFontInfo(span, pdf, additional_fonts, bodyfont, bodyfs, bweight)
+
+ maxfs = max(maxfs, spanfs)
+
+ paragraphText = AppendSpanStart(paragraphText, spanfont, spanfs, spanweight, spanstyle, bstyle)
+
+ if span.text is not None:
+ paragraphText = AppendText(paragraphText, html.escape(span.text))
+
+ # Handle line breaks within spans
+ brs = span.findall(".//br")
+ if len(brs) > 0:
+ paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)
+ for br in brs:
+ paragraphText += '
'
+ if br.tail:
+ paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, span, pdf,
+ additional_fonts, bodyfont, bodyfs, bweight, bstyle)
+ else:
+ paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)
+
+ if span.tail is not None:
+ paragraphText = AppendText(paragraphText, html.escape(span.tail))
+
+ else:
+ logging.warning(f"Ignoring unhandled tag {item.tag} in list item (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")
+
+ # Finalize the list item paragraph
+ try:
+ paragraphText += '
'
+ usefs = maxfs if maxfs > 0 else bodyfs
+ list_styleN.leading = usefs * finalLeadingFactor
+ pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
+ except Exception:
+ logging.exception('Exception')
+
+ # The <table> tag contains margin info, not actual content - mark it as processed
+ table = body.find('table')
+ if table is not None:
+ unprocessed_children.discard(table)
+
+ # Validate: warn about any body children that we didn't process
+ if unprocessed_children:
+ logging.warning("=" * 80)
+ logging.warning("TEXT CONTENT BEING SILENTLY IGNORED!")
+ logging.warning(f"Found {len(unprocessed_children)} unprocessed elements as direct children of :")
+ for child in unprocessed_children:
+ child_text = ''.join(child.itertext())[:100] # Get text content, first 100 chars
+ logging.warning(f" Ignoring <{child.tag}> with {len(list(child))} children")
+ logging.warning(f" Text content preview: {child_text}")
+ logging.warning(f" XML: {etree.tostring(child, encoding='unicode')[:200]}...")
+ logging.warning("=" * 80)
+
if indexEntryText:
albumIndex.AddIndexEntry(pgno, indexEntryText)