diff --git a/cewe2pdf.py b/cewe2pdf.py index bef8683..0f06301 100755 --- a/cewe2pdf.py +++ b/cewe2pdf.py @@ -81,6 +81,7 @@ from reportlab.lib.utils import ImageReader from reportlab.pdfgen import canvas from reportlab.platypus import Paragraph, Table +from reportlab.lib.styles import ParagraphStyle # from reportlab.lib.styles import getSampleStyleSheet import numpy as np @@ -600,7 +601,140 @@ def warnAndIgnoreEnabledDecorationShadow(decoration): def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pgno): # noqa: C901 (too complex) # note: it would be better to use proper html processing here - htmlxml = etree.XML(textTag.text) + + # Preprocess text to fix CEWE bugs: merge duplicate style attributes + # CEWE sometimes generates invalid XML like:
  • + # We need to merge these into a single style attribute + import re + + def merge_duplicate_styles(match): + """Merge duplicate style attributes in a single tag.""" + full_tag = match.group(0) # e.g., '
  • ' + + # Find all style="..." attributes in this specific tag + style_pattern = r'style="([^"]*)"' + styles = re.findall(style_pattern, full_tag) + + if len(styles) <= 1: + # No duplicates, return unchanged + return full_tag + + # Log warning about duplicate styles with context + # Extract tag name for context + tag_name_match = re.match(r'<(\w+)', full_tag) + tag_name = tag_name_match.group(1) if tag_name_match else 'unknown' + + # Get position of this tag in the original text to show nearby text content + tag_pos = textTag.text.find(full_tag) + if tag_pos >= 0: + # Find some actual text content near this tag (not HTML tags) + # Look ahead after this tag for text content + search_start = tag_pos + len(full_tag) + search_end = min(len(textTag.text), search_start + 200) + nearby = textTag.text[search_start:search_end] + # Extract text between tags + text_content = re.sub(r'<[^>]*>', '', nearby)[:20].strip() + context = f"near text: '{text_content}'" if text_content else "at start/end" + else: + context = "" + + logging.warning(f"Merging duplicate 'style' attributes in <{tag_name}> tag ({len(styles)} instances) {context}") + logging.warning(f" Styles: {styles}") + + # Merge all style values + merged_parts = [] + for s in styles: + s = s.strip() + if s: + # Ensure ends with semicolon for proper CSS + if not s.endswith(';'): + s += ';' + merged_parts.append(s) + merged_style = ' '.join(merged_parts).strip() + + # Replace: keep first style="..." and remove all subsequent ones + # First, remove ALL style attributes + tag_without_styles = re.sub(style_pattern, '', full_tag) + + # Then add the merged style back as the first attribute + # Find position after tag name to insert style + tag_name_match = re.match(r'(<\w+)(\s|>)', tag_without_styles) + if tag_name_match: + prefix = tag_name_match.group(1) # e.g., ']*>', merge_duplicate_styles, textTag.text) + + # Validate that we haven't lost any actual text content (only fixed attributes) + # Strip all HTML tags and compare character counts + original_text_only = re.sub(r'<[^>]*>', '', textTag.text) + processed_text_only = re.sub(r'<[^>]*>', '', text_content) + + if len(original_text_only) != len(processed_text_only): + logging.error("=" * 80) + logging.error("PREPROCESSING VALIDATION FAILED: Text content length changed!") + logging.error(f"Original text-only length: {len(original_text_only)}") + logging.error(f"Processed text-only length: {len(processed_text_only)}") + logging.error(f"Difference: {len(processed_text_only) - len(original_text_only)} characters") + logging.error("-" * 80) + logging.error("Original text-only content:") + logging.error(original_text_only) + logging.error("-" * 80) + logging.error("Processed text-only content:") + logging.error(processed_text_only) + logging.error("=" * 80) + raise ValueError("Text preprocessing corrupted content - text length mismatch") + + try: + htmlxml = etree.XML(text_content) + # Log what we successfully parsed + body = htmlxml.find('.//body') + if body is not None: + # Log all direct children of body to see structure + body_children = list(body) + else: + logging.warning("No tag found in parsed HTML!") + except etree.XMLSyntaxError as e: + # Log detailed error information for debugging XML parsing issues + logging.error("=" * 80) + logging.error("XML PARSING ERROR in text area") + logging.error(f"Error: {e}") + logging.error(f"Original text content ({len(textTag.text)} characters):") + logging.error(textTag.text) + logging.error("-" * 80) + logging.error(f"Preprocessed text content ({len(text_content)} characters):") + logging.error(text_content) + logging.error("-" * 80) + + # Try to highlight the problematic portion based on column number + if hasattr(e, 'position') and e.position: + col = e.position[1] if len(e.position) > 1 else None + else: + # Try to extract column from error message (e.g., "column 3838") + import re + match = re.search(r'column (\d+)', str(e)) + col = int(match.group(1)) if match else None + + if col is not None: + # Show context around the error (30 chars before and after) + start = max(0, col - 30) + end = min(len(text_content), col + 30) + context = text_content[start:end] + marker_pos = min(30, col - start) + + logging.error(f"Context around column {col} in preprocessed text:") + logging.error(f" {context}") + logging.error(f" {' ' * marker_pos}^ (error position)") + + logging.error("=" * 80) + # Re-throw the error for now + raise + body = htmlxml.find('.//body') bstyle = dict([kv.split(':') for kv in body.get('style').lstrip(' ').rstrip(';').split('; ')]) try: @@ -676,8 +810,15 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are # Concatenating them to just one index entry seems to work in practice indexEntryText = None + # Track all direct children of body to validate we process everything + all_body_children = list(body) + unprocessed_children = set(all_body_children) # Will remove elements as we process them + htmlparas = body.findall(".//p") + for p in htmlparas: + # Mark this paragraph as processed + unprocessed_children.discard(p) maxfs = 0 # cannot use the bodyfs as a default, there may not actually be any text at body size if p.get('align') == 'center': pdf_styleN.alignment = reportlab.lib.enums.TA_CENTER @@ -703,6 +844,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight htmlspans = p.findall(".*") + logging.debug(f"Paragraph has {len(htmlspans)} child elements") + for child in htmlspans[:3]: # Log first 3 to see what they are + logging.debug(f" Child tag: {child.tag}") if len(htmlspans) < 1: # i.e. there are no spans, just a paragraph paragraphText = '' paragraphText, maxfs = AppendItemTextInStyle(paragraphText, p.text, p, pdf, @@ -779,7 +923,7 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are paragraphText = AppendText(paragraphText, html.escape(span.tail)) else: - logging.warning(f"Ignoring unhandled tag {item.tag}") + logging.warning(f"Ignoring unhandled tag {item.tag} in text area (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)") # try to create a paragraph with the current text and style. Catch errors. try: @@ -790,6 +934,143 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are except Exception: logging.exception('Exception') + # Process
      (unordered list) elements - bulleted lists + htmllists = body.findall("ul") + + for ul in htmllists: + # Mark this list as processed + unprocessed_children.discard(ul) + + listitems = ul.findall("li") + + for li in listitems: + maxfs = 0 + + # Create a copy of the style for this list item with hanging indent + list_styleN = ParagraphStyle('list_item', parent=pdf_styleN) + # Hanging indent: first line at 0, subsequent lines indented + # Calculate indent based on font size - approximately 2x the font size + # accounts for bullet width + space + bullet_indent = bodyfs * 1.65 # Adjust multiplier if needed (1.5 - 2.5 range) + list_styleN.leftIndent = bullet_indent # Where wrapped lines start + list_styleN.firstLineIndent = -bullet_indent/2 # Pull first line (with bullet) back halfway position 0 + bullet_txt = '• ' + + # Check alignment (though lists are typically left-aligned) + if li.get('align') == 'center': + list_styleN.alignment = reportlab.lib.enums.TA_CENTER + elif li.get('align') == 'right': + list_styleN.alignment = reportlab.lib.enums.TA_RIGHT + elif li.get('align') == 'justify': + list_styleN.alignment = reportlab.lib.enums.TA_JUSTIFY + else: + list_styleN.alignment = reportlab.lib.enums.TA_LEFT + + # Get line height from
    • style if present + pLineHeight = 1.0 + liStyleAttribute = li.get('style') + if liStyleAttribute is not None: + liStyle = dict([kv.split(':') for kv in + li.get('style').lstrip(' ').rstrip(';').split('; ')]) + if 'line-height' in liStyle.keys(): + try: + pLineHeight = floor(float(liStyle['line-height'].strip("%")))/100.0 + except: # noqa: E722 + logging.warning(f"Ignoring invalid list item line-height setting {liStyleAttribute}") + finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight + + # Start paragraph - we'll add bullet inside the styled text + paragraphText = '' + + # Check if there are child elements (spans, br, etc.) + lispans = li.findall(".*") + + if len(lispans) < 1: + # Simple list item with just text, no spans + # Prepend bullet to the text so it gets styled + bullet_plus_text = bullet_txt + (li.text != None and li.text or "") + paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf, + additional_fonts, bodyfont, bodyfs, bweight, bstyle) + paragraphText += '' + usefs = maxfs if maxfs > 0 else bodyfs + list_styleN.leading = usefs * finalLeadingFactor + pdf_flowableList.append(Paragraph(paragraphText, list_styleN)) + else: + # List item with spans and other formatting + bullet_plus_text = bullet_txt + (li.text != None and li.text or "") + paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf, + additional_fonts, bodyfont, bodyfs, bweight, bstyle) + paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf, + additional_fonts, bodyfont, bodyfs, bweight, bstyle) + usefs = maxfs if maxfs > 0 else bodyfs + list_styleN.leading = usefs * finalLeadingFactor + + # Process child elements (spans, br, etc.) + for item in lispans: + if item.tag == 'br': + br = item + # For lists, we don't typically break into multiple paragraphs on
      + # Instead, insert a line break within the same paragraph + paragraphText += '
      ' + if br.tail: + paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, li, pdf, + additional_fonts, bodyfont, bodyfs, bweight, bstyle) + + elif item.tag == 'span': + span = item + spanfont, spanfs, spanweight, spanstyle = CollectFontInfo(span, pdf, additional_fonts, bodyfont, bodyfs, bweight) + + maxfs = max(maxfs, spanfs) + + paragraphText = AppendSpanStart(paragraphText, spanfont, spanfs, spanweight, spanstyle, bstyle) + + if span.text is not None: + paragraphText = AppendText(paragraphText, html.escape(span.text)) + + # Handle line breaks within spans + brs = span.findall(".//br") + if len(brs) > 0: + paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle) + for br in brs: + paragraphText += '
      ' + if br.tail: + paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, span, pdf, + additional_fonts, bodyfont, bodyfs, bweight, bstyle) + else: + paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle) + + if span.tail is not None: + paragraphText = AppendText(paragraphText, html.escape(span.tail)) + + else: + logging.warning(f"Ignoring unhandled tag {item.tag} in list item (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)") + + # Finalize the list item paragraph + try: + paragraphText += '' + usefs = maxfs if maxfs > 0 else bodyfs + list_styleN.leading = usefs * finalLeadingFactor + pdf_flowableList.append(Paragraph(paragraphText, list_styleN)) + except Exception: + logging.exception('Exception') + + # The tag contains margin info, not actual content - mark it as processed + table = body.find('table') + if table is not None: + unprocessed_children.discard(table) + + # Validate: warn about any body children that we didn't process + if unprocessed_children: + logging.warning("=" * 80) + logging.warning("TEXT CONTENT BEING SILENTLY IGNORED!") + logging.warning(f"Found {len(unprocessed_children)} unprocessed elements as direct children of :") + for child in unprocessed_children: + child_text = ''.join(child.itertext())[:100] # Get text content, first 100 chars + logging.warning(f" Ignoring <{child.tag}> with {len(list(child))} children") + logging.warning(f" Text content preview: {child_text}") + logging.warning(f" XML: {etree.tostring(child, encoding='unicode')[:200]}...") + logging.warning("=" * 80) + if indexEntryText: albumIndex.AddIndexEntry(pgno, indexEntryText)