Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 283 additions & 2 deletions cewe2pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from reportlab.platypus import Paragraph, Table
from reportlab.lib.styles import ParagraphStyle
# from reportlab.lib.styles import getSampleStyleSheet

import numpy as np
Expand Down Expand Up @@ -600,7 +601,140 @@ def warnAndIgnoreEnabledDecorationShadow(decoration):

def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, areaWidth, pdf, transx, transy, pgno): # noqa: C901 (too complex)
# note: it would be better to use proper html processing here
htmlxml = etree.XML(textTag.text)

# Preprocess text to fix CEWE bugs: merge duplicate style attributes
# CEWE sometimes generates invalid XML like: <li style="..." style="...">
# We need to merge these into a single style attribute
import re

def merge_duplicate_styles(match):
"""Merge duplicate style attributes in a single tag."""
full_tag = match.group(0) # e.g., '<li style="..." style="...">'

# Find all style="..." attributes in this specific tag
style_pattern = r'style="([^"]*)"'
styles = re.findall(style_pattern, full_tag)

if len(styles) <= 1:
# No duplicates, return unchanged
return full_tag

# Log warning about duplicate styles with context
# Extract tag name for context
tag_name_match = re.match(r'<(\w+)', full_tag)
tag_name = tag_name_match.group(1) if tag_name_match else 'unknown'

# Get position of this tag in the original text to show nearby text content
tag_pos = textTag.text.find(full_tag)
if tag_pos >= 0:
# Find some actual text content near this tag (not HTML tags)
# Look ahead after this tag for text content
search_start = tag_pos + len(full_tag)
search_end = min(len(textTag.text), search_start + 200)
nearby = textTag.text[search_start:search_end]
# Extract text between tags
text_content = re.sub(r'<[^>]*>', '', nearby)[:20].strip()
context = f"near text: '{text_content}'" if text_content else "at start/end"
else:
context = ""

logging.warning(f"Merging duplicate 'style' attributes in <{tag_name}> tag ({len(styles)} instances) {context}")
logging.warning(f" Styles: {styles}")

# Merge all style values
merged_parts = []
for s in styles:
s = s.strip()
if s:
# Ensure ends with semicolon for proper CSS
if not s.endswith(';'):
s += ';'
merged_parts.append(s)
merged_style = ' '.join(merged_parts).strip()

# Replace: keep first style="..." and remove all subsequent ones
# First, remove ALL style attributes
tag_without_styles = re.sub(style_pattern, '', full_tag)

# Then add the merged style back as the first attribute
# Find position after tag name to insert style
tag_name_match = re.match(r'(<\w+)(\s|>)', tag_without_styles)
if tag_name_match:
prefix = tag_name_match.group(1) # e.g., '<li'
rest = tag_without_styles[len(prefix):] # everything after tag name
return f'{prefix} style="{merged_style}"{rest}'

# Fallback: shouldn't reach here, but return original if parsing fails
return full_tag

# Process each opening tag, merging duplicate style attributes
text_content = re.sub(r'<\w+[^>]*>', merge_duplicate_styles, textTag.text)

# Validate that we haven't lost any actual text content (only fixed attributes)
# Strip all HTML tags and compare character counts
original_text_only = re.sub(r'<[^>]*>', '', textTag.text)
processed_text_only = re.sub(r'<[^>]*>', '', text_content)

if len(original_text_only) != len(processed_text_only):
logging.error("=" * 80)
logging.error("PREPROCESSING VALIDATION FAILED: Text content length changed!")
logging.error(f"Original text-only length: {len(original_text_only)}")
logging.error(f"Processed text-only length: {len(processed_text_only)}")
logging.error(f"Difference: {len(processed_text_only) - len(original_text_only)} characters")
logging.error("-" * 80)
logging.error("Original text-only content:")
logging.error(original_text_only)
logging.error("-" * 80)
logging.error("Processed text-only content:")
logging.error(processed_text_only)
logging.error("=" * 80)
raise ValueError("Text preprocessing corrupted content - text length mismatch")

try:
htmlxml = etree.XML(text_content)
# Log what we successfully parsed
body = htmlxml.find('.//body')
if body is not None:
# Log all direct children of body to see structure
body_children = list(body)
else:
logging.warning("No <body> tag found in parsed HTML!")
except etree.XMLSyntaxError as e:
# Log detailed error information for debugging XML parsing issues
logging.error("=" * 80)
logging.error("XML PARSING ERROR in text area")
logging.error(f"Error: {e}")
logging.error(f"Original text content ({len(textTag.text)} characters):")
logging.error(textTag.text)
logging.error("-" * 80)
logging.error(f"Preprocessed text content ({len(text_content)} characters):")
logging.error(text_content)
logging.error("-" * 80)

# Try to highlight the problematic portion based on column number
if hasattr(e, 'position') and e.position:
col = e.position[1] if len(e.position) > 1 else None
else:
# Try to extract column from error message (e.g., "column 3838")
import re
match = re.search(r'column (\d+)', str(e))
col = int(match.group(1)) if match else None

if col is not None:
# Show context around the error (30 chars before and after)
start = max(0, col - 30)
end = min(len(text_content), col + 30)
context = text_content[start:end]
marker_pos = min(30, col - start)

logging.error(f"Context around column {col} in preprocessed text:")
logging.error(f" {context}")
logging.error(f" {' ' * marker_pos}^ (error position)")

logging.error("=" * 80)
# Re-throw the error for now
raise

body = htmlxml.find('.//body')
bstyle = dict([kv.split(':') for kv in body.get('style').lstrip(' ').rstrip(';').split('; ')])
try:
Expand Down Expand Up @@ -676,8 +810,15 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
# Concatenating them to just one index entry seems to work in practice
indexEntryText = None

# Track all direct children of body to validate we process everything
all_body_children = list(body)
unprocessed_children = set(all_body_children) # Will remove elements as we process them

htmlparas = body.findall(".//p")

for p in htmlparas:
# Mark this paragraph as processed
unprocessed_children.discard(p)
maxfs = 0 # cannot use the bodyfs as a default, there may not actually be any text at body size
if p.get('align') == 'center':
pdf_styleN.alignment = reportlab.lib.enums.TA_CENTER
Expand All @@ -703,6 +844,9 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight

htmlspans = p.findall(".*")
logging.debug(f"Paragraph has {len(htmlspans)} child elements")
for child in htmlspans[:3]: # Log first 3 to see what they are
logging.debug(f" Child tag: {child.tag}")
if len(htmlspans) < 1: # i.e. there are no spans, just a paragraph
paragraphText = '<para autoLeading="max">'
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, p.text, p, pdf,
Expand Down Expand Up @@ -779,7 +923,7 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
paragraphText = AppendText(paragraphText, html.escape(span.tail))

else:
logging.warning(f"Ignoring unhandled tag {item.tag}")
logging.warning(f"Ignoring unhandled tag {item.tag} in text area (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")

# try to create a paragraph with the current text and style. Catch errors.
try:
Expand All @@ -790,6 +934,143 @@ def processAreaTextTag(textTag, additional_fonts, area, areaHeight, areaRot, are
except Exception:
logging.exception('Exception')

# Process <ul> (unordered list) elements - bulleted lists
htmllists = body.findall("ul")

for ul in htmllists:
# Mark this list as processed
unprocessed_children.discard(ul)

listitems = ul.findall("li")

for li in listitems:
maxfs = 0

# Create a copy of the style for this list item with hanging indent
list_styleN = ParagraphStyle('list_item', parent=pdf_styleN)
# Hanging indent: first line at 0, subsequent lines indented
# Calculate indent based on font size - approximately 2x the font size
# accounts for bullet width + space
bullet_indent = bodyfs * 1.65 # Adjust multiplier if needed (1.5 - 2.5 range)
list_styleN.leftIndent = bullet_indent # Where wrapped lines start
list_styleN.firstLineIndent = -bullet_indent/2 # Pull first line (with bullet) back halfway position 0
bullet_txt = '• '

# Check alignment (though lists are typically left-aligned)
if li.get('align') == 'center':
list_styleN.alignment = reportlab.lib.enums.TA_CENTER
elif li.get('align') == 'right':
list_styleN.alignment = reportlab.lib.enums.TA_RIGHT
elif li.get('align') == 'justify':
list_styleN.alignment = reportlab.lib.enums.TA_JUSTIFY
else:
list_styleN.alignment = reportlab.lib.enums.TA_LEFT

# Get line height from <li> style if present
pLineHeight = 1.0
liStyleAttribute = li.get('style')
if liStyleAttribute is not None:
liStyle = dict([kv.split(':') for kv in
li.get('style').lstrip(' ').rstrip(';').split('; ')])
if 'line-height' in liStyle.keys():
try:
pLineHeight = floor(float(liStyle['line-height'].strip("%")))/100.0
except: # noqa: E722
logging.warning(f"Ignoring invalid list item line-height setting {liStyleAttribute}")
finalLeadingFactor = LineScales.lineScaleForFont(bodyfont) * pLineHeight

# Start paragraph - we'll add bullet inside the styled text
paragraphText = '<para autoLeading="max">'

# Check if there are child elements (spans, br, etc.)
lispans = li.findall(".*")

if len(lispans) < 1:
# Simple list item with just text, no spans
# Prepend bullet to the text so it gets styled
bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
additional_fonts, bodyfont, bodyfs, bweight, bstyle)
paragraphText += '</para>'
usefs = maxfs if maxfs > 0 else bodyfs
list_styleN.leading = usefs * finalLeadingFactor
pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
else:
# List item with spans and other formatting
bullet_plus_text = bullet_txt + (li.text != None and li.text or "")
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
additional_fonts, bodyfont, bodyfs, bweight, bstyle)
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, bullet_plus_text, li, pdf,
additional_fonts, bodyfont, bodyfs, bweight, bstyle)
usefs = maxfs if maxfs > 0 else bodyfs
list_styleN.leading = usefs * finalLeadingFactor

# Process child elements (spans, br, etc.)
for item in lispans:
if item.tag == 'br':
br = item
# For lists, we don't typically break into multiple paragraphs on <br>
# Instead, insert a line break within the same paragraph
paragraphText += '<br/>'
if br.tail:
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, li, pdf,
additional_fonts, bodyfont, bodyfs, bweight, bstyle)

elif item.tag == 'span':
span = item
spanfont, spanfs, spanweight, spanstyle = CollectFontInfo(span, pdf, additional_fonts, bodyfont, bodyfs, bweight)

maxfs = max(maxfs, spanfs)

paragraphText = AppendSpanStart(paragraphText, spanfont, spanfs, spanweight, spanstyle, bstyle)

if span.text is not None:
paragraphText = AppendText(paragraphText, html.escape(span.text))

# Handle line breaks within spans
brs = span.findall(".//br")
if len(brs) > 0:
paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)
for br in brs:
paragraphText += '<br/>'
if br.tail:
paragraphText, maxfs = AppendItemTextInStyle(paragraphText, br.tail, span, pdf,
additional_fonts, bodyfont, bodyfs, bweight, bstyle)
else:
paragraphText = AppendSpanEnd(paragraphText, spanweight, spanstyle, bstyle)

if span.tail is not None:
paragraphText = AppendText(paragraphText, html.escape(span.tail))

else:
logging.warning(f"Ignoring unhandled tag {item.tag} in list item (tag content: {etree.tostring(item, encoding='unicode')[:100]}...)")

# Finalize the list item paragraph
try:
paragraphText += '</para>'
usefs = maxfs if maxfs > 0 else bodyfs
list_styleN.leading = usefs * finalLeadingFactor
pdf_flowableList.append(Paragraph(paragraphText, list_styleN))
except Exception:
logging.exception('Exception')

# The <table> tag contains margin info, not actual content - mark it as processed
table = body.find('table')
if table is not None:
unprocessed_children.discard(table)

# Validate: warn about any body children that we didn't process
if unprocessed_children:
logging.warning("=" * 80)
logging.warning("TEXT CONTENT BEING SILENTLY IGNORED!")
logging.warning(f"Found {len(unprocessed_children)} unprocessed elements as direct children of <body>:")
for child in unprocessed_children:
child_text = ''.join(child.itertext())[:100] # Get text content, first 100 chars
logging.warning(f" Ignoring <{child.tag}> with {len(list(child))} children")
logging.warning(f" Text content preview: {child_text}")
logging.warning(f" XML: {etree.tostring(child, encoding='unicode')[:200]}...")
logging.warning("=" * 80)

if indexEntryText:
albumIndex.AddIndexEntry(pgno, indexEntryText)

Expand Down