185 changes: 132 additions & 53 deletions reader3.py
@@ -192,7 +192,7 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
     image_map = {}  # Key: internal_path, Value: local_relative_path
 
     for item in book.get_items():
-        if item.get_type() == ebooklib.ITEM_IMAGE:
+        if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
             # Normalize filename
             original_fname = os.path.basename(item.get_name())
             # Sanitize filename for OS
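
Note on the widened check above: ebooklib types cover images separately from ordinary images, so a plain ITEM_IMAGE test skips covers declared via the EPUB cover property, and they would never land in image_map. A minimal sketch of the widened enumeration, assuming a hypothetical sample.epub:

import ebooklib
from ebooklib import epub

book = epub.read_epub("sample.epub")  # hypothetical input path
for item in book.get_items():
    # ITEM_COVER is a distinct item type in ebooklib, so it must be
    # listed explicitly alongside ITEM_IMAGE to pick up cover art.
    if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
        print(item.get_name(), item.media_type)
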
@@ -216,61 +216,140 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
print("Warning: Empty TOC, building fallback from Spine...")
toc_structure = get_fallback_toc(book)

# 6. Process Content (Spine-based to preserve HTML validity)
# 6. Determine reading order (Spine, TOC, or manifest fallback)
+    def normalize_href(href: str) -> str:
+        if not href:
+            return ""
+        value = href.strip()
+        if value.startswith("./"):
+            value = value[2:]
+        try:
+            value = unquote(value)
+        except Exception:
+            pass
+        return value
+
+    def is_document_item(item) -> bool:
+        name = (item.get_name() or "").lower()
+        media_type = (getattr(item, "media_type", None) or "").lower()
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            return True
+        if media_type in ("text/html", "application/xhtml+xml"):
+            return True
+        return name.endswith((".html", ".xhtml", ".htm"))
+
+    doc_items = [item for item in book.get_items() if is_document_item(item)]
+    doc_by_full = {item.get_name(): item for item in doc_items}
+    doc_by_base = {}
+    for item in doc_items:
+        base = os.path.basename(item.get_name())
+        if base not in doc_by_base:
+            doc_by_base[base] = item
+        else:
+            # Avoid ambiguous basename matches.
+            doc_by_base[base] = None
+
+    def resolve_doc_item(href: str):
+        cleaned = normalize_href(href)
+        if not cleaned:
+            return None
+        if cleaned in doc_by_full:
+            return doc_by_full[cleaned]
+        base = os.path.basename(cleaned)
+        if base in doc_by_full:
+            return doc_by_full[base]
+        if base in doc_by_base and doc_by_base[base] is not None:
+            return doc_by_base[base]
+        return None
+
+    toc_files = []
+
+    def collect_toc_files(entries):
+        for entry in entries:
+            if entry.file_href:
+                toc_files.append(entry.file_href)
+            if entry.children:
+                collect_toc_files(entry.children)
+
+    collect_toc_files(toc_structure)
+
+    ordered_items = []
+    seen_ids = set()
+
+    if toc_files:
+        for href in toc_files:
+            item = resolve_doc_item(href)
+            if item and item.get_id() not in seen_ids:
+                ordered_items.append(item)
+                seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        # Use spine order if present.
+        for item_id, _linear in book.spine:
+            item = book.get_item_with_id(item_id)
+            if item and item.get_type() == ebooklib.ITEM_DOCUMENT:
+                if item.get_id() not in seen_ids:
+                    ordered_items.append(item)
+                    seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        ordered_items = doc_items[:]
+        seen_ids = {item.get_id() for item in ordered_items}
+
+    # Append any remaining document items not referenced in TOC/spine.
+    for item in doc_items:
+        if item.get_id() not in seen_ids:
+            ordered_items.append(item)
+            seen_ids.add(item.get_id())
+
+    # 7. Process Content (Ordered documents to preserve HTML validity)
print("Processing chapters...")
spine_chapters = []

# We iterate over the spine (linear reading order)
for i, spine_item in enumerate(book.spine):
item_id, linear = spine_item
item = book.get_item_with_id(item_id)

if not item:
continue

if item.get_type() == ebooklib.ITEM_DOCUMENT:
# Raw content
raw_content = item.get_content().decode('utf-8', errors='ignore')
soup = BeautifulSoup(raw_content, 'html.parser')

# A. Fix Images
for img in soup.find_all('img'):
src = img.get('src', '')
if not src: continue

# Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
src_decoded = unquote(src)
filename = os.path.basename(src_decoded)

# Try to find in map
if src_decoded in image_map:
img['src'] = image_map[src_decoded]
elif filename in image_map:
img['src'] = image_map[filename]

# B. Clean HTML
soup = clean_html_content(soup)

# C. Extract Body Content only
body = soup.find('body')
if body:
# Extract inner HTML of body
final_html = "".join([str(x) for x in body.contents])
else:
final_html = str(soup)

# D. Create Object
chapter = ChapterContent(
id=item_id,
href=item.get_name(), # Important: This links TOC to Content
title=f"Section {i+1}", # Fallback, real titles come from TOC
content=final_html,
text=extract_plain_text(soup),
order=i
)
spine_chapters.append(chapter)

# 7. Final Assembly
+    for i, item in enumerate(ordered_items):
+        # Raw content
+        raw_content = item.get_content().decode('utf-8', errors='ignore')
+        soup = BeautifulSoup(raw_content, 'html.parser')
+
+        # A. Fix Images
+        for img in soup.find_all('img'):
+            src = img.get('src', '')
+            if not src:
+                continue
+
+            # Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
+            src_decoded = unquote(src)
+            filename = os.path.basename(src_decoded)
+
+            # Try to find in map
+            if src_decoded in image_map:
+                img['src'] = image_map[src_decoded]
+            elif filename in image_map:
+                img['src'] = image_map[filename]
+
+        # B. Clean HTML
+        soup = clean_html_content(soup)
+
+        # C. Extract Body Content only
+        body = soup.find('body')
+        if body:
+            # Extract inner HTML of body
+            final_html = "".join([str(x) for x in body.contents])
+        else:
+            final_html = str(soup)
+
+        # D. Create Object
+        chapter = ChapterContent(
+            id=item.get_id(),
+            href=item.get_name(),  # Important: This links TOC to Content
+            title=f"Section {i+1}",  # Fallback, real titles come from TOC
+            content=final_html,
+            text=extract_plain_text(soup),
+            order=i
+        )
+        spine_chapters.append(chapter)
+
+    # 8. Final Assembly
     final_book = Book(
         metadata=metadata,
         spine=spine_chapters,
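
The net ordering policy the new code establishes is: TOC hrefs first, then spine order, then raw manifest order, with any unreferenced documents appended at the end so no content is silently dropped. A standalone sketch of the href matching this relies on, with illustrative lookup tables standing in for the ones the PR builds from book.get_items():

import os
from urllib.parse import unquote

def normalize_href(href: str) -> str:
    # Mirrors the PR helper: trim, drop a leading "./", percent-decode.
    value = (href or "").strip()
    if value.startswith("./"):
        value = value[2:]
    return unquote(value)

doc_by_full = {"text/ch01.xhtml": "id-ch01"}  # manifest path -> item id (illustrative)
doc_by_base = {"ch01.xhtml": "id-ch01"}       # unambiguous basename -> item id

def resolve(href: str):
    cleaned = normalize_href(href)
    if cleaned in doc_by_full:
        return doc_by_full[cleaned]
    return doc_by_base.get(os.path.basename(cleaned))

print(resolve("./text/ch01.xhtml"))  # -> id-ch01 (full-path match)
print(resolve("ch01%2Exhtml"))       # -> id-ch01 (percent-decoded basename match)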