185 changes: 132 additions & 53 deletions reader3.py
@@ -192,7 +192,7 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
     image_map = {}  # Key: internal_path, Value: local_relative_path
 
     for item in book.get_items():
-        if item.get_type() == ebooklib.ITEM_IMAGE:
+        if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
             # Normalize filename
             original_fname = os.path.basename(item.get_name())
             # Sanitize filename for OS
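
Note on the widened check above: ebooklib types cover images separately from ordinary images, so a plain ITEM_IMAGE test skips covers declared via the EPUB cover property, and they would never land in image_map. A minimal sketch of the widened enumeration, assuming a hypothetical sample.epub:

import ebooklib
from ebooklib import epub

book = epub.read_epub("sample.epub")  # hypothetical input path
for item in book.get_items():
    # ITEM_COVER is a distinct item type in ebooklib, so it must be
    # listed explicitly alongside ITEM_IMAGE to pick up cover art.
    if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
        print(item.get_name(), item.media_type)
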
@@ -216,61 +216,140 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
print("Warning: Empty TOC, building fallback from Spine...")
toc_structure = get_fallback_toc(book)

# 6. Process Content (Spine-based to preserve HTML validity)
# 6. Determine reading order (Spine, TOC, or manifest fallback)
+    def normalize_href(href: str) -> str:
+        if not href:
+            return ""
+        value = href.strip()
+        if value.startswith("./"):
+            value = value[2:]
+        try:
+            value = unquote(value)
+        except Exception:
+            pass
+        return value
+
+    def is_document_item(item) -> bool:
+        name = (item.get_name() or "").lower()
+        media_type = (getattr(item, "media_type", None) or "").lower()
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            return True
+        if media_type in ("text/html", "application/xhtml+xml"):
+            return True
+        return name.endswith((".html", ".xhtml", ".htm"))
+
+    doc_items = [item for item in book.get_items() if is_document_item(item)]
+    doc_by_full = {item.get_name(): item for item in doc_items}
+    doc_by_base = {}
+    for item in doc_items:
+        base = os.path.basename(item.get_name())
+        if base not in doc_by_base:
+            doc_by_base[base] = item
+        else:
+            # Avoid ambiguous basename matches.
+            doc_by_base[base] = None
+
+    def resolve_doc_item(href: str):
+        cleaned = normalize_href(href)
+        if not cleaned:
+            return None
+        if cleaned in doc_by_full:
+            return doc_by_full[cleaned]
+        base = os.path.basename(cleaned)
+        if base in doc_by_full:
+            return doc_by_full[base]
+        if base in doc_by_base and doc_by_base[base] is not None:
+            return doc_by_base[base]
+        return None
+
+    toc_files = []
+
+    def collect_toc_files(entries):
+        for entry in entries:
+            if entry.file_href:
+                toc_files.append(entry.file_href)
+            if entry.children:
+                collect_toc_files(entry.children)
+
+    collect_toc_files(toc_structure)
+
+    ordered_items = []
+    seen_ids = set()
+
+    if toc_files:
+        for href in toc_files:
+            item = resolve_doc_item(href)
+            if item and item.get_id() not in seen_ids:
+                ordered_items.append(item)
+                seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        # Use spine order if present.
+        for item_id, _linear in book.spine:
+            item = book.get_item_with_id(item_id)
+            if item and item.get_type() == ebooklib.ITEM_DOCUMENT:
+                if item.get_id() not in seen_ids:
+                    ordered_items.append(item)
+                    seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        ordered_items = doc_items[:]
+        seen_ids = {item.get_id() for item in ordered_items}
+
+    # Append any remaining document items not referenced in TOC/spine.
+    for item in doc_items:
+        if item.get_id() not in seen_ids:
+            ordered_items.append(item)
+            seen_ids.add(item.get_id())
+
+    # 7. Process Content (Ordered documents to preserve HTML validity)
print("Processing chapters...")
spine_chapters = []

# We iterate over the spine (linear reading order)
for i, spine_item in enumerate(book.spine):
item_id, linear = spine_item
item = book.get_item_with_id(item_id)

if not item:
continue

if item.get_type() == ebooklib.ITEM_DOCUMENT:
# Raw content
raw_content = item.get_content().decode('utf-8', errors='ignore')
soup = BeautifulSoup(raw_content, 'html.parser')

# A. Fix Images
for img in soup.find_all('img'):
src = img.get('src', '')
if not src: continue

# Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
src_decoded = unquote(src)
filename = os.path.basename(src_decoded)

# Try to find in map
if src_decoded in image_map:
img['src'] = image_map[src_decoded]
elif filename in image_map:
img['src'] = image_map[filename]

# B. Clean HTML
soup = clean_html_content(soup)

# C. Extract Body Content only
body = soup.find('body')
if body:
# Extract inner HTML of body
final_html = "".join([str(x) for x in body.contents])
else:
final_html = str(soup)

# D. Create Object
chapter = ChapterContent(
id=item_id,
href=item.get_name(), # Important: This links TOC to Content
title=f"Section {i+1}", # Fallback, real titles come from TOC
content=final_html,
text=extract_plain_text(soup),
order=i
)
spine_chapters.append(chapter)

# 7. Final Assembly
+    for i, item in enumerate(ordered_items):
+        # Raw content
+        raw_content = item.get_content().decode('utf-8', errors='ignore')
+        soup = BeautifulSoup(raw_content, 'html.parser')
+
+        # A. Fix Images
+        for img in soup.find_all('img'):
+            src = img.get('src', '')
+            if not src:
+                continue
+
+            # Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
+            src_decoded = unquote(src)
+            filename = os.path.basename(src_decoded)
+
+            # Try to find in map
+            if src_decoded in image_map:
+                img['src'] = image_map[src_decoded]
+            elif filename in image_map:
+                img['src'] = image_map[filename]
+
+        # B. Clean HTML
+        soup = clean_html_content(soup)
+
+        # C. Extract Body Content only
+        body = soup.find('body')
+        if body:
+            # Extract inner HTML of body
+            final_html = "".join([str(x) for x in body.contents])
+        else:
+            final_html = str(soup)
+
+        # D. Create Object
+        chapter = ChapterContent(
+            id=item.get_id(),
+            href=item.get_name(),  # Important: This links TOC to Content
+            title=f"Section {i+1}",  # Fallback, real titles come from TOC
+            content=final_html,
+            text=extract_plain_text(soup),
+            order=i
+        )
+        spine_chapters.append(chapter)
+
+    # 8. Final Assembly
     final_book = Book(
         metadata=metadata,
         spine=spine_chapters,
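
The net ordering policy the new code establishes is: TOC hrefs first, then spine order, then raw manifest order, with any unreferenced documents appended at the end so no content is silently dropped. A standalone sketch of the href matching this relies on, with illustrative lookup tables standing in for the ones the PR builds from book.get_items():

import os
from urllib.parse import unquote

def normalize_href(href: str) -> str:
    # Mirrors the PR helper: trim, drop a leading "./", percent-decode.
    value = (href or "").strip()
    if value.startswith("./"):
        value = value[2:]
    return unquote(value)

doc_by_full = {"text/ch01.xhtml": "id-ch01"}  # manifest path -> item id (illustrative)
doc_by_base = {"ch01.xhtml": "id-ch01"}       # unambiguous basename -> item id

def resolve(href: str):
    cleaned = normalize_href(href)
    if cleaned in doc_by_full:
        return doc_by_full[cleaned]
    return doc_by_base.get(os.path.basename(cleaned))

print(resolve("./text/ch01.xhtml"))  # -> id-ch01 (full-path match)
print(resolve("ch01%2Exhtml"))       # -> id-ch01 (percent-decoded basename match)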