diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a05d772 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,95 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Reader3 is a lightweight, self-hosted EPUB reader web application. The core workflow is: +1. Process EPUB files into structured data using `reader3.py` +2. Serve the processed books via a FastAPI web server using `server.py` +3. Read books chapter-by-chapter with a clean web interface optimized for copying content to LLMs + +## Development Commands + +### Setup and Dependencies +The project uses [uv](https://docs.astral.sh/uv/) for dependency management. Python 3.10+ is required. + +```bash +# Process an EPUB file (creates a {book_name}_data directory) +uv run reader3.py path/to/book.epub + +# Start the web server (runs at http://127.0.0.1:8123) +uv run server.py +``` + +### Library Management +- Books are stored as `{book_name}_data/` directories containing: + - `book.pkl` - Pickled Book object with metadata, spine, TOC, and content + - `images/` - Extracted images from the EPUB +- To remove a book: delete its `_data` directory +- Server auto-discovers all `*_data` directories in the root folder + +## Architecture + +### Core Data Model (reader3.py) + +**Book Processing Pipeline:** +1. `process_epub()` - Main entry point that orchestrates EPUB parsing +2. EPUB parsing via ebooklib → extracts metadata, spine (linear reading order), TOC (navigation tree), and images +3. HTML cleaning → removes scripts, styles, forms, dangerous elements +4. Image path rewriting → converts EPUB-internal paths to local `images/{filename}` paths +5. Serialization → entire Book object pickled to `book.pkl` + +**Key Data Structures:** +- `Book` - Master container with metadata, spine, toc, and image map +- `ChapterContent` - Represents a physical file in the EPUB spine (linear reading order). 
Contains cleaned HTML content and extracted plain text +- `TOCEntry` - Logical navigation entry (may have nested children). Maps to spine files via href matching +- `BookMetadata` - Standard Dublin Core (DC) metadata (title, authors, publisher, etc.) + +**Critical Distinction:** +- **Spine** = Physical reading order (files as they appear in EPUB) +- **TOC** = Logical navigation tree (may reference multiple positions in the same file via anchors) +- Server routes use spine indices (`/read/{book_id}/{chapter_index}`) for linear navigation +- TOC entries map to spine via filename matching in JavaScript (see reader.html:124-151) + +### Web Server (server.py) + +**FastAPI Routes:** +- `GET /` - Library view listing all processed books +- `GET /read/{book_id}` - Redirects to first chapter (index 0) +- `GET /read/{book_id}/{chapter_index}` - Main reader interface with sidebar TOC +- `GET /read/{book_id}/images/{image_name}` - Serves extracted images + +**Book Loading:** +- `load_book_cached()` uses `@lru_cache(maxsize=10)` to avoid repeated disk reads +- Books are loaded from pickle files on-demand +- Cache key is the folder name (e.g., "dracula_data") + +### Frontend (templates/) + +**library.html** - Grid view of available books with basic metadata + +**reader.html** - Two-column layout: +- Left sidebar: Nested TOC navigation tree (rendered via Jinja2 recursive macro) +- Right panel: Current chapter content with Previous/Next navigation +- JavaScript spine map (lines 127-131) enables TOC → chapter index lookup +- `findAndGo()` function (lines 133-151) handles TOC link clicks by mapping filenames to spine indices + +## Dependencies + +From pyproject.toml: +- `ebooklib` - EPUB parsing and manipulation +- `beautifulsoup4` - HTML parsing and cleaning +- `fastapi` - Web framework +- `jinja2` - Template engine +- `uvicorn` - ASGI server + +## Project Philosophy + +This is a minimal, "vibe-coded" project (per README) designed to illustrate reading books with LLMs. 
def _render_pdf_pages(doc, first_page: int, last_page: int):
    """Return ``(html, text)`` for pages ``first_page..last_page`` of *doc*.

    Page indices are 0-based and inclusive; indices outside the document are
    clamped away rather than raising.
    """
    from html import escape  # local import keeps this block self-contained

    text_parts = []
    html_parts = ["<div>"]
    for page_index in range(max(first_page, 0), min(last_page, len(doc) - 1) + 1):
        page_text = doc[page_index].get_text()
        text_parts.append(page_text)
        # Escape BEFORE adding markup: extracted PDF text routinely contains
        # '<', '>' and '&', which would otherwise be parsed as HTML.
        page_html = escape(page_text).replace("\n", "<br>")
        html_parts.append(f"<div><p>{page_html}</p></div>")
    html_parts.append("</div>")
    return "".join(html_parts), " ".join(text_parts)


def _toc_page_ranges(toc_entries, page_count: int):
    """Convert ``(level, title, 1-based page)`` rows to ``(title, start, end)``.

    ``start``/``end`` are 0-based inclusive page indices. A section runs until
    the page before the next entry at the same or a shallower level, so a
    parent section deliberately spans the pages of its nested children.
    """
    ranges = []
    for idx, (level, entry_title, page_num) in enumerate(toc_entries):
        start = page_num - 1
        end = page_count - 1  # default: runs to the end of the document
        for next_level, _next_title, next_page in toc_entries[idx + 1:]:
            if next_level <= level:
                # next_page is 1-based, so the preceding page is next_page - 2.
                # Clamp so entries sharing a page (or listed out of order)
                # still yield their own start page instead of an empty range.
                end = max(start, next_page - 2)
                break
        ranges.append((entry_title, start, end))
    return ranges


def process_pdf(pdf_path: str, output_dir: str, pages_per_chapter: int = 10) -> Book:
    """
    Process a PDF file into a Book object.

    Chapters follow the PDF outline (bookmarks) when one is present; otherwise
    the document is split into fixed-size runs of pages.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory for the processed book. It is deleted and
            recreated (with an empty ``images/`` subdirectory) on every run.
        pages_per_chapter: Chunk size used only when the PDF has no usable
            outline. Must be at least 1.

    Returns:
        A Book whose spine holds one ChapterContent per outline entry (or per
        page chunk) and whose TOC is a flat list mirroring the spine.

    Raises:
        ValueError: If ``pages_per_chapter`` is smaller than 1.
    """
    if pages_per_chapter < 1:
        raise ValueError("pages_per_chapter must be at least 1")

    print(f"Loading {pdf_path}...")
    fallback_title = os.path.splitext(os.path.basename(pdf_path))[0]

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # 1. Extract Metadata
        metadata_dict = doc.metadata or {}
        title = (metadata_dict.get('title') or '').strip() or fallback_title

        author = metadata_dict.get('author', 'Unknown')
        authors = [author] if author else []

        metadata = BookMetadata(
            title=title,
            language="en",
            authors=authors,
            description=metadata_dict.get('subject'),
            # NOTE(review): 'producer' is used as a stand-in for the
            # publisher; confirm this is the intended mapping.
            publisher=metadata_dict.get('producer'),
            date=metadata_dict.get('creationDate')
        )

        # 2. Prepare Output Directory
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        images_dir = os.path.join(output_dir, 'images')
        os.makedirs(images_dir, exist_ok=True)

        # 3. Work out chapter page ranges
        print("Extracting Table of Contents...")
        page_count = len(doc)
        # simple=True yields exactly [level, title, page]; with simple=False
        # every row carries a fourth "dest" item and 3-name unpacking fails.
        # Rows whose page is outside 1..page_count (e.g. -1 for "no target")
        # are dropped so they cannot distort neighbouring ranges.
        toc_entries = [
            (level, entry_title, page_num)
            for level, entry_title, page_num in doc.get_toc(simple=True)
            if 1 <= page_num <= page_count
        ]

        if toc_entries:
            print(f"Found {len(toc_entries)} TOC entries")
            sections = _toc_page_ranges(toc_entries, page_count)
        else:
            print("No TOC found, using page-based chunking...")
            sections = []
            for chunk_start in range(0, page_count, pages_per_chapter):
                chunk_end = min(chunk_start + pages_per_chapter, page_count)
                sections.append(
                    (f"Pages {chunk_start + 1}-{chunk_end}", chunk_start, chunk_end - 1)
                )

        # 4. Build spine chapters plus a flat TOC that mirrors them
        spine_chapters = []
        toc_structure = []
        for order, (chapter_title, first_page, last_page) in enumerate(sections):
            content_html, content_text = _render_pdf_pages(doc, first_page, last_page)
            chapter_href = f"chapter_{order}.html"

            spine_chapters.append(ChapterContent(
                id=f"chapter_{order}",
                href=chapter_href,
                title=chapter_title,
                content=content_html,
                text=content_text,
                order=order
            ))
            toc_structure.append(TOCEntry(
                title=chapter_title,
                href=chapter_href,
                file_href=chapter_href,
                anchor=""
            ))

    # 5. Create Book object
    final_book = Book(
        metadata=metadata,
        spine=spine_chapters,
        toc=toc_structure,
        images={},  # PDF image extraction can be added later if needed
        source_file=os.path.basename(pdf_path),
        processed_at=datetime.now().isoformat()
    )

    return final_book
@app.get("/upload", response_class=HTMLResponse)
async def upload_page(request: Request):
    """Display the upload form."""
    return templates.TemplateResponse("upload.html", {"request": request})


@app.post("/upload")
async def upload_book(file: UploadFile = File(...)):
    """
    Handle book upload and processing.

    Accepts an EPUB or PDF file, processes it into ``<name>_data`` under
    BOOKS_DIR, and redirects to the library.

    Raises:
        HTTPException 400: Unsupported extension, or a file name that leaves
            nothing usable after sanitizing.
        HTTPException 500: Processing or saving the book failed.
    """
    processors = {'.epub': process_epub, '.pdf': process_pdf}

    # The client may omit the filename entirely; treat that as an empty name.
    filename = file.filename or ""
    base_name, file_ext = os.path.splitext(filename)
    file_ext = file_ext.lower()

    if file_ext not in processors:
        raise HTTPException(status_code=400, detail="Only EPUB and PDF files are supported")

    # Sanitize filename for directory name (also strips path separators and dots).
    safe_base_name = "".join(
        c for c in base_name if c.isalnum() or c in (' ', '-', '_')
    ).strip()
    if not safe_base_name:
        # Otherwise every such upload would target (and wipe) a bare "_data" dir.
        raise HTTPException(status_code=400, detail="File name contains no usable characters")

    out_dir = os.path.join(BOOKS_DIR, f"{safe_base_name}_data")

    tmp_path = None
    try:
        # Spool the upload to disk in chunks instead of holding it all in memory.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            tmp_path = tmp_file.name
            while chunk := await file.read(1024 * 1024):
                tmp_file.write(chunk)

        # NOTE: processing is synchronous and runs inside this async handler,
        # so other requests wait until it finishes.
        book_obj = processors[file_ext](tmp_path, out_dir)
        save_to_pickle(book_obj, out_dir)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing book: {str(e)}") from e
    finally:
        # Always remove the temporary copy, on success and on failure.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

    # Clear the cache so a re-uploaded book is not served from a stale entry.
    load_book_cached.cache_clear()

    # 303 makes the browser follow up with a GET of the library page.
    return RedirectResponse(url="/", status_code=303)

Library

+
+ Upload Book +
{% if not books %} -

No processed books found. Run reader3.py on an epub first.

+

No processed books found. Upload a book or run reader3.py on an epub/pdf file.

{% endif %}
diff --git a/templates/upload.html b/templates/upload.html new file mode 100644 index 0000000..0fd1884 --- /dev/null +++ b/templates/upload.html @@ -0,0 +1,55 @@ + + + + + + Upload Book + + + +
+

Upload Book

+ +
+ Upload an EPUB or PDF file to add it to your library. The file will be processed automatically. +
+ +
+
+ + +
+ + + Cancel +
+ +
+
+

Processing your book... This may take a minute.

+
+
+ + + + diff --git a/uv.lock b/uv.lock index e2e2f80..22ba505 100644 --- a/uv.lock +++ b/uv.lock @@ -481,6 +481,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, ] +[[package]] +name = "pymupdf" +version = "1.26.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/d7/a6f0e03a117fa2ad79c4b898203bb212b17804f92558a6a339298faca7bb/pymupdf-1.26.6.tar.gz", hash = "sha256:a2b4531cd4ab36d6f1f794bb6d3c33b49bda22f36d58bb1f3e81cbc10183bd2b", size = 84322494, upload-time = "2025-11-05T15:20:46.786Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/5c/dec354eee5fe4966c715f33818ed4193e0e6c986cf8484de35b6c167fb8e/pymupdf-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e46f320a136ad55e5219e8f0f4061bdf3e4c12b126d2740d5a49f73fae7ea176", size = 23178988, upload-time = "2025-11-05T14:31:19.834Z" }, + { url = "https://files.pythonhosted.org/packages/ec/a0/11adb742d18142bd623556cd3b5d64649816decc5eafd30efc9498657e76/pymupdf-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:6844cd2396553c0fa06de4869d5d5ecb1260e6fc3b9d85abe8fa35f14dd9d688", size = 22469764, upload-time = "2025-11-05T14:32:34.654Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c8/377cf20e31f58d4c243bfcf2d3cb7466d5b97003b10b9f1161f11eb4a994/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:617ba69e02c44f0da1c0e039ea4a26cf630849fd570e169c71daeb8ac52a81d6", size = 23502227, upload-time = "2025-11-06T11:03:56.934Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/6e02e3d84b32c137c71a0a3dcdba8f2f6e9950619a3bc272245c7c06a051/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:7777d0b7124c2ebc94849536b6a1fb85d158df3b9d873935e63036559391534c", size = 24115381, upload-time = "2025-11-05T14:33:54.338Z" }, + { url = "https://files.pythonhosted.org/packages/ab/9d/30f7fcb3776bfedde66c06297960debe4883b1667294a1ee9426c942e94d/pymupdf-1.26.6-cp310-abi3-win32.whl", hash = "sha256:8f3ef05befc90ca6bb0f12983200a7048d5bff3e1c1edef1bb3de60b32cb5274", size = 17203613, upload-time = "2025-11-05T17:19:47.494Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e8/989f4eaa369c7166dc24f0eaa3023f13788c40ff1b96701f7047421554a8/pymupdf-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:ce02ca96ed0d1acfd00331a4d41a34c98584d034155b06fd4ec0f051718de7ba", size = 18405680, upload-time = "2025-11-05T14:34:48.672Z" }, +] + +[[package]] +name = "python-multipart" +version = "0.0.20" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload-time = "2024-12-16T19:45:46.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" }, +] + [[package]] name = "reader3" version = "0.1.0" @@ -490,6 +513,8 @@ dependencies = [ { name = "ebooklib" }, { name = "fastapi" }, { name = "jinja2" }, + { name = "pymupdf" }, + { name = "python-multipart" }, { name = "uvicorn" }, ] @@ -499,6 +524,8 @@ requires-dist = [ { name = "ebooklib", specifier = ">=0.20" }, { name = "fastapi", specifier = ">=0.121.2" }, { name = "jinja2", specifier = ">=3.1.6" }, + { name = "pymupdf", specifier = ">=1.25.1" }, + { name = "python-multipart", specifier = ">=0.0.9" }, { name = "uvicorn", 
specifier = ">=0.38.0" }, ]