diff --git a/README.md b/README.md
index 4389217..f73bbd3 100644
--- a/README.md
+++ b/README.md
@@ -151,6 +151,29 @@ Each processed file creates a subdirectory with:
 - `_metadata.json` - Metadata (page info, token count, etc.)
 - Extracted images are saved directly in the output directory
 
+### Python API
+
+Use `BatchInputItem` when calling `InferenceManager.generate`, and omit `page_range` to process every page in a PDF:
+
+```python
+from chandra.input import load_pdf_images
+from chandra.model import BatchInputItem, InferenceManager
+
+manager = InferenceManager(method="hf")
+images = load_pdf_images("document.pdf")
+
+batch = [
+    BatchInputItem(
+        image=image,
+        prompt_type="ocr_layout",
+    )
+    for image in images
+]
+
+results = manager.generate(batch)
+print(results[0].markdown)
+```
+
 ### Streamlit Web App
 
 Launch the interactive demo for single-page processing:
diff --git a/chandra/input.py b/chandra/input.py
index 56829ab..3d6039a 100644
--- a/chandra/input.py
+++ b/chandra/input.py
@@ -26,7 +26,7 @@ def load_image(
 
 def load_pdf_images(
     filepath: str,
-    page_range: List[int],
+    page_range: List[int] | None = None,
     image_dpi: int = settings.IMAGE_DPI,
     min_pdf_image_dim: int = settings.MIN_PDF_IMAGE_DIM,
 ) -> List[Image.Image]:
diff --git a/tests/unit/test_input.py b/tests/unit/test_input.py
new file mode 100644
index 0000000..6baf720
--- /dev/null
+++ b/tests/unit/test_input.py
@@ -0,0 +1,89 @@
+"""Tests for ``chandra.input.load_pdf_images`` backed by fake pdfium objects."""
+
+from PIL import Image
+
+from chandra.input import load_pdf_images
+
+
+class FakeRenderedPage:
+    """Stand-in for a rendered page; only ``to_pil`` is implemented."""
+
+    def __init__(self, size):
+        self.size = size
+
+    def to_pil(self):
+        """Return a blank white RGB image of the stored size."""
+        blank = Image.new("RGB", self.size, "white")
+        return blank
+
+
+class FakePage:
+    """Stand-in for a PDF page with fixed dimensions."""
+
+    def __init__(self, width=200, height=300):
+        self.width, self.height = width, height
+
+    def get_width(self):
+        return self.width
+
+    def get_height(self):
+        return self.height
+
+    def render(self, scale):
+        """Return a fake render; ``scale`` is checked but does not resize it."""
+        assert scale > 0
+        dimensions = (self.width, self.height)
+        return FakeRenderedPage(dimensions)
+
+
+class FakePdfDocument:
+    """Two-page stand-in for ``pdfium.PdfDocument`` that records lifecycle calls."""
+
+    def __init__(self, filepath):
+        self.filepath = filepath
+        self.forms_initialized = False
+        self.closed = False
+        self.pages = [FakePage(200, 300), FakePage(300, 400)]
+
+    def init_forms(self):
+        self.forms_initialized = True
+
+    def close(self):
+        self.closed = True
+
+    def __len__(self):
+        return len(self.pages)
+
+    def __getitem__(self, index):
+        return self.pages[index]
+
+
+def _no_op_flatten(page):
+    """Replacement for ``chandra.input.flatten`` that does nothing."""
+    return None
+
+
+def _patch_pdf_backend(monkeypatch):
+    """Swap the document class and ``flatten`` in ``chandra.input`` for the fakes."""
+    monkeypatch.setattr("chandra.input.pdfium.PdfDocument", FakePdfDocument)
+    monkeypatch.setattr("chandra.input.flatten", _no_op_flatten)
+
+
+def test_load_pdf_images_processes_all_pages_when_page_range_omitted(monkeypatch):
+    """Omitting ``page_range`` returns one image per fake page, in order."""
+    _patch_pdf_backend(monkeypatch)
+
+    rendered = load_pdf_images("dummy.pdf")
+
+    assert len(rendered) == 2
+    assert [img.size for img in rendered] == [(200, 300), (300, 400)]
+
+
+def test_load_pdf_images_respects_page_range(monkeypatch):
+    """``page_range=[1]`` returns only the image for the second fake page."""
+    _patch_pdf_backend(monkeypatch)
+
+    rendered = load_pdf_images("dummy.pdf", page_range=[1])
+
+    assert len(rendered) == 1
+    assert rendered[0].size == (300, 400)