diff --git a/FULL_BENCHMARKS.md b/FULL_BENCHMARKS.md new file mode 100644 index 0000000..a220015 --- /dev/null +++ b/FULL_BENCHMARKS.md @@ -0,0 +1,105 @@ +# Full 90-Language Benchmark + +This is a comprehensive multilingual evaluation covering 90 languages, comparing Chandra 2 against Gemini 2.5 Flash. The average scores are lower than the [43-language benchmark](README.md#multilingual-benchmark-table) because this includes many lower-resource languages. + +## Overall Scores + +| | Chandra 2 | Gemini 2.5 Flash | +|---|:---:|:---:| +| **Average** | **72.7% +/- 1.2%** | **60.8% +/- 1.3%** | + +## Results by Language + +| Language | Chandra 2 | Gemini 2.5 Flash | +|----------|:--------:|:----------------:| +| af | 80.4% | 85.8% | +| am | 34.4% | 0.5% | +| ar | 68.4% | 84.4% | +| as | 35.8% | 23.1% | +| az | 75.2% | 74.0% | +| be | 80.7% | 66.4% | +| bg | 83.1% | 64.3% | +| bn | 72.8% | 55.3% | +| br | 90.0% | 69.4% | +| bs | 84.8% | 85.1% | +| ca | 85.1% | 88.0% | +| cs | 85.3% | 79.1% | +| cy | 82.2% | 77.6% | +| da | 91.1% | 86.0% | +| de | 94.8% | 88.3% | +| el | 85.6% | 83.5% | +| en | 96.6% | 90.3% | +| eo | 80.1% | 71.9% | +| es | 89.3% | 86.8% | +| et | 75.2% | 73.7% | +| eu | 80.2% | 74.6% | +| fa | 75.1% | 61.8% | +| fi | 83.4% | 86.0% | +| fr | 93.7% | 86.1% | +| fy | 81.2% | 70.1% | +| ga | 80.9% | 70.1% | +| gd | 71.8% | 59.5% | +| gl | 80.9% | 80.9% | +| gu | 70.8% | 47.6% | +| ha | 72.1% | 59.1% | +| he | 70.4% | 50.9% | +| hi | 78.4% | 82.7% | +| hr | 90.1% | 88.2% | +| hu | 82.1% | 84.5% | +| hy | 64.2% | 42.1% | +| id | 91.6% | 88.3% | +| is | 77.3% | 72.2% | +| it | 94.6% | 85.7% | +| ja | 86.9% | 80.0% | +| jv | 73.2% | 80.4% | +| ka | 77.0% | 39.3% | +| kk | 80.5% | 77.2% | +| km | 46.1% | 6.3% | +| kn | 63.2% | 24.5% | +| ko | 81.5% | 84.8% | +| ku | 62.0% | 63.2% | +| ky | 81.2% | 69.8% | +| la | 73.8% | 70.5% | +| lo | 60.9% | 13.3% | +| lt | 79.8% | 70.5% | +| lv | 76.9% | 81.5% | +| mg | 81.2% | 78.4% | +| mk | 83.5% | 77.4% | +| ml | 64.3% | 
23.8% | +| mn | 88.4% | 71.4% | +| mr | 75.0% | 69.7% | +| ms | 79.3% | 79.8% | +| my | 55.9% | 15.8% | +| ne | 45.3% | 43.0% | +| nl | 88.6% | 87.5% | +| no | 90.5% | 87.8% | +| or | 31.1% | 11.2% | +| pa | 48.3% | 22.4% | +| pl | 91.5% | 91.1% | +| ps | 12.6% | 13.3% | +| pt | 95.2% | 89.4% | +| ro | 84.5% | 76.7% | +| ru | 85.5% | 82.8% | +| sa | 51.1% | 44.6% | +| sd | 50.0% | 29.3% | +| si | 62.4% | 26.2% | +| sk | 77.3% | 81.2% | +| sl | 81.0% | 80.1% | +| so | 82.4% | 69.9% | +| sq | 75.3% | 77.1% | +| sr | 90.3% | 89.7% | +| su | 85.7% | 96.4% | +| sv | 93.3% | 91.1% | +| sw | 88.9% | 80.9% | +| ta | 77.7% | 53.9% | +| te | 58.6% | 33.3% | +| th | 62.6% | 66.7% | +| tr | 84.1% | 84.1% | +| ug | 25.8% | 5.4% | +| uk | 91.0% | 87.9% | +| ur | 44.1% | 57.6% | +| uz | 77.2% | 52.8% | +| vi | 82.6% | 89.5% | +| xh | 82.1% | 62.1% | +| yi | 24.9% | 6.8% | +| zh | 88.7% | 70.0% | diff --git a/README.md b/README.md index 72f7a07..31d88e8 100644 --- a/README.md +++ b/README.md @@ -11,107 +11,107 @@ Discord

-# Chandra +
-An OCR model for complex documents — handwriting, tables, math equations, and messy forms. +# Chandra OCR 2 - +Chandra OCR 2 is a state-of-the-art OCR model that converts images and PDFs into structured HTML/Markdown/JSON while preserving layout information. -## Benchmarks +## News -Overall scores on the [olmocr bench](https://github.com/allenai/olmocr): +- 3/2026 - Chandra 2 is here with significant improvements to math, tables, layout, and multilingual OCR +- 10/2025 - Chandra 1 launched - +## Features -## Hosted API +- Tops the external olmocr benchmark and significantly improves on internal multilingual benchmarks +- Convert documents to markdown, html, or json with detailed layout information +- Support for 90+ languages ([benchmark below](#multilingual-benchmark-table)) +- Excellent handwriting support +- Reconstructs forms accurately, including checkboxes +- Strong performance with tables, math, and complex layouts +- Extracts images and diagrams, and adds captions and structured data +- Two inference modes: local (HuggingFace) and remote (vLLM server) -A hosted API with additional accuracy improvements is available at [datalab.to](https://www.datalab.to/). Try the [free playground](https://www.datalab.to/playground) without installing. + -## Community +## Hosted API -Join [Discord](https://discord.gg//KuZwXNGnfH) to discuss development and get help. +- We have a hosted API for Chandra [here](https://www.datalab.to/), which is more accurate and faster. +- There is a free playground [here](https://www.datalab.to/playground) if you want to try Chandra without installing. 
-## Quick Start +## Quickstart + +The easiest way to start is with the CLI tools: ```shell pip install chandra-ocr -# Start vLLM server, then run OCR +# With vLLM (recommended, lightweight install) chandra_vllm chandra input.pdf ./output -# Or use HuggingFace locally +# With HuggingFace (requires torch) +pip install chandra-ocr[hf] chandra input.pdf ./output --method hf -# Interactive web app +# Interactive streamlit app +pip install chandra-ocr[app] chandra_app ``` -**Python:** - -```python -from chandra.model import InferenceManager -from chandra.input import load_pdf_images - -manager = InferenceManager(method="hf") -images = load_pdf_images("document.pdf") -results = manager.generate(images) -print(results[0].markdown) -``` - -## How it Works. - -- **Two inference modes**: Run locally via HuggingFace Transformers, or deploy a vLLM server for production throughput -- **Layout-aware output**: Every text block, table, and image comes with bounding box coordinates -- **Structured formats**: Output as Markdown, HTML, or JSON with full layout metadata -- **40+ languages** supported +## Benchmarks -## What It Handles +Multilingual performance was a focus for us with Chandra 2. There isn't a good public multilingual OCR benchmark, so we made our own. This tests tables, math, ordering, layout, and text accuracy. -**Handwriting** — Doctor notes, filled forms, homework. Chandra reads cursive and messy print that trips up traditional OCR. + -**Tables** — Preserves structure including merged cells (colspan/rowspan). Works on financial filings, invoices, and data tables. +See full scores [below](#multilingual-benchmark-table). We also have a [full 90-language benchmark](FULL_BENCHMARKS.md). -**Math** — Inline and block equations rendered as LaTeX. Handles textbooks, worksheets, and research papers. +We also benchmarked Chandra 2 with the widely accepted olmocr benchmark: -**Forms** — Reconstructs checkboxes, radio buttons, and form fields with their values. 
+ -**Complex Layouts** — Multi-column documents, newspapers, textbooks with figures and captions. +See full scores [below](#benchmark-table). ## Examples -| | | -|---|---| -|
**Handwriting** |
**Tables** | -|
**Math** |
**Newspapers** | - -
-More examples - -| Type | Name | Link | -|------|------|------| -| Tables | 10K Filing | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/tables/10k.png) | -| Forms | Lease Agreement | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/forms/lease.png) | -| Handwriting | Math Homework | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/handwriting/math_hw.png) | -| Books | Geography Textbook | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/books/geo_textbook_page.png) | -| Books | Exercise Problems | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/books/exercises.png) | -| Math | Attention Diagram | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/attn_all.png) | -| Math | Worksheet | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/worksheet.png) | -| Newspapers | LA Times | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/newspapers/la_times.png) | -| Other | Transcript | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/transcript.png) | -| Other | Flowchart | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/flowchart.png) | - -
+| Type | Name | Link | +|------|--------------------------|-------------------------------------------------------------------------------------------------------------| +| Math | CS229 Textbook | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/cs229.png) | +| Math | Handwritten Math | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/handwritten_math.png) | +| Math | Chinese Math | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/math/chinese_math.png) | +| Tables | Statistical Distribution | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/tables/complex_tables.png) | +| Tables | Financial Table | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/tables/financial_table.png) | +| Forms | Registration Form | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/forms/handwritten_form.png) | +| Forms | Lease Form | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/forms/lease_filled.png) | +| Handwriting | Cursive Writing | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/handwriting/cursive_writing.png) | +| Handwriting | Handwritten Notes | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/handwriting/handwritten_notes.png) | +| Languages | Arabic | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/arabic.png) | +| Languages | Japanese | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/japanese.png) | +| Languages | Hindi | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/hindi.png) | +| Languages | Russian | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/languages/russian.png) | +| Other | Charts | [View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/charts.png) | +| Other | Chemistry | 
[View](https://github.com/datalab-to/chandra/blob/master/assets/examples/other/chemistry.png) | ## Installation +### Package + ```bash +# Base install (for vLLM backend) pip install chandra-ocr + +# With HuggingFace backend (includes torch, transformers) +pip install chandra-ocr[hf] + +# With all extras +pip install chandra-ocr[all] ``` -For HuggingFace inference, we recommend installing [flash attention](https://github.com/Dao-AILab/flash-attention) for better performance. +If you're using the HuggingFace method, we also recommend installing [flash attention](https://github.com/Dao-AILab/flash-attention) for better performance. -**From source:** +### From Source ```bash git clone https://github.com/datalab-to/chandra.git @@ -124,69 +124,164 @@ source .venv/bin/activate ### CLI +Process single files or entire directories: + ```bash -# Single file with vLLM server +# Single file, with vllm server (see below for how to launch vllm) chandra input.pdf ./output --method vllm -# Directory with local model +# Process all files in a directory with local model chandra ./documents ./output --method hf ``` -**Options:** +**CLI Options:** - `--method [hf|vllm]`: Inference method (default: vllm) - `--page-range TEXT`: Page range for PDFs (e.g., "1-5,7,9-12") - `--max-output-tokens INTEGER`: Max tokens per page - `--max-workers INTEGER`: Parallel workers for vLLM - `--include-images/--no-images`: Extract and save images (default: include) - `--include-headers-footers/--no-headers-footers`: Include page headers/footers (default: exclude) -- `--batch-size INTEGER`: Pages per batch (default: 1) +- `--batch-size INTEGER`: Pages per batch (default: 28 for vllm, 1 for hf) -**Output structure:** +**Output Structure:** -``` -output/ -└── filename/ - ├── filename.md # Markdown - ├── filename.html # HTML with bounding boxes - ├── filename_metadata.json - └── images/ # Extracted images +Each processed file creates a subdirectory with: +- `.md` - Markdown output +- `.html` - HTML output +- 
`_metadata.json` - Metadata (page info, token count, etc.) +- Extracted images are saved directly in the output directory + +### Streamlit Web App + +Launch the interactive demo for single-page processing: + +```bash +chandra_app ``` -### vLLM Server +### vLLM Server (Optional) -For production or batch processing: +For production deployments or batch processing, use the vLLM server: ```bash chandra_vllm ``` -Launches a Docker container with optimized inference. Configure via environment: +This launches a Docker container with optimized inference settings. Configure via environment variables: - `VLLM_API_BASE`: Server URL (default: `http://localhost:8000/v1`) -- `VLLM_MODEL_NAME`: Model name (default: `chandra`) +- `VLLM_MODEL_NAME`: Model name for the server (default: `chandra`) - `VLLM_GPUS`: GPU device IDs (default: `0`) +You can also start your own vllm server with the `datalab-to/chandra-ocr-2` model. + ### Configuration -Settings via environment variables or `local.env`: +Settings can be configured via environment variables or a `local.env` file: ```bash -MODEL_CHECKPOINT=datalab-to/chandra -MAX_OUTPUT_TOKENS=8192 +# Model settings +MODEL_CHECKPOINT=datalab-to/chandra-ocr-2 +MAX_OUTPUT_TOKENS=12384 + +# vLLM settings VLLM_API_BASE=http://localhost:8000/v1 +VLLM_MODEL_NAME=chandra VLLM_GPUS=0 ``` -## Commercial Usage - -Code is Apache 2.0. Model weights use a modified OpenRAIL-M license: free for research, personal use, and startups under $2M funding/revenue. Cannot be used competitively with our API. For broader commercial licensing, see [pricing](https://www.datalab.to/pricing?utm_source=gh-chandra). - -## Credits +# Commercial usage + +This code is Apache 2.0, and our model weights use a modified OpenRAIL-M license (free for research, personal use, and startups under $2M funding/revenue, cannot be used competitively with our API). 
To remove the OpenRAIL license requirements, or for broader commercial licensing, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-chandra). + +# Benchmark table + +| **Model** | ArXiv | Old Scans Math | Tables | Old Scans | Headers and Footers | Multi column | Long tiny text | Base | Overall | Source | +|:--------------------------|:--------:|:--------------:|:--------:|:---------:|:-------------------:|:------------:|:--------------:|:----:|:--------------:|:------:| +| Datalab API | **90.4** | **90.2** | **90.7** | **54.6** | 91.6 | 83.7 | **92.3** | **99.9** | **86.7 ± 0.8** | Own benchmarks | +| Chandra 2 | 90.2 | 89.3 | 89.9 | 49.8 | 92.5 | 83.5 | 92.1 | 99.6 | 85.9 ± 0.8 | Own benchmarks | +| dots.ocr 1.5 | 85.9 | 85.5 | **90.7** | 48.2 | 94.0 | **85.3** | 81.6 | 99.7 | 83.9 | dots.ocr repo | +| Chandra 1 | 82.2 | 80.3 | 88.0 | 50.4 | 90.8 | 81.2 | **92.3** | **99.9** | 83.1 ± 0.9 | Own benchmarks | +| olmOCR 2 | 83.0 | 82.3 | 84.9 | 47.7 | **96.1** | 83.7 | 81.9 | 99.6 | 82.4 | olmocr repo | +| dots.ocr | 82.1 | 64.2 | 88.3 | 40.9 | 94.1 | 82.4 | 81.2 | 99.5 | 79.1 ± 1.0 | dots.ocr repo | +| olmOCR v0.3.0 | 78.6 | 79.9 | 72.9 | 43.9 | 95.1 | 77.3 | 81.2 | 98.9 | 78.5 ± 1.1 | olmocr repo | +| Datalab Marker v1.10.0 | 83.8 | 69.7 | 74.8 | 32.3 | 86.6 | 79.4 | 85.7 | 99.6 | 76.5 ± 1.0 | Own benchmarks | +| Deepseek OCR | 75.2 | 72.3 | 79.7 | 33.3 | **96.1** | 66.7 | 80.1 | 99.7 | 75.4 ± 1.0 | Own benchmarks | +| Mistral OCR API | 77.2 | 67.5 | 60.6 | 29.3 | 93.6 | 71.3 | 77.1 | 99.4 | 72.0 ± 1.1 | olmocr repo | +| GPT-4o (Anchored) | 53.5 | 74.5 | 70.0 | 40.7 | 93.8 | 69.3 | 60.6 | 96.8 | 69.9 ± 1.1 | olmocr repo | +| Qwen 3 VL 8B | 70.2 | 75.1 | 45.6 | 37.5 | 89.1 | 62.1 | 43.0 | 94.3 | 64.6 ± 1.1 | Own benchmarks | +| Gemini Flash 2 (Anchored) | 54.5 | 56.1 | 72.1 | 34.2 | 64.7 | 61.5 | 71.5 | 95.6 | 63.8 ± 1.2 | olmocr repo | + + +# Multilingual benchmark table + +The table below covers the 43 most common languages, benchmarked 
across multiple models. For a comprehensive evaluation across 90 languages (Chandra 2 vs Gemini 2.5 Flash only), see the [full 90-language benchmark](#full-90-language-benchmark-table). + +| Language | Datalab API | Chandra 2 | Chandra 1 | Gemini 2.5 Flash | GPT-5 Mini | +|---|:---:|:---:|:---:|:---:|:---:| +| ar | 67.6% | 68.4% | 34.0% | 84.4% | 55.6% | +| bn | 85.1% | 72.8% | 45.6% | 55.3% | 23.3% | +| ca | 88.7% | 85.1% | 84.2% | 88.0% | 78.5% | +| cs | 88.2% | 85.3% | 84.7% | 79.1% | 78.8% | +| da | 90.1% | 91.1% | 88.4% | 86.0% | 87.7% | +| de | 93.8% | 94.8% | 83.0% | 88.3% | 93.8% | +| el | 89.9% | 85.6% | 85.5% | 83.5% | 82.4% | +| es | 91.8% | 89.3% | 88.7% | 86.8% | 97.1% | +| fa | 82.2% | 75.1% | 69.6% | 61.8% | 56.4% | +| fi | 85.7% | 83.4% | 78.4% | 86.0% | 84.7% | +| fr | 93.3% | 93.7% | 89.6% | 86.1% | 91.1% | +| gu | 73.8% | 70.8% | 44.6% | 47.6% | 11.5% | +| he | 76.4% | 70.4% | 38.9% | 50.9% | 22.3% | +| hi | 80.5% | 78.4% | 70.2% | 82.7% | 41.0% | +| hr | 93.4% | 90.1% | 85.9% | 88.2% | 81.3% | +| hu | 88.1% | 82.1% | 82.5% | 84.5% | 84.8% | +| id | 91.3% | 91.6% | 86.7% | 88.3% | 89.7% | +| it | 94.4% | 94.1% | 89.1% | 85.7% | 91.6% | +| ja | 87.3% | 86.9% | 85.4% | 80.0% | 76.1% | +| jv | 87.5% | 73.2% | 85.1% | 80.4% | 69.6% | +| kn | 70.0% | 63.2% | 20.6% | 24.5% | 10.1% | +| ko | 89.1% | 81.5% | 82.3% | 84.8% | 78.4% | +| la | 78.0% | 73.8% | 55.9% | 70.5% | 54.6% | +| ml | 72.4% | 64.3% | 18.1% | 23.8% | 11.9% | +| mr | 80.8% | 75.0% | 57.0% | 69.7% | 20.9% | +| nl | 90.0% | 88.6% | 85.3% | 87.5% | 83.8% | +| no | 89.2% | 90.3% | 85.5% | 87.8% | 87.4% | +| pl | 93.8% | 91.5% | 83.9% | 89.7% | 90.4% | +| pt | 97.0% | 95.2% | 84.3% | 89.4% | 90.8% | +| ro | 86.2% | 84.5% | 82.1% | 76.1% | 77.3% | +| ru | 88.8% | 85.5% | 88.7% | 82.8% | 72.2% | +| sa | 57.5% | 51.1% | 33.6% | 44.6% | 12.5% | +| sr | 95.3% | 90.3% | 82.3% | 89.7% | 83.0% | +| sv | 91.9% | 92.8% | 82.1% | 91.1% | 92.1% | +| ta | 82.9% | 77.7% | 50.8% | 53.9% | 8.1% | +| te | 
69.4% | 58.6% | 19.5% | 33.3% | 9.9% | +| th | 71.6% | 62.6% | 47.0% | 66.7% | 53.8% | +| tr | 88.9% | 84.1% | 68.1% | 84.1% | 78.2% | +| uk | 93.1% | 91.0% | 88.5% | 87.9% | 81.9% | +| ur | 54.1% | 43.2% | 28.1% | 57.6% | 16.9% | +| vi | 85.0% | 80.4% | 81.6% | 89.5% | 83.6% | +| zh | 87.8% | 88.7% | 88.3% | 70.0% | 70.4% | +| **Average** | **80.4%** | **77.8%** | **69.4%** | **67.6%** | **60.5%** | + +# Full 90-language benchmark table + +We also have a more comprehensive evaluation covering 90 languages, comparing Chandra 2 against Gemini 2.5 Flash. The average scores are lower than the 43-language table above because this includes many lower-resource languages. Chandra 2 averages **72.7%** vs Gemini 2.5 Flash at **60.8%**. + +See the [full 90-language results](FULL_BENCHMARKS.md). + +## Throughput + +Benchmarked with vLLM on a single NVIDIA H100 80GB GPU using a diverse mix of documents (math, tables, scans, multi-column layouts) from the olmOCR benchmark set. This set is significantly slower than real-world usage - we estimate 2 pages/s in real-world usage. 
+ +| Configuration | Pages/sec | Avg Latency | P95 Latency | Failure Rate | +|---|:---:|:---:|:---:|:---:| +| vLLM, 96 concurrent sequences | 1.44 | 60s | 156s | 0% | + +# Credits + +Thank you to the following open source projects: - [Huggingface Transformers](https://github.com/huggingface/transformers) -- [vLLM](https://github.com/vllm-project/vllm) +- [VLLM](https://github.com/vllm-project/vllm) - [olmocr](https://github.com/allenai/olmocr) -- [Qwen3 VL](https://github.com/QwenLM/Qwen3) - -## Support Datalab -If you find this repository helpful, please consider giving it a star ⭐ +- [Qwen 3.5](https://github.com/QwenLM/Qwen3) \ No newline at end of file diff --git a/assets/benchmarks/bench.png b/assets/benchmarks/bench.png index a3dd954..8df0778 100644 Binary files a/assets/benchmarks/bench.png and b/assets/benchmarks/bench.png differ diff --git a/assets/benchmarks/bench_top.png b/assets/benchmarks/bench_top.png new file mode 100644 index 0000000..5cf65b4 Binary files /dev/null and b/assets/benchmarks/bench_top.png differ diff --git a/assets/benchmarks/multilingual.png b/assets/benchmarks/multilingual.png new file mode 100644 index 0000000..91ed4a0 Binary files /dev/null and b/assets/benchmarks/multilingual.png differ diff --git a/assets/examples/books/exercises.png b/assets/examples/books/exercises.png deleted file mode 100644 index 32adc11..0000000 Binary files a/assets/examples/books/exercises.png and /dev/null differ diff --git a/assets/examples/books/geo_textbook_page.png b/assets/examples/books/geo_textbook_page.png deleted file mode 100644 index eacc107..0000000 Binary files a/assets/examples/books/geo_textbook_page.png and /dev/null differ diff --git a/assets/examples/forms/handwritten_form.png b/assets/examples/forms/handwritten_form.png index 0f5b709..9caf252 100644 Binary files a/assets/examples/forms/handwritten_form.png and b/assets/examples/forms/handwritten_form.png differ diff --git a/assets/examples/forms/lease.png 
b/assets/examples/forms/lease.png deleted file mode 100644 index 1761d55..0000000 Binary files a/assets/examples/forms/lease.png and /dev/null differ diff --git a/assets/examples/forms/lease_filled.png b/assets/examples/forms/lease_filled.png new file mode 100644 index 0000000..24b3569 Binary files /dev/null and b/assets/examples/forms/lease_filled.png differ diff --git a/assets/examples/handwriting/cursive_writing.png b/assets/examples/handwriting/cursive_writing.png new file mode 100644 index 0000000..e358c96 Binary files /dev/null and b/assets/examples/handwriting/cursive_writing.png differ diff --git a/assets/examples/handwriting/doctor_note.png b/assets/examples/handwriting/doctor_note.png deleted file mode 100644 index 5ce581a..0000000 Binary files a/assets/examples/handwriting/doctor_note.png and /dev/null differ diff --git a/assets/examples/handwriting/handwritten_notes.png b/assets/examples/handwriting/handwritten_notes.png new file mode 100644 index 0000000..08c3a56 Binary files /dev/null and b/assets/examples/handwriting/handwritten_notes.png differ diff --git a/assets/examples/handwriting/math_hw.png b/assets/examples/handwriting/math_hw.png deleted file mode 100644 index e48e953..0000000 Binary files a/assets/examples/handwriting/math_hw.png and /dev/null differ diff --git a/assets/examples/languages/arabic.png b/assets/examples/languages/arabic.png new file mode 100644 index 0000000..bde3c65 Binary files /dev/null and b/assets/examples/languages/arabic.png differ diff --git a/assets/examples/languages/hindi.png b/assets/examples/languages/hindi.png new file mode 100644 index 0000000..2e2ae94 Binary files /dev/null and b/assets/examples/languages/hindi.png differ diff --git a/assets/examples/languages/japanese.png b/assets/examples/languages/japanese.png new file mode 100644 index 0000000..0461423 Binary files /dev/null and b/assets/examples/languages/japanese.png differ diff --git a/assets/examples/languages/russian.png 
b/assets/examples/languages/russian.png new file mode 100644 index 0000000..ad817d2 Binary files /dev/null and b/assets/examples/languages/russian.png differ diff --git a/assets/examples/math/attn_all.png b/assets/examples/math/attn_all.png deleted file mode 100644 index 91dee65..0000000 Binary files a/assets/examples/math/attn_all.png and /dev/null differ diff --git a/assets/examples/math/chinese_math.png b/assets/examples/math/chinese_math.png new file mode 100644 index 0000000..e6cb2bb Binary files /dev/null and b/assets/examples/math/chinese_math.png differ diff --git a/assets/examples/math/cs229.png b/assets/examples/math/cs229.png new file mode 100644 index 0000000..d505532 Binary files /dev/null and b/assets/examples/math/cs229.png differ diff --git a/assets/examples/math/ega.png b/assets/examples/math/ega.png deleted file mode 100644 index de02a3a..0000000 Binary files a/assets/examples/math/ega.png and /dev/null differ diff --git a/assets/examples/math/handwritten_math.png b/assets/examples/math/handwritten_math.png new file mode 100644 index 0000000..baafa06 Binary files /dev/null and b/assets/examples/math/handwritten_math.png differ diff --git a/assets/examples/math/worksheet.png b/assets/examples/math/worksheet.png deleted file mode 100644 index ccc1f5f..0000000 Binary files a/assets/examples/math/worksheet.png and /dev/null differ diff --git a/assets/examples/newspapers/la_times.png b/assets/examples/newspapers/la_times.png deleted file mode 100644 index 0c91d2d..0000000 Binary files a/assets/examples/newspapers/la_times.png and /dev/null differ diff --git a/assets/examples/newspapers/nyt.png b/assets/examples/newspapers/nyt.png deleted file mode 100644 index e83149d..0000000 Binary files a/assets/examples/newspapers/nyt.png and /dev/null differ diff --git a/assets/examples/other/charts.png b/assets/examples/other/charts.png new file mode 100644 index 0000000..bb2a0d3 Binary files /dev/null and b/assets/examples/other/charts.png differ diff --git 
a/assets/examples/other/chemistry.png b/assets/examples/other/chemistry.png new file mode 100644 index 0000000..36d3041 Binary files /dev/null and b/assets/examples/other/chemistry.png differ diff --git a/assets/examples/other/flowchart.png b/assets/examples/other/flowchart.png deleted file mode 100644 index 4417937..0000000 Binary files a/assets/examples/other/flowchart.png and /dev/null differ diff --git a/assets/examples/other/transcript.png b/assets/examples/other/transcript.png deleted file mode 100644 index e1a6ee8..0000000 Binary files a/assets/examples/other/transcript.png and /dev/null differ diff --git a/assets/examples/tables/10k.png b/assets/examples/tables/10k.png deleted file mode 100644 index 69a7bea..0000000 Binary files a/assets/examples/tables/10k.png and /dev/null differ diff --git a/assets/examples/tables/complex_tables.png b/assets/examples/tables/complex_tables.png new file mode 100644 index 0000000..48cce15 Binary files /dev/null and b/assets/examples/tables/complex_tables.png differ diff --git a/assets/examples/tables/financial_table.png b/assets/examples/tables/financial_table.png new file mode 100644 index 0000000..4dbed07 Binary files /dev/null and b/assets/examples/tables/financial_table.png differ diff --git a/assets/examples/tables/water_damage.png b/assets/examples/tables/water_damage.png deleted file mode 100644 index a2cb26f..0000000 Binary files a/assets/examples/tables/water_damage.png and /dev/null differ diff --git a/chandra/model/hf.py b/chandra/model/hf.py index 6be4e47..3f4ebf0 100644 --- a/chandra/model/hf.py +++ b/chandra/model/hf.py @@ -13,29 +13,21 @@ def generate_hf( bbox_scale: int = settings.BBOX_SCALE, **kwargs, ) -> List[GenerationResult]: - from qwen_vl_utils import process_vision_info - if max_output_tokens is None: max_output_tokens = settings.MAX_OUTPUT_TOKENS - messages = [ - process_batch_element(item, model.processor, bbox_scale) for item in batch - ] - text = model.processor.apply_chat_template( - messages, 
tokenize=False, add_generation_prompt=True - ) + conversations = [[process_batch_element(item)] for item in batch] - image_inputs, _ = process_vision_info(messages) - inputs = model.processor( - text=text, - images=image_inputs, - padding=True, + inputs = model.processor.apply_chat_template( + conversations, + tokenize=True, + add_generation_prompt=True, + return_dict=True, return_tensors="pt", - padding_side="left", + padding=True, ) - inputs = inputs.to("cuda") + inputs = inputs.to(model.device) - # Inference: Generation of the output generated_ids = model.generate(**inputs, max_new_tokens=max_output_tokens) generated_ids_trimmed = [ out_ids[len(in_ids) :] @@ -53,25 +45,29 @@ def generate_hf( return results -def process_batch_element(item: BatchInputItem, processor, bbox_scale: int): +def process_batch_element(item: BatchInputItem): prompt = item.prompt prompt_type = item.prompt_type if not prompt: - prompt = PROMPT_MAPPING[prompt_type].replace("{bbox_scale}", str(bbox_scale)) + prompt = PROMPT_MAPPING[prompt_type] content = [] image = scale_to_fit(item.image) # Guarantee max size content.append({"type": "image", "image": image}) - content.append({"type": "text", "text": prompt}) - message = {"role": "user", "content": content} - return message + return {"role": "user", "content": content} def load_model(): - import torch - from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor + try: + import torch + from transformers import AutoModelForImageTextToText, AutoProcessor + except ImportError: + raise ImportError( + "HuggingFace backend requires additional dependencies. 
" + "Install with: pip install chandra-ocr[hf]" + ) device_map = "auto" if settings.TORCH_DEVICE: @@ -84,10 +80,11 @@ def load_model(): if settings.TORCH_ATTN: kwargs["attn_implementation"] = settings.TORCH_ATTN - model = Qwen3VLForConditionalGeneration.from_pretrained( + model = AutoModelForImageTextToText.from_pretrained( settings.MODEL_CHECKPOINT, **kwargs ) model = model.eval() - processor = Qwen3VLProcessor.from_pretrained(settings.MODEL_CHECKPOINT) + processor = AutoProcessor.from_pretrained(settings.MODEL_CHECKPOINT) + processor.tokenizer.padding_side = "left" model.processor = processor return model diff --git a/chandra/model/util.py b/chandra/model/util.py index 7cef96f..f7e97a3 100644 --- a/chandra/model/util.py +++ b/chandra/model/util.py @@ -1,42 +1,65 @@ -import math from typing import Tuple from PIL import Image -from chandra.output import parse_markdown - def scale_to_fit( img: Image.Image, max_size: Tuple[int, int] = (3072, 2048), - min_size: Tuple[int, int] = (28, 28), + min_size: Tuple[int, int] = (1792, 28), + grid_size: int = 28, ): resample_method = Image.Resampling.LANCZOS width, height = img.size # Check for empty or invalid image - if width == 0 or height == 0: + if width <= 0 or height <= 0: return img - max_width, max_height = max_size - min_width, min_height = min_size - + original_ar = width / height current_pixels = width * height - max_pixels = max_width * max_height - min_pixels = min_width * min_height + max_pixels = max_size[0] * max_size[1] + min_pixels = min_size[0] * min_size[1] + # 1. Determine ideal float scale based on pixel bounds + scale = 1.0 if current_pixels > max_pixels: - scale_factor = (max_pixels / current_pixels) ** 0.5 - - new_width = math.floor(width * scale_factor) - new_height = math.floor(height * scale_factor) + scale = (max_pixels / current_pixels) ** 0.5 elif current_pixels < min_pixels: - scale_factor = (min_pixels / current_pixels) ** 0.5 + scale = (min_pixels / current_pixels) ** 0.5 + + # 2. 
Convert dimensions to integer "grid blocks" + w_blocks = max(1, round((width * scale) / grid_size)) + h_blocks = max(1, round((height * scale) / grid_size)) + + # 3. Refinement Loop: Ensure we are under the max limit + while (w_blocks * h_blocks * grid_size * grid_size) > max_pixels: + if w_blocks == 1 and h_blocks == 1: + break + + if w_blocks == 1: + h_blocks -= 1 + continue + if h_blocks == 1: + w_blocks -= 1 + continue + + # Compare distortion: Which move preserves Aspect Ratio better? + ar_w_loss = abs(((w_blocks - 1) / h_blocks) - original_ar) + ar_h_loss = abs((w_blocks / (h_blocks - 1)) - original_ar) - new_width = math.ceil(width * scale_factor) - new_height = math.ceil(height * scale_factor) - else: + if ar_w_loss < ar_h_loss: + w_blocks -= 1 + else: + h_blocks -= 1 + + # 4. Calculate final pixel dimensions + new_width = w_blocks * grid_size + new_height = h_blocks * grid_size + + # Return original if no changes were needed + if (new_width, new_height) == (width, height): return img return img.resize((new_width, new_height), resample=resample_method) @@ -49,12 +72,6 @@ def detect_repeat_token( cut_from_end: int = 0, scaling_factor: float = 3.0, ): - try: - predicted_tokens = parse_markdown(predicted_tokens) - except Exception as e: - print(f"Error parsing markdown: {e}") - return True - if cut_from_end > 0: predicted_tokens = predicted_tokens[:-cut_from_end] diff --git a/chandra/model/vllm.py b/chandra/model/vllm.py index fce2ed6..94ad975 100644 --- a/chandra/model/vllm.py +++ b/chandra/model/vllm.py @@ -30,6 +30,8 @@ def generate_vllm( max_failure_retries: int | None = None, bbox_scale: int = settings.BBOX_SCALE, vllm_api_base: str = settings.VLLM_API_BASE, + temperature: float = 0.0, + top_p: float = 0.1, ) -> List[GenerationResult]: client = OpenAI( api_key=settings.VLLM_API_KEY, @@ -51,14 +53,10 @@ def generate_vllm( models = client.models.list() model_name = models.data[0].id - def _generate( - item: BatchInputItem, temperature: float = 0, top_p: 
float = 0.1 - ) -> GenerationResult: + def _generate(item: BatchInputItem, temperature, top_p) -> GenerationResult: prompt = item.prompt if not prompt: - prompt = PROMPT_MAPPING[item.prompt_type].replace( - "{bbox_scale}", str(bbox_scale) - ) + prompt = PROMPT_MAPPING[item.prompt_type] content = [] image = scale_to_fit(item.image) @@ -93,11 +91,12 @@ def _generate( return result def process_item(item, max_retries, max_failure_retries=None): - result = _generate(item) + result = _generate(item, temperature=temperature, top_p=top_p) retries = 0 while _should_retry(result, retries, max_retries, max_failure_retries): - result = _generate(item, temperature=0.3, top_p=0.95) + retry_temperature = min(temperature + 0.2 * (retries + 1), 0.8) + result = _generate(item, temperature=retry_temperature, top_p=0.95) retries += 1 return result diff --git a/chandra/output.py b/chandra/output.py index a8a8b18..030a4fd 100644 --- a/chandra/output.py +++ b/chandra/output.py @@ -1,5 +1,4 @@ import hashlib -import json import re from dataclasses import dataclass, asdict from functools import lru_cache @@ -28,7 +27,7 @@ def extract_images(html: str, chunks: dict, image: Image.Image): for idx, chunk in enumerate(chunks): div_idx += 1 if chunk["label"] in ["Image", "Figure"]: - img = chunk["content"].find("img") + img = BeautifulSoup(chunk["content"], "html.parser").find("img") if not img: continue bbox = chunk["bbox"] @@ -54,6 +53,9 @@ def parse_html( div_idx += 1 label = div.get("data-label") + if label == "Blank-Page": + continue + # Skip headers and footers if not included if label and not include_headers_footers: if label in ["Page-Header", "Page-Footer"]: @@ -74,6 +76,12 @@ def parse_html( img = BeautifulSoup(f"", "html.parser") div.append(img) + # Strip img tags without src in non-image blocks (model hallucinations) + if label not in ["Image", "Figure"]: + for img_tag in div.find_all("img"): + if not img_tag.get("src"): + img_tag.decompose() + # Wrap text content in

tags if no inner HTML tags exist if label in ["Text"] and not re.search( "<.+>", str(div.decode_contents()).strip() @@ -196,19 +204,20 @@ def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE): height_scaler = height / bbox_scale layout_blocks = [] for div in top_level_divs: + label = div.get("data-label") + if label == "Blank-Page": + continue + bbox = div.get("data-bbox") try: - bbox = json.loads(bbox) + bbox = bbox.split(" ") + bbox = list(map(int, bbox)) assert len(bbox) == 4, "Invalid bbox length" except Exception: - try: - bbox = bbox.split(" ") - assert len(bbox) == 4, "Invalid bbox length" - except Exception: - bbox = [0, 0, 1, 1] + print(f"Invalid bbox format: {bbox}, defaulting to full image") + bbox = [0, 0, 1, 1] - bbox = list(map(int, bbox)) # Normalize bbox bbox = [ max(0, int(bbox[0] * width_scaler)), @@ -216,8 +225,16 @@ def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE): min(int(bbox[2] * width_scaler), width), min(int(bbox[3] * height_scaler), height), ] - label = div.get("data-label", "block") + if not label: + label = "block" content = str(div.decode_contents()) + + # Strip nested data-bbox attributes (not needed in open source) + content_soup = BeautifulSoup(content, "html.parser") + for tag in content_soup.find_all(attrs={"data-bbox": True}): + del tag["data-bbox"] + content = str(content_soup) + layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content)) return layout_blocks diff --git a/chandra/prompts.py b/chandra/prompts.py index 954897e..5b19946 100644 --- a/chandra/prompts.py +++ b/chandra/prompts.py @@ -34,6 +34,7 @@ "thead", "big", "code", + "chem", ] ALLOWED_ATTRIBUTES = [ "class", @@ -48,6 +49,8 @@ "href", "alt", "align", + "data-bbox", + "data-label", ] PROMPT_ENDING = f""" @@ -57,15 +60,17 @@ * Inline math: Surround math with ... tags. Math expressions should be rendered in KaTeX-compatible LaTeX. Use display for block math. 
* Tables: Use colspan and rowspan attributes to match table structure. * Formatting: Maintain consistent formatting with the image, including spacing, indentation, subscripts/superscripts, and special characters. -* Images: Include a description of any images in the alt attribute of an tag. Do not fill out the src property. +* Images: Include a description of any images in the alt attribute of an tag. Do not fill out the src property. Describe in detail inside the div tag. Also convert charts to high fidelity data, and convert diagrams to mermaid. * Forms: Mark checkboxes and radio buttons properly. * Text: join lines together properly into paragraphs using

...

tags. Use
tags for line breaks within paragraphs, but only when absolutely necessary to maintain meaning. +* Chemistry: Use ... tags for chemical formulas with reactive SMILES. +* Lists: Preserve indents and proper list markers. * Use the simplest possible HTML structure that accurately represents the content of the block. * Make sure the text is accurate and easy for a human to read and interpret. Reading order should be correct and natural. """.strip() OCR_LAYOUT_PROMPT = f""" -OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in [x0, y0, x1, y1] format. Bboxes are normalized 0-{{bbox_scale}}. The data-label attribute is the label for the block. +OCR this image to HTML, arranged as layout blocks. Each layout block should be a div with the data-bbox attribute representing the bounding box of the block in x0 y0 x1 y1 format. Bboxes are normalized 0-1000. The data-label attribute is the label for the block. 
Use the following labels: - Caption @@ -83,6 +88,10 @@ - Form - Table-Of-Contents - Figure +- Chemical-Block +- Diagram +- Bibliography +- Blank-Page {PROMPT_ENDING} """.strip() @@ -97,3 +106,9 @@ "ocr_layout": OCR_LAYOUT_PROMPT, "ocr": OCR_PROMPT, } + +if __name__ == "__main__": + print("OCR Layout Prompt:") + print(OCR_LAYOUT_PROMPT) + print("\nOCR Prompt:") + print(OCR_PROMPT) diff --git a/chandra/scripts/vllm.py b/chandra/scripts/vllm.py index 26ea073..5c602c1 100644 --- a/chandra/scripts/vllm.py +++ b/chandra/scripts/vllm.py @@ -1,11 +1,66 @@ +import argparse +import json +import math import os import subprocess import sys from chandra.settings import settings +# H100 80GB is the baseline for scaling +BASELINE_VRAM_GB = 80 +BASELINE_MAX_BATCHED_TOKENS = 8192 +BASELINE_MAX_NUM_SEQS = 64 + +GPU_VRAM_GB = { + "h100": 80, + "a100-80": 80, + "a100": 40, + "a100-40": 40, + "l40s": 48, + "a10": 24, + "l4": 24, + "4090": 24, + "3090": 24, + "t4": 16, +} + + +def get_gpu_settings(gpu: str): + vram = GPU_VRAM_GB.get(gpu) + if vram is None: + available = ", ".join(sorted(GPU_VRAM_GB.keys())) + print(f"Unknown GPU '{gpu}'. 
Available: {available}") + sys.exit(1) + + ratio = vram / BASELINE_VRAM_GB + # Scale and round down to nearest power of 2 for batched tokens + raw_tokens = BASELINE_MAX_BATCHED_TOKENS * ratio + max_batched_tokens = max(1024, 2 ** math.floor(math.log2(raw_tokens))) + # Scale and round down to nearest multiple of 8 for seqs + max_num_seqs = max(8, (int(BASELINE_MAX_NUM_SEQS * ratio) // 8) * 8) + + return max_batched_tokens, max_num_seqs + def main(): + parser = argparse.ArgumentParser(description="Launch vLLM server for Chandra") + parser.add_argument( + "--gpu", + default="h100", + choices=sorted(GPU_VRAM_GB.keys()), + help="GPU type for optimal settings (default: h100)", + ) + parser.add_argument( + "--mtp", + action="store_true", + default=False, + help="Enable MTP speculative decoding (disabled by default, unstable with vLLM)", + ) + args = parser.parse_args() + + max_batched_tokens, max_num_seqs = get_gpu_settings(args.gpu) + cmd = [ "sudo", "docker", @@ -16,33 +71,41 @@ def main(): f"device={settings.VLLM_GPUS}", "-v", f"{os.path.expanduser('~')}/.cache/huggingface:/root/.cache/huggingface", - "--env", - "VLLM_ATTENTION_BACKEND=TORCH_SDPA", "-p", "8000:8000", "--ipc=host", - "vllm/vllm-openai:latest", + "vllm/vllm-openai:v0.17.0", "--model", settings.MODEL_CHECKPOINT, "--no-enforce-eager", "--max-num-seqs", - "32", + str(max_num_seqs), "--dtype", "bfloat16", "--max-model-len", - "32768", + "18000", "--max_num_batched_tokens", - "65536", + str(max_batched_tokens), "--gpu-memory-utilization", - ".9", + ".85", + "--enable-prefix-caching", + "--mm-processor-kwargs", + json.dumps({"min_pixels": 3136, "max_pixels": 6291456}), "--served-model-name", settings.VLLM_MODEL_NAME, ] - print(f"Starting vLLM server with command: {' '.join(cmd)}") + if args.mtp: + spec_config = json.dumps({"method": "mtp", "num_speculative_tokens": 1}) + cmd.extend(["--speculative-config", spec_config]) + + vram = GPU_VRAM_GB[args.gpu] + print(f"GPU: {args.gpu} ({vram}GB VRAM)") + 
print(f"max-num-batched-tokens: {max_batched_tokens}, max-num-seqs: {max_num_seqs}") + print(f"MTP: {'enabled' if args.mtp else 'disabled'}") + print(f"Command: {' '.join(cmd)}") try: - # Use subprocess.run() which blocks and streams output automatically subprocess.run(cmd, check=True) except KeyboardInterrupt: print("\nShutting down vLLM server...") diff --git a/chandra/settings.py b/chandra/settings.py index ceb06ba..6d0f8a0 100644 --- a/chandra/settings.py +++ b/chandra/settings.py @@ -9,11 +9,11 @@ class Settings(BaseSettings): IMAGE_DPI: int = 192 MIN_PDF_IMAGE_DIM: int = 1024 MIN_IMAGE_DIM: int = 1536 - MODEL_CHECKPOINT: str = "datalab-to/chandra" + MODEL_CHECKPOINT: str = "datalab-to/chandra-ocr-2" TORCH_DEVICE: str | None = None MAX_OUTPUT_TOKENS: int = 12384 TORCH_ATTN: str | None = None - BBOX_SCALE: int = 1024 + BBOX_SCALE: int = 1000 # vLLM server settings VLLM_API_KEY: str = "EMPTY" diff --git a/pyproject.toml b/pyproject.toml index 6be56bf..9ad2c0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "chandra-ocr" -version = "0.1.9" +version = "0.2.0" description = "OCR model that converts documents to markdown, HTML, or JSON." readme = "README.md" requires-python = ">=3.10" @@ -13,7 +13,6 @@ dependencies = [ "beautifulsoup4>=4.14.2", "click>=8.0.0", "filetype>=1.2.0", - "flask>=3.0.0", "markdownify==1.1.0", "openai>=2.2.0", "pillow>=10.2.0", @@ -21,13 +20,22 @@ dependencies = [ "pydantic-settings>=2.11.0", "pypdfium2>=4.30.0", "python-dotenv>=1.1.1", - "qwen-vl-utils>=0.0.14", + "six>=1.17.0", +] + +[project.optional-dependencies] +hf = [ "torch>=2.8.0", "torchvision>=0.23.0", - "transformers>=4.57.1", - "streamlit>=1.50.0", + "transformers>=5.2.0", "accelerate>=1.11.0", ] +app = [ + "streamlit>=1.50.0", +] +all = [ + "chandra-ocr[hf,app]", +] [build-system] requires = ["setuptools>=61"] # or "setuptools>=61", "flit-core", etc. 
@@ -44,6 +52,8 @@ include = ["chandra*"] [dependency-groups] dev = [ + "chandra-ocr[all]", + "flask>=3.0.0", "pre-commit>=4.3.0", "pytest>=8.4.2", ] diff --git a/uv.lock b/uv.lock index e68162f..3fbe1fd 100644 --- a/uv.lock +++ b/uv.lock @@ -47,6 +47,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl", hash = "sha256:91a310b926508d560fe0148d02a194f38b824122641ef528113d029fcd129f8c", size = 731200 }, ] +[[package]] +name = "annotated-doc" +version = "0.0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303 }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -80,56 +89,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615 }, ] -[[package]] -name = "av" -version = "15.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e9/c3/83e6e73d1592bc54436eae0bc61704ae0cff0c3cfbde7b58af9ed67ebb49/av-15.1.0.tar.gz", hash = "sha256:39cda2dc810e11c1938f8cb5759c41d6b630550236b3365790e67a313660ec85", size = 3774192 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/6a/91e3e68ae0d1b53b480ec69a96f2ae820fb007bc60e6b821741f31c7ba4e/av-15.1.0-cp310-cp310-macosx_13_0_arm64.whl", hash 
= "sha256:cf067b66cee2248220b29df33b60eb4840d9e7b9b75545d6b922f9c41d88c4ee", size = 21781685 }, - { url = "https://files.pythonhosted.org/packages/bc/6d/afa951b9cb615c3bc6d95c4eed280c6cefb52c006f4e15e79043626fab39/av-15.1.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:26426163d96fc3bde9a015ba4d60da09ef848d9284fe79b4ca5e60965a008fc5", size = 26962481 }, - { url = "https://files.pythonhosted.org/packages/3c/42/0c384884235c42c439cef28cbd129e4624ad60229119bf3c6c6020805119/av-15.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:92f524541ce74b8a12491d8934164a5c57e983da24826547c212f60123de400b", size = 37571839 }, - { url = "https://files.pythonhosted.org/packages/25/c0/5c967b0872fce1add80a8f50fa7ce11e3e3e5257c2b079263570bc854699/av-15.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:659f9d6145fb2c58e8b31907283b6ba876570f5dd6e7e890d74c09614c436c8e", size = 39070227 }, - { url = "https://files.pythonhosted.org/packages/e2/81/e333056d49363c35a74b828ed5f87c96dfbcc1a506b49d79a31ac773b94d/av-15.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:07a8ae30c0cfc3132eff320a6b27d18a5e0dda36effd0ae28892888f4ee14729", size = 39619362 }, - { url = "https://files.pythonhosted.org/packages/d5/ae/50cc2af1bf68452cbfec8d1b2554c18f6d167c8ba6d7ad7707797dfd1541/av-15.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e33a76e38f03bb5de026b9f66ccf23dc01ddd2223221096992cb52ac22e62538", size = 40371627 }, - { url = "https://files.pythonhosted.org/packages/50/e6/381edf1779106dd31c9ef1ac9842f643af4465b8a87cbc278d3eaa76229a/av-15.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa4bf12bdce20edc2a3b13a2776c474c5ab63e1817d53793714504476eeba82e", size = 31340369 }, - { url = "https://files.pythonhosted.org/packages/47/58/4e44cf6939be7aba96a4abce024e1be11ba7539ecac74d09369b8c03aa05/av-15.1.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b785948762a8d45fc58fc24a20251496829ace1817e9a7a508a348d6de2182c3", size = 21767323 }, - { url = 
"https://files.pythonhosted.org/packages/9b/f6/a946544cdb49f6d892d2761b1d61a8bc6ce912fe57ba06769bdc640c0a7f/av-15.1.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c7131494a3a318612b4ee4db98fe5bc50eb705f6b6536127c7ab776c524fd8b", size = 26946268 }, - { url = "https://files.pythonhosted.org/packages/70/7c/b33513c0af73d0033af59a98f035b521c5b93445a6af7e9efbf41a6e8383/av-15.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2b9623ae848625c59213b610c8665817924f913580c7c5c91e0dc18936deb00d", size = 38062118 }, - { url = "https://files.pythonhosted.org/packages/5e/95/31b7fb34f9fea7c7389240364194f4f56ad2d460095038cc720f50a90bb3/av-15.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c8ef597087db560514617143532b1fafc4825ebb2dda9a22418f548b113a0cc7", size = 39571086 }, - { url = "https://files.pythonhosted.org/packages/e7/b0/7b0b45474a4e90c35c11d0032947d8b3c7386872957ce29c6f12add69a74/av-15.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:08eac47a90ebae1e2bd5935f400dd515166019bab4ff5b03c4625fa6ac3a0a5e", size = 40112634 }, - { url = "https://files.pythonhosted.org/packages/aa/04/038b94bc9a1ee10a451c867d4a2fc91e845f83bfc2dae9df25893abcb57f/av-15.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d3f66ff200ea166e606cb3c5cb1bd2fc714effbec2e262a5d67ce60450c8234a", size = 40878695 }, - { url = "https://files.pythonhosted.org/packages/1d/3d/9f8f96c0deeaaf648485a3dbd1699b2f0580f2ce8a36cb616c0138ba7615/av-15.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:57b99544d91121b8bea570e4ddf61700f679a6b677c1f37966bc1a22e1d4cd5c", size = 31335683 }, - { url = "https://files.pythonhosted.org/packages/d1/58/de78b276d20db6ffcd4371283df771721a833ba525a3d57e753d00a9fe79/av-15.1.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:40c5df37f4c354ab8190c6fd68dab7881d112f527906f64ca73da4c252a58cee", size = 21760991 }, - { url = 
"https://files.pythonhosted.org/packages/56/cc/45f85775304ae60b66976360d82ba5b152ad3fd91f9267d5020a51e9a828/av-15.1.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:af455ce65ada3d361f80c90c810d9bced4db5655ab9aa513024d6c71c5c476d5", size = 26953097 }, - { url = "https://files.pythonhosted.org/packages/f3/f8/2d781e5e71d02fc829487e775ccb1185e72f95340d05f2e84eb57a11e093/av-15.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86226d2474c80c3393fa07a9c366106029ae500716098b72b3ec3f67205524c3", size = 38319710 }, - { url = "https://files.pythonhosted.org/packages/ac/13/37737ef2193e83862ccacff23580c39de251da456a1bf0459e762cca273c/av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:11326f197e7001c4ca53a83b2dbc67fd39ddff8cdf62ce6be3b22d9f3f9338bd", size = 39915519 }, - { url = "https://files.pythonhosted.org/packages/26/e9/e8032c7b8f2a4129a03f63f896544f8b7cf068e2db2950326fa2400d5c47/av-15.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a631ea879cc553080ee62874f4284765c42ba08ee0279851a98a85e2ceb3cc8d", size = 40286166 }, - { url = "https://files.pythonhosted.org/packages/e2/23/612c0fd809444d04b8387a2dfd942ccc77829507bd78a387ff65a9d98c24/av-15.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8f383949b010c3e731c245f80351d19dc0c08f345e194fc46becb1cb279be3ff", size = 41150592 }, - { url = "https://files.pythonhosted.org/packages/15/74/6f8e38a3b0aea5f28e72813672ff45b64615f2c69e6a4a558718c95edb9f/av-15.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d5921aa45f4c1f8c1a8d8185eb347e02aa4c3071278a2e2dd56368d54433d643", size = 31336093 }, - { url = "https://files.pythonhosted.org/packages/2e/bc/78b2ffa8235eeffc29aa4a8cc47b02e660cfec32f601f39a00975fb06d0e/av-15.1.0-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2f77853c3119c59d1bff4214ccbe46e3133eccff85ed96adee51c68684443f4e", size = 21726244 }, - { url = 
"https://files.pythonhosted.org/packages/1a/99/66d69453a2dce028e6e8ebea085d90e880aac03d3a3ab7d8ec16755ffd75/av-15.1.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:c0bc4471c156a0a1c70a607502434f477bc8dfe085eef905e55b4b0d66bcd3a5", size = 26918663 }, - { url = "https://files.pythonhosted.org/packages/fa/51/1a7dfbeda71f2772bc46d758af0e7fab1cc8388ce4bc7f24aecbc4bfd764/av-15.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:37839d4fa1407f047af82560dfc0f94d8d6266071eff49e1cbe16c4483054621", size = 38041408 }, - { url = "https://files.pythonhosted.org/packages/d7/97/2c4e0288ad4359b6064cb06ae79c2ff3a84ac73d27e91f2161b75fcd86fa/av-15.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:729179cd8622815e8b6f6854d13a806fe710576e08895c77e5e4ad254609de9a", size = 39642563 }, - { url = "https://files.pythonhosted.org/packages/ea/94/2362502149e276d00957edabcc201a5f4d5109a8a7b4fd30793714a532f3/av-15.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4abdf085bfa4eec318efccff567831b361ea56c045cc38366811552e3127c665", size = 40022119 }, - { url = "https://files.pythonhosted.org/packages/df/58/1a0ce1b3835d9728da0a7a54aeffaa0a2b1a88405eaed9322efd55212a54/av-15.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f985661644879e4520d28a995fcb2afeb951bc15a1d51412eb8e5f36da85b6fe", size = 40885158 }, - { url = "https://files.pythonhosted.org/packages/30/e6/054bb64e424d90b77ed5fc6a7358e4013fb436154c998fc90a89a186313f/av-15.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:7d7804a44c8048bb4b014a99353dd124663a12cd1d4613ba2bd3b457c3b1d539", size = 31312256 }, - { url = "https://files.pythonhosted.org/packages/6f/8b/89eae6dca10d7d2b83c131025a31ccc750be78699ac0304439faa1d1df99/av-15.1.0-cp314-cp314-macosx_13_0_arm64.whl", hash = "sha256:5dd73c6447947edcb82e5fecf96e1f146aeda0f169c7ad4c54df4d9f66f63fde", size = 21730645 }, - { url = 
"https://files.pythonhosted.org/packages/a3/f0/abffaf69405ed68041524be12a1e294faf396971d6a0e70eb00e93687df7/av-15.1.0-cp314-cp314-macosx_13_0_x86_64.whl", hash = "sha256:a81cd515934a5d51290aa66b059b7ed29c4a212e704f3c5e99e32877ff1c312c", size = 26913753 }, - { url = "https://files.pythonhosted.org/packages/37/9e/7af078bcfc3cd340c981ac5d613c090ab007023d2ac13b05acd52f22f069/av-15.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:57cc7a733a7e7d7a153682f35c9cf5d01e8269367b049c954779de36fc3d0b10", size = 38027048 }, - { url = "https://files.pythonhosted.org/packages/02/76/1f9dac11ad713e3619288993ea04e9c9cf4ec0f04e5ee81e83b8129dd8f3/av-15.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:a77b75bdb6899a64302ff923a5246e0747b3f0a3ecee7d61118db407a22c3f53", size = 39565396 }, - { url = "https://files.pythonhosted.org/packages/8b/32/2188c46e2747247458ffc26b230c57dd28e61f65ff7b9e6223a411af5e98/av-15.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d0a1154ce081f1720082a133cfe12356c59f62dad2b93a7a1844bf1dcd010d85", size = 40015050 }, - { url = "https://files.pythonhosted.org/packages/1e/41/b57fbce9994580619d7574817ece0fe0e7b822cde2af57904549d0150b8d/av-15.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8a7bf5a34dee15c86790414fa86a144e6d0dcc788bc83b565fdcbc080b4fbc90", size = 40821225 }, - { url = "https://files.pythonhosted.org/packages/b1/36/e85cd1f0d3369c6764ad422882895d082f7ececb66d3df8aeae3234ef7a6/av-15.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:e30c9a6fd9734784941384a2e25fad3c22881a7682f378914676aa7e795acdb7", size = 31311750 }, - { url = "https://files.pythonhosted.org/packages/80/d8/08a681758a4e49adfda409a6a35eff533f42654c6a6cfa102bc5cae1a728/av-15.1.0-cp314-cp314t-macosx_13_0_arm64.whl", hash = "sha256:60666833d7e65ebcfc48034a072de74349edbb62c9aaa3e6722fef31ca028eb6", size = 21828343 }, - { url = 
"https://files.pythonhosted.org/packages/4a/52/29bec3fe68669b21f7d1ab5d94e21f597b8dfd37f50a3e3c9af6a8da925c/av-15.1.0-cp314-cp314t-macosx_13_0_x86_64.whl", hash = "sha256:53fbdae45aa2a49a22e864ff4f4017416ef62c060a172085d3247ba0a101104e", size = 27001666 }, - { url = "https://files.pythonhosted.org/packages/9d/54/2c1d1faced66d708f5df328e800997cb47f90b500a214130c3a0f2ad601e/av-15.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:e6c51061667983dc801502aff9140bbc4f0e0d97f879586f17fb2f9a7e49c381", size = 39496753 }, - { url = "https://files.pythonhosted.org/packages/c3/76/06ded5e52c4dcc2d9b5184c6da8de5ea77bd7ecb79a59a2b9700f1984949/av-15.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:2f80ec387f04aa34868662b11018b5f09654ae1530a61e24e92a142a24b10b62", size = 40784729 }, - { url = "https://files.pythonhosted.org/packages/52/ef/797b76f3b39c99a96e387f501bbc07dca340b27d3dda12862fe694066b63/av-15.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4975e03177d37d8165c99c8d494175675ba8acb72458fb5d7e43f746a53e0374", size = 41284953 }, - { url = "https://files.pythonhosted.org/packages/31/47/e4656f00e62fd059ea5a40b492dea784f5aecfe1dfac10c0d7a0664ce200/av-15.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8f78f3dad11780b4cdd024cdb92ce43cb170929297c00f2f4555c2b103f51e55", size = 41985340 }, - { url = "https://files.pythonhosted.org/packages/b1/c9/15bb4fd7a1f39d70db35af2b9c20a0ae19e4220eb58a8b8446e903b98d72/av-15.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9a20c5eba3ec49c2f4b281797021923fc68a86aeb66c5cda4fd0252fa8004951", size = 31487337 }, -] - [[package]] name = "beautifulsoup4" version = "4.14.2" @@ -181,14 +140,12 @@ wheels = [ [[package]] name = "chandra-ocr" -version = "0.1.0" +version = "0.2.0" source = { editable = "." 
} dependencies = [ - { name = "accelerate" }, { name = "beautifulsoup4" }, { name = "click" }, { name = "filetype" }, - { name = "flask" }, { name = "markdownify" }, { name = "openai" }, { name = "pillow" }, @@ -196,26 +153,42 @@ dependencies = [ { name = "pydantic-settings" }, { name = "pypdfium2" }, { name = "python-dotenv" }, - { name = "qwen-vl-utils" }, + { name = "six" }, +] + +[package.optional-dependencies] +all = [ + { name = "accelerate" }, { name = "streamlit" }, { name = "torch" }, { name = "torchvision" }, { name = "transformers" }, ] +app = [ + { name = "streamlit" }, +] +hf = [ + { name = "accelerate" }, + { name = "torch" }, + { name = "torchvision" }, + { name = "transformers" }, +] [package.dev-dependencies] dev = [ + { name = "chandra-ocr", extra = ["all"] }, + { name = "flask" }, { name = "pre-commit" }, { name = "pytest" }, ] [package.metadata] requires-dist = [ - { name = "accelerate", specifier = ">=1.11.0" }, + { name = "accelerate", marker = "extra == 'hf'", specifier = ">=1.11.0" }, { name = "beautifulsoup4", specifier = ">=4.14.2" }, + { name = "chandra-ocr", extras = ["hf", "app"], marker = "extra == 'all'" }, { name = "click", specifier = ">=8.0.0" }, { name = "filetype", specifier = ">=1.2.0" }, - { name = "flask", specifier = ">=3.0.0" }, { name = "markdownify", specifier = "==1.1.0" }, { name = "openai", specifier = ">=2.2.0" }, { name = "pillow", specifier = ">=10.2.0" }, @@ -223,15 +196,17 @@ requires-dist = [ { name = "pydantic-settings", specifier = ">=2.11.0" }, { name = "pypdfium2", specifier = ">=4.30.0" }, { name = "python-dotenv", specifier = ">=1.1.1" }, - { name = "qwen-vl-utils", specifier = ">=0.0.14" }, - { name = "streamlit", specifier = ">=1.50.0" }, - { name = "torch", specifier = ">=2.8.0" }, - { name = "torchvision", specifier = ">=0.23.0" }, - { name = "transformers", specifier = ">=4.57.1" }, + { name = "six", specifier = ">=1.17.0" }, + { name = "streamlit", marker = "extra == 'app'", specifier = ">=1.50.0" }, + 
{ name = "torch", marker = "extra == 'hf'", specifier = ">=2.8.0" }, + { name = "torchvision", marker = "extra == 'hf'", specifier = ">=0.23.0" }, + { name = "transformers", marker = "extra == 'hf'", specifier = ">=5.2.0" }, ] [package.metadata.requires-dev] dev = [ + { name = "chandra-ocr", extras = ["all"] }, + { name = "flask", specifier = ">=3.0.0" }, { name = "pre-commit", specifier = ">=4.3.0" }, { name = "pytest", specifier = ">=8.4.2" }, ] @@ -430,17 +405,34 @@ wheels = [ [[package]] name = "hf-xet" -version = "1.1.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/74/31/feeddfce1748c4a233ec1aa5b7396161c07ae1aa9b7bdbc9a72c3c7dd768/hf_xet-1.1.10.tar.gz", hash = "sha256:408aef343800a2102374a883f283ff29068055c111f003ff840733d3b715bb97", size = 487910 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/a2/343e6d05de96908366bdc0081f2d8607d61200be2ac802769c4284cc65bd/hf_xet-1.1.10-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:686083aca1a6669bc85c21c0563551cbcdaa5cf7876a91f3d074a030b577231d", size = 2761466 }, - { url = "https://files.pythonhosted.org/packages/31/f9/6215f948ac8f17566ee27af6430ea72045e0418ce757260248b483f4183b/hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71081925383b66b24eedff3013f8e6bbd41215c3338be4b94ba75fd75b21513b", size = 2623807 }, - { url = "https://files.pythonhosted.org/packages/15/07/86397573efefff941e100367bbda0b21496ffcdb34db7ab51912994c32a2/hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6bceb6361c80c1cc42b5a7b4e3efd90e64630bcf11224dcac50ef30a47e435", size = 3186960 }, - { url = "https://files.pythonhosted.org/packages/01/a7/0b2e242b918cc30e1f91980f3c4b026ff2eedaf1e2ad96933bca164b2869/hf_xet-1.1.10-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eae7c1fc8a664e54753ffc235e11427ca61f4b0477d757cc4eb9ae374b69f09c", size = 3087167 }, - { url = 
"https://files.pythonhosted.org/packages/4a/25/3e32ab61cc7145b11eee9d745988e2f0f4fafda81b25980eebf97d8cff15/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0a0005fd08f002180f7a12d4e13b22be277725bc23ed0529f8add5c7a6309c06", size = 3248612 }, - { url = "https://files.pythonhosted.org/packages/2c/3d/ab7109e607ed321afaa690f557a9ada6d6d164ec852fd6bf9979665dc3d6/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f900481cf6e362a6c549c61ff77468bd59d6dd082f3170a36acfef2eb6a6793f", size = 3353360 }, - { url = "https://files.pythonhosted.org/packages/ee/0e/471f0a21db36e71a2f1752767ad77e92d8cde24e974e03d662931b1305ec/hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = "sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045", size = 2804691 }, +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125 }, + { url = "https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985 }, + { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085 }, + { url = 
"https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266 }, + { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513 }, + { url = "https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287 }, + { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574 }, + { url = "https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760 }, + { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493 }, + { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797 }, + { url = 
"https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127 }, + { url = "https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788 }, + { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315 }, + { url = "https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306 }, + { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826 }, + { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113 }, + { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339 }, + { url = 
"https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664 }, + { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422 }, + { url = "https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847 }, + { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843 }, + { url = "https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751 }, + { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149 }, + { url = "https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426 }, ] [[package]] @@ -473,21 +465,22 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.35.3" +version = "1.7.1" source = { 
registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, { name = "packaging" }, { name = "pyyaml" }, - { name = "requests" }, { name = "tqdm" }, + { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798 } +sdist = { url = "https://files.pythonhosted.org/packages/b4/a8/94ccc0aec97b996a3a68f3e1fa06a4bd7185dd02bf22bfba794a0ade8440/huggingface_hub-1.7.1.tar.gz", hash = "sha256:be38fe66e9b03c027ad755cb9e4b87ff0303c98acf515b5d579690beb0bf3048", size = 722097 } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/a0/651f93d154cb72323358bf2bbae3e642bdb5d2f1bfc874d096f7cb159fa0/huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba", size = 564262 }, + { url = "https://files.pythonhosted.org/packages/6f/75/ca21955d6117a394a482c7862ce96216239d0e3a53133ae8510727a8bcfa/huggingface_hub-1.7.1-py3-none-any.whl", hash = "sha256:38c6cce7419bbde8caac26a45ed22b0cea24152a8961565d70ec21f88752bfaa", size = 616308 }, ] [[package]] @@ -637,6 +630,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437 }, ] +[[package]] +name = 
"markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321 }, +] + [[package]] name = "markdownify" version = "1.1.0" @@ -735,6 +740,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146 }, ] +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -1617,21 +1631,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341 }, ] -[[package]] -name = "qwen-vl-utils" -version = "0.0.14" 
-source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "av" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b6/b1/ad4fc2260a3badd278b38d642f3b987412f1f6682f0ef2b31b0572d5caa8/qwen_vl_utils-0.0.14.tar.gz", hash = "sha256:9c7cad5ae803b3a10f8bb7194deb12aeacdd032f92f4224e880c73587a7346ad", size = 8453 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/43/80f67e0336cb2fc725f8e06f7fe35c1d0fe946f4d2b8b2175e797e07349e/qwen_vl_utils-0.0.14-py3-none-any.whl", hash = "sha256:5e28657bfd031e56bd447c5901b58ddfc3835285ed100f4c56580e0ade054e96", size = 8120 }, -] - [[package]] name = "referencing" version = "0.36.2" @@ -1768,6 +1767,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738 }, ] +[[package]] +name = "rich" +version = "14.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458 }, +] + [[package]] name = "rpds-py" version = "0.27.1" @@ -1934,6 +1946,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = 
"sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486 }, ] +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 }, +] + [[package]] name = "six" version = "1.17.0" @@ -2222,24 +2243,23 @@ wheels = [ [[package]] name = "transformers" -version = "4.57.1" +version = "5.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, - { name = "requests" }, { name = "safetensors" }, { name = "tokenizers" }, { name = "tqdm" }, + { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511 } +sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925 }, + { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827 }, ] [[package]] @@ -2257,6 +2277,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780 }, ] +[[package]] +name = "typer" +version = "0.24.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-doc" }, + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f5/24/cb09efec5cc954f7f9b930bf8279447d24618bb6758d4f6adf2574c41780/typer-0.24.1.tar.gz", hash = "sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45", size = 118613 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085 }, +] + [[package]] name = "typing-extensions" version = "4.15.0"