From 6fbfff1be6622306fe041b33947c564cf8ce00d8 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 18 Mar 2026 11:19:37 -0400 Subject: [PATCH 1/3] Fix link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 31d88e8..4389217 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Chandra OCR 2 is a state of the art OCR model that converts images and PDFs into - Tops external olmocr benchmark and significant improvement in internal multilingual benchmarks - Convert documents to markdown, html, or json with detailed layout information -- Support for 90+ languages ([benchmark below](#multilingual-benchmark)) +- Support for 90+ languages ([benchmark below](#multilingual-benchmark-table)) - Excellent handwriting support - Reconstructs forms accurately, including checkboxes - Strong performance with tables, math, and complex layouts From 69988118f334c0c65dac5e0c70154e566e524f46 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 18 Mar 2026 11:20:58 -0400 Subject: [PATCH 2/3] Fix test --- tests/integration/test_image_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_image_inference.py b/tests/integration/test_image_inference.py index 46e0c1f..bb066d7 100644 --- a/tests/integration/test_image_inference.py +++ b/tests/integration/test_image_inference.py @@ -15,4 +15,4 @@ def test_inference_image(simple_text_image): assert "Hello, World!" in output.markdown chunks = output.chunks - assert len(chunks) == 1 + assert len(chunks) > 0 From 19e061aee7ebf649ed18296ca48927b6fc2dd372 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 18 Mar 2026 11:38:32 -0400 Subject: [PATCH 3/3] Add im end --- chandra/model/hf.py | 13 ++++++++++++- tests/integration/test_image_inference.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/chandra/model/hf.py b/chandra/model/hf.py index 3f4ebf0..c98e58f 100644 --- a/chandra/model/hf.py +++ b/chandra/model/hf.py @@ -28,7 +28,18 @@ def generate_hf( ) inputs = inputs.to(model.device) - generated_ids = model.generate(**inputs, max_new_tokens=max_output_tokens) + # Include both <|endoftext|> and <|im_end|> as stop tokens. + # generation_config only has <|endoftext|>, but the model emits <|im_end|> at turn boundaries. + eos_token_id = model.generation_config.eos_token_id + im_end_id = model.processor.tokenizer.convert_tokens_to_ids("<|im_end|>") + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + if im_end_id is not None and im_end_id not in eos_token_id: + eos_token_id.append(im_end_id) + + generated_ids = model.generate( + **inputs, max_new_tokens=max_output_tokens, eos_token_id=eos_token_id + ) generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) diff --git a/tests/integration/test_image_inference.py b/tests/integration/test_image_inference.py index bb066d7..46e0c1f 100644 --- a/tests/integration/test_image_inference.py +++ b/tests/integration/test_image_inference.py @@ -15,4 +15,4 @@ def test_inference_image(simple_text_image): assert "Hello, World!" in output.markdown chunks = output.chunks - assert len(chunks) > 0 + assert len(chunks) == 1