1+ import base64
2+ import json
13from pathlib import Path
4+ from typing import cast
25
36import pymupdf
47import pytest
5- from paperqa .readers import PDFParserFn
6- from paperqa .utils import ImpossibleParsingError
8+ from paperqa import Doc , Docs
9+ from paperqa .readers import PDFParserFn , chunk_pdf
10+ from paperqa .utils import ImpossibleParsingError , bytes_to_string
711
812from paperqa_pymupdf import parse_pdf_to_pages
913
# Resolve the repository root relative to this test module (three directories
# up — assumes the packages/<name>/tests layout; confirm if the tree moves)
# and point at the shared stub PDF fixtures.
REPO_ROOT = Path(__file__).parents[3]
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
1216
1317
@pytest.mark.asyncio
async def test_parse_pdf_to_pages() -> None:
    """End-to-end check of parse_pdf_to_pages: text blocks, extracted media,
    serialization, multimodal QA use, full-page mode, and no-media mode."""
    assert isinstance(parse_pdf_to_pages, PDFParserFn)

    filepath = STUB_DATA_DIR / "pasa.pdf"
    parsed_text = parse_pdf_to_pages(filepath)
    assert isinstance(parsed_text.content, dict)
    # Default mode: each page maps to a (text, media) tuple
    assert (
        "Abstract\n\nWe introduce PaSa, an advanced Paper Search"
        "\nagent powered by large language models."
    ) in parsed_text.content["1"][0], "Block parsing failed to handle abstract"

    # Check the images in Figure 1
    assert not isinstance(parsed_text.content["2"], str)
    p2_text, p2_media = parsed_text.content["2"]
    assert "Figure 1" in p2_text, "Expected Figure 1 title"
    assert "Crawler" in p2_text, "Expected Figure 1 contents"
    (p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
    assert p2_image.index == 0
    assert isinstance(p2_image.data, bytes)

    # Check the image is valid base64
    base64_data = bytes_to_string(p2_image.data)
    assert base64_data
    assert base64.b64decode(base64_data, validate=True) == p2_image.data

    # Check we can round-trip serialize the image
    serde_p2_image = type(p2_image).model_validate_json(p2_image.model_dump_json())
    assert serde_p2_image == p2_image

    # Check useful attributes are present and are JSON serializable
    json.dumps(p2_image.info)
    for attr in ("width", "height"):
        dim = p2_image.info[attr]
        assert isinstance(dim, int | float)
        assert dim > 0, "Edge length should be positive"

    # Check Figure 1 can be used to answer questions
    doc = Doc(
        docname="He2025",
        dockey="stub",
        citation=(
            'He, Yichen, et al. "PaSa: An LLM Agent for Comprehensive Academic Paper'
            ' Search." *arXiv*, 2025, arXiv:2501.10120v1. Accessed 2025.'
        ),
    )
    texts = chunk_pdf(parsed_text, doc=doc, chunk_chars=3000, overlap=100)
    # pylint: disable=duplicate-code
    fig_1_text = texts[1]
    assert (
        "Figure 1: Architecture of PaSa" in fig_1_text.text
    ), "Expecting Figure 1 for the test to work"
    assert fig_1_text.media, "Expecting media to test multimodality"
    fig_1_text.text = "stub"  # Replace text to confirm multimodality works
    docs = Docs()
    assert await docs.aadd_texts(texts=[fig_1_text], doc=doc)
    for query, substrings_min_counts in [
        ("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
        ("What actions can the Selector take?", [(("select", "drop"), 2)]),
        (
            "How many User Query are there, and what do they do?",
            [(("two", "2"), 2), (("crawler", "selector"), 2)],
        ),
    ]:
        session = await docs.aquery(query=query)
        assert session.contexts, "Expected contexts to be generated"
        assert all(
            c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
            for c in session.contexts
        ), "Expected context to reuse Figure 1's text and media"
        for substrings, min_count in cast(
            list[tuple[tuple[str, ...], int]], substrings_min_counts
        ):
            # At least min_count of the substrings must appear in the answer
            assert sum(x in session.answer.lower() for x in substrings) >= min_count, (
                f"Expected {session.answer=} to have at least"
                f" {min_count} of {substrings} present"
            )

    # Let's check the full page parsing behavior
    parsed_text_full_page = parse_pdf_to_pages(filepath, full_page=True)
    assert isinstance(parsed_text_full_page.content, dict)
    assert "1" in parsed_text_full_page.content, "Parsed text should contain page 1"
    assert "2" in parsed_text_full_page.content, "Parsed text should contain page 2"
    for page_num in ("1", "2"):
        page_content = parsed_text_full_page.content[page_num]
        assert not isinstance(page_content, str), f"Page {page_num} should have images"
        # Check each page has exactly one image
        page_text, (full_page_image,) = page_content
        assert page_text
        assert full_page_image.index == 0, "Full page image should have index 0"
        assert isinstance(full_page_image.data, bytes)
        assert len(full_page_image.data) > 0, "Full page image should have data"
        # Check useful attributes are present and are JSON serializable.
        # NOTE: previously this re-checked p2_image.info (copy-paste bug),
        # leaving the full-page image's info unvalidated.
        json.dumps(full_page_image.info)
        for attr in ("width", "height"):
            dim = full_page_image.info[attr]
            assert isinstance(dim, int | float)
            assert dim > 0, "Edge length should be positive"

    # Check the no-media behavior
    parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
    assert isinstance(parsed_text_no_media.content, dict)
    assert all(isinstance(c, str) for c in parsed_text_no_media.content.values())

    # Check metadata
    for pt in (parsed_text, parsed_text_full_page, parsed_text_no_media):
        (parsing_library,) = pt.metadata.parsing_libraries
        assert pymupdf.__name__ in parsing_library
        assert pt.metadata.parse_type == "pdf"

    # Check commonalities across all modes
    assert (
        len(parsed_text.content)
        == len(parsed_text_full_page.content)
        == len(parsed_text_no_media.content)
    ), "All modes should parse the same number of pages"
35134
36135
def test_page_size_limit_denial() -> None:
    """A tiny page_size_limit should make parsing fail loudly, not truncate."""
    oversized_pdf = STUB_DATA_DIR / "paper.pdf"
    # The limit is measured in characters; 10 is far below any realistic page
    with pytest.raises(ImpossibleParsingError, match="char limit"):
        parse_pdf_to_pages(oversized_pdf, page_size_limit=10)
139+
140+
def test_table_parsing() -> None:
    """Check tables are extracted as media and page text is newline-trimmed."""
    filepath = STUB_DATA_DIR / "influence.pdf"
    parsed_text = parse_pdf_to_pages(filepath)
    assert isinstance(parsed_text.content, dict)
    for page_content in parsed_text.content.values():
        # Pages are either a plain string or a (text, media) tuple. The prior
        # check indexed the tuple itself (t[0] was the whole text string and
        # t[-1] the media list), so tuple pages were never actually validated.
        text = page_content if isinstance(page_content, str) else page_content[0]
        assert text, "Expected non-empty page text"
        assert not text.startswith("\n"), "Expected no leading newlines in parsed text"
        assert not text.endswith("\n"), "Expected no trailing newlines in parsed text"
    assert "1" in parsed_text.content, "Parsed text should contain page 1"
    # Only tuple-valued pages can carry media; gather their table entries
    all_tables = {
        i: [m for m in pagenum_media[1] if m.info["type"] == "table"]
        for i, pagenum_media in parsed_text.content.items()
        if isinstance(pagenum_media, tuple)
    }
    assert (
        sum(len(tables) for tables in all_tables.values()) >= 2
    ), "Expected a few tables to be parsed"