1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
+3.12
9 changes: 8 additions & 1 deletion README.md
@@ -107,16 +107,23 @@ Follow these steps to generate a PageIndex tree from a PDF document.

### 1. Install dependencies

+- Using pip:
```bash
pip3 install -r requirements.txt
```
+- Using uv (recommended):
+```bash
+pip install uv
+uv sync
+```

### 2. Set your OpenAI API key

-Create a `.env` file in the root directory and add your API key:
+Copy the `.env.example` file to `.env` and add your API key:

```bash
CHATGPT_API_KEY=your_openai_key_here
+BASE_URL=the_model_provider_url_here (default: https://api.openai.com/v1)
```
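For a quick sanity check that these values are visible to Python, here is a minimal sketch, assuming python-dotenv (already in requirements.txt) and a `.env` in the working directory:

```python
# Minimal sketch: confirm the .env values reach the process environment.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory
print(os.getenv("CHATGPT_API_KEY") is not None)            # True once the key is set
print(os.getenv("BASE_URL", "https://api.openai.com/v1"))  # default mirrors the README
```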

### 3. Run PageIndex on your PDF
53 changes: 32 additions & 21 deletions pageindex/page_index.py
@@ -7,6 +7,7 @@
from .utils import *
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio


################### check title in page #########################################################
@@ -496,7 +497,7 @@ def remove_first_physical_index_section(text):
return text

### add verify completeness
-def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
+def generate_toc_continue(toc_content, part, model):
print('start generate_toc_continue')
prompt = """
You are an expert in extracting hierarchical tree structure.
@@ -729,7 +730,7 @@ def check_toc(page_list, opt=None):


################### fix incorrect toc #########################################################
-def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
+def single_toc_item_index_fixer(section_title, content, model):
tob_extractor_prompt = """
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

@@ -952,12 +953,14 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
print(mode)
print(f'start_index: {start_index}')

+    model = opt.model

if mode == 'process_toc_with_page_numbers':
-        toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
+        toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=model, logger=logger)
elif mode == 'process_toc_no_page_numbers':
-        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
+        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=model, logger=logger)
else:
-        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
+        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=model, logger=logger)

toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]

@@ -968,7 +971,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
logger=logger
)

-    accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
+    accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=model)

logger.info({
'mode': 'process_toc_with_page_numbers',
@@ -978,7 +981,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
if accuracy == 1.0 and len(incorrect_results) == 0:
return toc_with_page_number
if accuracy > 0.6 and len(incorrect_results) > 0:
-        toc_with_page_number, incorrect_results = await fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=start_index, max_attempts=3, model=opt.model, logger=logger)
+        toc_with_page_number, incorrect_results = await fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=start_index, max_attempts=3, model=model, logger=logger)
return toc_with_page_number
else:
if mode == 'process_toc_with_page_numbers':
@@ -993,11 +996,13 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
node_page_list = page_list[node['start_index']-1:node['end_index']]
token_num = sum([page[1] for page in node_page_list])

+    model = opt.model

if node['end_index'] - node['start_index'] > opt.max_page_num_each_node and token_num >= opt.max_token_num_each_node:
print('large node:', node['title'], 'start_index:', node['start_index'], 'end_index:', node['end_index'], 'token_num:', token_num)

node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
-        node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
+        node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=model, logger=logger)

# Filter out items with None physical_index before post_processing
valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None]
@@ -1055,7 +1060,8 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
return toc_tree


-def page_index_main(doc, opt=None):
+async def async_page_index_main(doc, opt=None):
+    """The async version of the page_index_main function."""
logger = JsonLogger(doc)

is_valid_pdf = (
@@ -1071,15 +1077,16 @@ def page_index_main(doc, opt=None):
logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})

-    structure = asyncio.run(tree_parser(page_list, opt, doc=doc, logger=logger))
+    # run all async operations in a single event loop
+    structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
if opt.if_add_node_id == 'yes':
write_node_id(structure)
if opt.if_add_node_text == 'yes':
add_node_text(structure, page_list)
if opt.if_add_node_summary == 'yes':
if opt.if_add_node_text == 'no':
add_node_text(structure, page_list)
-        asyncio.run(generate_summaries_for_structure(structure, model=opt.model))
+        await generate_summaries_for_structure(structure, model=opt.model)
if opt.if_add_node_text == 'no':
remove_structure_text(structure)
if opt.if_add_doc_description == 'yes':
@@ -1095,15 +1102,9 @@ def page_index_main(doc, opt=None):
}


-def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
-               if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
-
-    user_opt = {
-        arg: value for arg, value in locals().items()
-        if arg != "doc" and value is not None
-    }
-    opt = ConfigLoader().load(user_opt)
-    return page_index_main(doc, opt)
+def page_index_main(doc, opt=None):
+    """Synchronous wrapper function that runs all async operations in a single event loop."""
+    return asyncio.run(async_page_index_main(doc, opt))


def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
@@ -1136,4 +1137,14 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

-    return toc_with_page_number
+    return toc_with_page_number

+def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
+               if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
+
+    user_opt = {
+        arg: value for arg, value in locals().items()
+        if arg != "doc" and value is not None
+    }
+    opt = ConfigLoader().load(user_opt)
+    return page_index_main(doc, opt)
108 changes: 80 additions & 28 deletions pageindex/utils.py
@@ -16,18 +16,69 @@
import yaml
from pathlib import Path
from types import SimpleNamespace as config
+from contextlib import asynccontextmanager

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
BASE_URL = os.getenv("BASE_URL")

+@asynccontextmanager
+async def get_async_openai_client(api_key=CHATGPT_API_KEY):
+    """Asynchronous context manager to manage the lifecycle of the OpenAI client."""
+    client = openai.AsyncOpenAI(api_key=api_key, base_url=BASE_URL)
+    try:
+        yield client
+    finally:
+        try:
+            # AsyncOpenAI exposes an awaitable close(); it has no aclose() method.
+            await client.close()
+        except Exception:
+            pass
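A short usage sketch of the new context manager: each call opens a fresh client against `BASE_URL` and closes it on exit. The model name is only an example:

```python
# Hedged sketch: one chat completion through the managed async client.
import asyncio

async def one_shot(prompt: str) -> str:
    async with get_async_openai_client() as client:
        response = await client.chat.completions.create(
            model="gpt-4o-2024-11-20",  # example model name
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return response.choices[0].message.content

print(asyncio.run(one_shot("Say hello")))
```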

+def get_appropriate_tokenizer(model):
+    """
+    Get the appropriate tokenizer for the model.
+    Prioritize tiktoken; fall back to a character-count approximation
+    (tagged 'sentencepiece_approx') as a last resort.
+    """
+    try:
+        # First, try to get the specific tokenizer for the model using tiktoken.
+        enc = tiktoken.encoding_for_model(model)
+        return {
+            'type': 'tiktoken',
+            'encoder': enc
+        }
+    except KeyError:
+        # If the model is not found, tiktoken raises a KeyError.
+        # print(f"Warning: Model '{model}' not found in tiktoken. Using cl100k_base as fallback.")
+        try:
+            # Fall back to a generic tiktoken encoder.
+            enc = tiktoken.get_encoding("cl100k_base")
+            return {
+                'type': 'tiktoken_fallback',
+                'encoder': enc
+            }
+        except Exception as e:
+            # This is unlikely to happen, but as a last resort.
+            print(f"Warning: tiktoken fallback failed: {e}. Using character-count approximation as final fallback.")
+            return {
+                'type': 'sentencepiece_approx',
+                'encoder': None
+            }

def count_tokens(text, model):
-    enc = tiktoken.encoding_for_model(model)
-    tokens = enc.encode(text)
-    return len(tokens)
+    """Count the number of tokens in the text, using the appropriate tokenizer for the model."""
+    tokenizer_info = get_appropriate_tokenizer(model)
+
+    if tokenizer_info['type'] in ['tiktoken', 'tiktoken_fallback']:
+        # Use the tiktoken encoder to return the token count directly.
+        enc = tokenizer_info['encoder']
+        return len(enc.encode(text))
+    else:
+        # Default fallback to character count.
+        return max(len(text) // 4, 1)  # Rough estimation: 1 token per 4 characters
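To make the fallback chain concrete, a small sketch; the second model name is deliberately made up so it triggers the `cl100k_base` fallback:

```python
# Hedged sketch of the tokenizer fallback chain (model names are illustrative).
text = "PageIndex builds a hierarchical tree over PDF pages."

print(count_tokens(text, "gpt-4o-2024-11-20"))   # known model: exact tiktoken count
print(count_tokens(text, "some-custom-model"))   # unknown model: cl100k_base fallback
# If tiktoken itself failed, count_tokens would approximate with max(len(text) // 4, 1).
```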

def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=api_key, base_url=BASE_URL)
for i in range(max_retries):
try:
if chat_history:
@@ -59,7 +110,7 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):

def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
max_retries = 10
-    client = openai.OpenAI(api_key=api_key)
+    client = openai.OpenAI(api_key=api_key, base_url=BASE_URL)
for i in range(max_retries):
try:
if chat_history:
@@ -87,24 +138,24 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):

async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
max_retries = 10
-    client = openai.AsyncOpenAI(api_key=api_key)
-    for i in range(max_retries):
-        try:
-            messages = [{"role": "user", "content": prompt}]
-            response = await client.chat.completions.create(
-                model=model,
-                messages=messages,
-                temperature=0,
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            print('************* Retrying *************')
-            logging.error(f"Error: {e}")
-            if i < max_retries - 1:
-                await asyncio.sleep(1)  # Wait for 1 second before retrying
-            else:
-                logging.error('Max retries reached for prompt: ' + prompt)
-                return "Error"
+    async with get_async_openai_client(api_key) as client:
+        for i in range(max_retries):
+            try:
+                messages = [{"role": "user", "content": prompt}]
+                response = await client.chat.completions.create(
+                    model=model,
+                    messages=messages,
+                    temperature=0,
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                print('************* Retrying *************')
+                logging.error(f"Error: {e}")
+                if i < max_retries - 1:
+                    await asyncio.sleep(1)  # Wait for 1 second before retrying
+                else:
+                    logging.error('Max retries reached for prompt: ' + prompt)
+                    return "Error"

def get_json_content(response):
start_idx = response.find("```json")
@@ -409,16 +460,17 @@ def add_preface_if_needed(data):


def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
-    enc = tiktoken.encoding_for_model(model)
+    """Get the text and token count for each page of the PDF."""
if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
-            token_length = len(enc.encode(page_text))
+            token_length = count_tokens(page_text, model)
page_list.append((page_text, token_length))
return page_list

elif pdf_parser == "PyMuPDF":
if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
Expand All @@ -428,7 +480,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
page_list = []
for page in doc:
page_text = page.get_text()
-            token_length = len(enc.encode(page_text))
+            token_length = count_tokens(page_text, model)
page_list.append((page_text, token_length))
return page_list
else:
@@ -456,7 +508,7 @@ def get_number_of_pages(pdf_path):


def post_processing(structure, end_physical_index):
-    # First convert page_number to start_index in flat list
+    # First convert physical_index to start_index in flat list
for i, item in enumerate(structure):
item['start_index'] = item.get('physical_index')
if i < len(structure) - 1:
@@ -470,7 +522,7 @@ def post_processing(structure, end_physical_index):
if len(tree)!=0:
return tree
else:
-        ### remove appear_start
+        # remove appear_start
for node in structure:
node.pop('appear_start', None)
node.pop('physical_index', None)
15 changes: 15 additions & 0 deletions pyproject.toml
@@ -0,0 +1,15 @@
+[project]
+name = "pageindex"
+version = "0.1.0"
+description = "PageIndex is a cutting-edge tool for indexing and searching PDF files without traditional RAG."
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "openai==1.70.0",
+    "pymupdf==1.25.5",
+    "pypdf2==3.0.1",
+    "python-dotenv==1.1.0",
+    "pyyaml==6.0.2",
+    "sentencepiece>=0.2.0",
+    "tiktoken>=0.9.0",
+]
4 changes: 3 additions & 1 deletion requirements.txt
@@ -2,5 +2,7 @@ openai==1.70.0
pymupdf==1.25.5
PyPDF2==3.0.1
python-dotenv==1.1.0
-tiktoken==0.7.0
+# tiktoken==0.7.0
+tiktoken
+sentencepiece
pyyaml==6.0.2