Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pageindex/config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
provider: "openai" # "openai" or "gemini"
model: "gpt-4o-2024-11-20"
toc_check_page_num: 20
max_page_num_each_node: 10
Expand Down
18 changes: 12 additions & 6 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = ChatGPT_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')

def extract_toc_content(content, model=None):
prompt = f"""
Expand Down Expand Up @@ -289,7 +289,13 @@ def toc_transformer(toc_content, model=None):
Directly return the final JSON structure, do not output anything else. """

prompt = init_prompt + '\n Given table of contents\n:' + toc_content
last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)

response_schema = None
if GOOGLE_GENAI_AVAILABLE and LLM_PROVIDER == "gemini":
from pageindex.utils import TocStructure
response_schema = TocStructure

last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, response_schema=response_schema)
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
if if_complete == "yes" and finish_reason == "finished":
last_complete = extract_json(last_complete)
Expand All @@ -313,7 +319,7 @@ def toc_transformer(toc_content, model=None):

Please continue the json structure, directly output the remaining part of the json structure."""

new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, response_schema=response_schema)

if new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
Expand Down Expand Up @@ -496,7 +502,7 @@ def remove_first_physical_index_section(text):
return text

### add verify completeness
def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
def generate_toc_continue(toc_content, part, model=None):
print('start generate_toc_continue')
prompt = """
You are an expert in extracting hierarchical tree structure.
Expand Down Expand Up @@ -729,7 +735,7 @@ def check_toc(page_list, opt=None):


################### fix incorrect toc #########################################################
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
def single_toc_item_index_fixer(section_title, content, model=None):
tob_extractor_prompt = """
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

Expand Down Expand Up @@ -1066,7 +1072,7 @@ def page_index_main(doc, opt=None):
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

print('Parsing PDF...')
page_list = get_page_tokens(doc)
page_list = get_page_tokens(doc, model=opt.model)

logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})
Expand Down
Loading