diff --git a/pageindex/page_index.py b/pageindex/page_index.py index dedc0bd..b669090 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -7,6 +7,9 @@ from .utils import * import os from concurrent.futures import ThreadPoolExecutor, as_completed +import argparse +from collections import Counter +from tqdm import tqdm ################### check title in page ######################################################### @@ -1024,7 +1027,23 @@ def page_index_main(doc, opt=None): write_node_id(structure) if opt.if_add_node_summary == 'yes': add_node_text(structure, page_list) - asyncio.run(generate_summaries_for_structure(structure, model=opt.model)) + + # Get total number of nodes for progress bar + nodes = structure_to_list(structure) + with tqdm(total=len(nodes), desc="Generating summaries") as pbar: + async def generate_summaries_with_progress(): + tasks = [] + for node in nodes: + task = asyncio.create_task(generate_node_summary(node, model=opt.model)) + task.add_done_callback(lambda _: pbar.update(1)) + tasks.append(task) + summaries = await asyncio.gather(*tasks) + for node, summary in zip(nodes, summaries): + node['summary'] = summary + return structure + + structure = asyncio.run(generate_summaries_with_progress()) + remove_structure_text(structure) if opt.if_add_node_text == 'yes': add_node_text_with_labels(structure, page_list) diff --git a/pageindex/utils.py b/pageindex/utils.py index e499831..ad9b48d 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -16,6 +16,7 @@ import yaml from pathlib import Path from types import SimpleNamespace as config +from tqdm import tqdm CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") @@ -413,7 +414,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) page_list = [] - for page_num in range(len(pdf_reader.pages)): + for page_num in tqdm(range(len(pdf_reader.pages)), desc="Parsing PDF pages"): page = pdf_reader.pages[page_num] page_text = page.extract_text() token_length = len(enc.encode(page_text)) @@ -426,7 +427,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"): doc = pymupdf.open(pdf_path) page_list = [] - for page in doc: + for page in tqdm(doc, desc="Parsing PDF pages"): page_text = page.get_text() token_length = len(enc.encode(page_text)) page_list.append((page_text, token_length)) diff --git a/requirements.txt b/requirements.txt index ad43fe1..e2c32ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ PyPDF2==3.0.1 python-dotenv==1.1.0 tiktoken==0.7.0 pyyaml==6.0.2 +tqdm==4.66.2