From c9f4e100db8470835f657458571a9d9313a626d8 Mon Sep 17 00:00:00 2001 From: Alessandro Bouchs Date: Thu, 20 Nov 2025 12:16:23 +0000 Subject: [PATCH] lexicon generator class --- CHANGELOG.md | 1 + examples/lexicon_generator.py | 74 ++++++++ src/bigdata_research_tools/lexicon.py | 179 ++++++++++++++++++ src/bigdata_research_tools/prompts/lexicon.py | 32 ++++ 4 files changed, 286 insertions(+) create mode 100644 examples/lexicon_generator.py create mode 100644 src/bigdata_research_tools/lexicon.py create mode 100644 src/bigdata_research_tools/prompts/lexicon.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 441b067..5659126 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Preparation for a first stable release. accepts a `fiscal_year` parameter. The parameter can now be a single integer or a list of integers. When a list is provided, the workflow or function will search for the union of all values provided. +- Added a LexiconGenerator class to generate a list of keywords or sentences using an LLM. 
"""Example script: generate keyword and sentence lexicons with LexiconGenerator."""

import logging

from dotenv import load_dotenv

from bigdata_research_tools.lexicon import LexiconGenerator

# Load environment variables (API keys etc.) for authentication.
print(f"Environment variables loaded: {load_dotenv()}")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def _run_generation_test(title: str, main_theme: str, llm_base_config: str, mode: str):
    """Run one lexicon-generation pass and log its results.

    Args:
        title: Banner line logged before the run.
        main_theme: Theme to build the lexicon for.
        llm_base_config: LLM model identifier, e.g. "openai::gpt-4o-mini".
        mode: Either 'keywords' or 'sentences'.

    Returns:
        The generated lexicon items.
    """
    logger.info("=" * 60)
    logger.info(title)
    logger.info("=" * 60)
    lexicon_generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode=mode,
    )
    results = lexicon_generator.generate_lexicon()
    logger.info("Results: %s", results)
    return results


def test_keyword_generation(
    main_theme,
    llm_base_config: str = "openai::gpt-4o-mini",
):
    """Generate a keyword lexicon for ``main_theme``."""
    return _run_generation_test(
        "TEST 1: Keyword Generation with LLM", main_theme, llm_base_config, "keywords"
    )


def test_sentence_generation(
    main_theme,
    llm_base_config: str = "openai::o3-mini",
):
    """Generate a sentence lexicon for ``main_theme``."""
    return _run_generation_test(
        "TEST 2: Test Sentence Generation with LLM", main_theme, llm_base_config, "sentences"
    )


def main(main_theme="Crude Oil Market"):
    """Run all tests, re-raising any failure after logging it."""
    logger.info("Testing Lexicon Generation")
    logger.info("=" * 60)
    try:
        test_keyword_generation(main_theme)
        test_sentence_generation(main_theme)
        logger.info("=" * 60)
        logger.info("All tests completed successfully")
    except Exception as e:
        logger.error("Error during testing: %s", e)
        raise


if __name__ == "__main__":
    main()
"""Lexicon generation: build theme-specific keyword or sentence lists via an LLM."""

import json
import re
from logging import Logger, getLogger

from bigdata_research_tools.llm.base import (
    REASONING_MODELS,
    AsyncLLMEngine,
    LLMConfig,
    LLMEngine,
)
from bigdata_research_tools.llm.utils import run_concurrent_prompts, run_parallel_prompts
from bigdata_research_tools.prompts.lexicon import build_lexicon_prompt

logger: Logger = getLogger(__name__)

# Default sampling seeds: one LLM request is issued per seed and results merged.
DEFAULT_SEEDS = [123, 123456, 123456789]


class LexiconGenerator:
    """Generate a lexicon of keywords or sentences for a theme using an LLM."""

    def __init__(
        self,
        main_theme: str,
        llm_model_config: str | LLMConfig | dict,
        mode: str = "keywords",
        seeds: list | None = None,
    ):
        """
        Args:
            main_theme (str): Theme the lexicon should describe.
            llm_model_config (str | LLMConfig | dict): LLM configuration — a
                "provider::model" string, an LLMConfig, or a dict of LLMConfig kwargs.
            mode (str): 'keywords' or 'sentences'.
            seeds (list[int], optional): Seeds for LLM sampling; one request per
                seed. Defaults to [123, 123456, 123456789].
        """
        self.main_theme = main_theme
        self.mode = mode
        self.validate_mode()

        # Copy to avoid sharing a mutable default list across instances.
        self.seeds = list(seeds) if seeds is not None else list(DEFAULT_SEEDS)
        if isinstance(llm_model_config, dict):
            self.llm_model_config = LLMConfig(**llm_model_config)
        elif isinstance(llm_model_config, str):
            self.llm_model_config = self.get_default_lexicon_config(llm_model_config)
        else:
            self.llm_model_config = llm_model_config

    def validate_mode(self):
        """Validate the mode, raising ValueError for anything unsupported."""
        if self.mode not in ("keywords", "sentences"):
            raise ValueError("Mode must be either 'keywords' or 'sentences'.")

    def get_default_lexicon_config(self, model) -> LLMConfig:
        """Get default LLM model configuration for lexicon generation.

        Requests deterministic sampling (fixed seed, temperature 0 where
        supported) and a JSON-object response format.
        """
        if any(rm in model for rm in REASONING_MODELS):
            # Reasoning models take a reasoning_effort knob instead of
            # temperature/top_p sampling parameters.
            return LLMConfig(
                model=model,
                reasoning_effort="high",
                seed=42,
                response_format={"type": "json_object"},
            )
        return LLMConfig(
            model=model,
            temperature=0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=42,
            response_format={"type": "json_object"},
        )

    def generate_lexicon(
        self,
        timeout: int | None = 55,
        max_workers: int = 100,
    ) -> list[str]:
        """
        Generate the lexicon by prompting the LLM once per seed and merging results.

        Args:
            timeout: Timeout for each LLM request (concurrent path only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            De-duplicated list of generated lexicon items, order preserved.
        """
        system_prompt = build_lexicon_prompt(self.main_theme, mode=self.mode)
        responses = self._run_lexicon_prompts(
            system_prompt,
            timeout=timeout,
            max_workers=max_workers,
        )
        return self.flatten_responses(responses)

    def _run_lexicon_prompts(
        self,
        system_prompt: str,
        timeout: int | None,
        max_workers: int = 100,
    ) -> list[list[str]]:
        """
        Issue one prompt per seed and collect the parsed responses.

        Args:
            system_prompt: System prompt for the LLM.
            timeout: Timeout for each LLM request (concurrent calls only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            List of parsed responses (one list of items per seed). Previously
            this could fall through and return None (e.g. empty seeds on a
            non-bedrock provider); it now always returns a list.
        """
        if not self.seeds:
            return []

        llm_kwargs = self.llm_model_config.get_llm_kwargs(
            remove_max_tokens=True, remove_timeout=True
        )
        provider, _ = self.llm_model_config.model.split("::")
        prompts = [self.main_theme] * len(self.seeds)

        if provider == "bedrock" or len(self.seeds) == 1:
            # Single request or a provider without async support: run in parallel
            # threads via the synchronous engine.
            llm = LLMEngine(
                model=self.llm_model_config.model,
                **self.llm_model_config.connection_config,
            )
            return run_parallel_prompts(
                llm_engine=llm,
                prompts=prompts,
                system_prompt=system_prompt,
                processing_callbacks=[self.parse_lexicon_response],
                max_workers=max_workers,
                **llm_kwargs,
            )

        # Multiple seeds on an async-capable provider: fan out concurrently.
        llm = AsyncLLMEngine(
            model=self.llm_model_config.model,
            **self.llm_model_config.connection_config,
        )
        return run_concurrent_prompts(
            llm_engine=llm,
            prompts=prompts,
            system_prompt=system_prompt,
            timeout=timeout,
            processing_callbacks=[self.parse_lexicon_response],
            max_workers=max_workers,
            **llm_kwargs,
        )

    def parse_lexicon_response(self, response: str) -> list[str]:
        """
        Parse a single LLM response into a list of lexicon items.

        Strips optional markdown code fences (```/```json) from the edges only —
        the previous ``re.sub(r'json', '', ...)`` also deleted the substring
        "json" inside legitimate items — then reads the list stored under the
        key matching ``self.mode``.

        Args:
            response: Raw response string from the LLM.
        Returns:
            List of stripped, non-empty lexicon items; [] on malformed input.
        """
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", response.strip())
        try:
            parsed = json.loads(cleaned)[self.mode]
            return [item.strip() for item in parsed if item.strip()]
        except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
            # Malformed JSON, missing mode key, or non-string items: log and skip.
            logger.error(f"Error deserializing response: {response}")
            return []

    def flatten_responses(self, responses: list[list[str]]) -> list[str]:
        """
        Flatten the per-seed responses into a unique list of items.

        Args:
            responses: List of lists of lexicon items from the LLM.
        Returns:
            List of unique lexicon items, preserving first-seen order.
        """
        seen: set[str] = set()
        unique_items: list[str] = []
        for response_list in responses:
            for item in response_list:
                if item not in seen:
                    seen.add(item)
                    unique_items.append(item)
        return unique_items
# Prompt templates for lexicon generation, keyed by mode. Placeholders
# {theme} and {mode} are substituted by build_lexicon_prompt.
_KEYWORDS_TEMPLATE = """You are an expert tasked with generating a lexicon of the most important and relevant keywords specific to the {theme}.

Your goal is to compile a list of terms that are critical for understanding and analyzing the {theme}. This lexicon should include only the most essential keywords, phrases, and abbreviations that are directly associated with {theme} topics, analysis, logistics, and industry reporting.

Guidelines:
1. Focus on relevance: Include only the most important and commonly used keywords that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple phrases. Include the main term (e.g., "{theme}") only as a standalone term, and focus on other specific terms without redundant repetition.
3. Strict exclusion of generic terms: Exclude any terms that are generic or broadly used across different fields, such as "Arbitrage," "Hedge," "Liquidity," or "Futures Contract," even if they have a specific meaning within the context of {theme}. Only include terms that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full term and its abbreviation as separate entries. For example, instead of "Zero Lower Bound (ZLB)", list "Zero Lower Bound" and "ZLB" as separate keywords.
5. Ensure clarity: Each keyword should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of keywords. Focus solely on the most crucial terms without padding the list. If fewer keywords meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant keywords related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full terms and abbreviations listed separately.
"""

_SENTENCES_TEMPLATE = """You are an expert tasked with generating a lexicon of the most important and relevant sentences specific to the {theme}.

Your goal is to compile a list of concise, informative sentences that are critical for understanding and analyzing the {theme}. Each sentence should capture a unique aspect, mechanism, or implication of the theme.

Guidelines:
1. Focus on relevance: Include only the most important and commonly discussed sentences that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple sentences. Include the main term (e.g., "{theme}") only as a standalone sentence, and focus on other specific sentences without redundant repetition.
3. Strict exclusion of generic sentences: Exclude any sentences that are generic or broadly used across different fields, even if they have a specific meaning within the context of {theme}. Only include sentences that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full sentence and its abbreviation as separate entries.
5. Ensure clarity: Each sentence should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of sentences. Focus solely on the most crucial sentences without padding the list. If fewer sentences meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant sentences related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full sentences.
"""


def build_lexicon_prompt(theme, mode='keywords'):
    """Build the system prompt for lexicon generation.

    Args:
        theme: Theme the lexicon should describe; substituted for {theme}.
        mode: 'keywords' selects the keyword prompt; any other value
            selects the sentence prompt (matching the original branching).

    Returns:
        The stripped, fully substituted prompt string.
    """
    template = _KEYWORDS_TEMPLATE if mode == "keywords" else _SENTENCES_TEMPLATE
    return template.strip().format(theme=theme, mode=mode)