From c9f4e100db8470835f657458571a9d9313a626d8 Mon Sep 17 00:00:00 2001 From: Alessandro Bouchs Date: Thu, 20 Nov 2025 12:16:23 +0000 Subject: [PATCH] lexicon generator class --- CHANGELOG.md | 1 + examples/lexicon_generator.py | 74 ++++++++ src/bigdata_research_tools/lexicon.py | 179 ++++++++++++++++++ src/bigdata_research_tools/prompts/lexicon.py | 32 ++++ 4 files changed, 286 insertions(+) create mode 100644 examples/lexicon_generator.py create mode 100644 src/bigdata_research_tools/lexicon.py create mode 100644 src/bigdata_research_tools/prompts/lexicon.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 441b067..5659126 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Preparation for a first stable release. accepts a `fiscal_year` parameter. The parameter can now be a single integer or a list of integers. When a list is provided, the workflow or function will search for the union of all values provided. +- Added a LexiconGenerator class to generate a list of keywords or sentences using an LLM. 
"""Example script: generate keyword and sentence lexicons with LexiconGenerator."""

import logging

from dotenv import load_dotenv

from bigdata_research_tools.lexicon import LexiconGenerator

# Load environment variables (API keys etc.) for authentication.
print(f"Environment variables loaded: {load_dotenv()}")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def _run_generation_test(title: str, main_theme: str, llm_base_config: str, mode: str):
    """Run one lexicon-generation pass and log its results.

    Args:
        title: Banner line logged before the run.
        main_theme: Theme to build the lexicon for.
        llm_base_config: LLM model identifier, e.g. "openai::gpt-4o-mini".
        mode: Either 'keywords' or 'sentences'.

    Returns:
        The generated lexicon items.
    """
    logger.info("=" * 60)
    logger.info(title)
    logger.info("=" * 60)
    lexicon_generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode=mode,
    )
    results = lexicon_generator.generate_lexicon()
    logger.info("Results: %s", results)
    return results


def test_keyword_generation(
    main_theme,
    llm_base_config: str = "openai::gpt-4o-mini",
):
    """Generate a keyword lexicon for ``main_theme``."""
    return _run_generation_test(
        "TEST 1: Keyword Generation with LLM", main_theme, llm_base_config, "keywords"
    )


def test_sentence_generation(
    main_theme,
    llm_base_config: str = "openai::o3-mini",
):
    """Generate a sentence lexicon for ``main_theme``."""
    return _run_generation_test(
        "TEST 2: Test Sentence Generation with LLM", main_theme, llm_base_config, "sentences"
    )


def main(main_theme="Crude Oil Market"):
    """Run all tests, re-raising any failure after logging it."""
    logger.info("Testing Lexicon Generation")
    logger.info("=" * 60)
    try:
        test_keyword_generation(main_theme)
        test_sentence_generation(main_theme)
        logger.info("=" * 60)
        logger.info("All tests completed successfully")
    except Exception as e:
        logger.error("Error during testing: %s", e)
        raise


if __name__ == "__main__":
    main()
"""Lexicon generation: build theme-specific keyword or sentence lists via an LLM."""

import json
import re
from logging import Logger, getLogger

from bigdata_research_tools.llm.base import (
    REASONING_MODELS,
    AsyncLLMEngine,
    LLMConfig,
    LLMEngine,
)
from bigdata_research_tools.llm.utils import run_concurrent_prompts, run_parallel_prompts
from bigdata_research_tools.prompts.lexicon import build_lexicon_prompt

logger: Logger = getLogger(__name__)

# Default sampling seeds: one LLM request is issued per seed and results merged.
DEFAULT_SEEDS = [123, 123456, 123456789]


class LexiconGenerator:
    """Generate a lexicon of keywords or sentences for a theme using an LLM."""

    def __init__(
        self,
        main_theme: str,
        llm_model_config: str | LLMConfig | dict,
        mode: str = "keywords",
        seeds: list | None = None,
    ):
        """
        Args:
            main_theme (str): Theme the lexicon should describe.
            llm_model_config (str | LLMConfig | dict): LLM configuration — a
                "provider::model" string, an LLMConfig, or a dict of LLMConfig kwargs.
            mode (str): 'keywords' or 'sentences'.
            seeds (list[int], optional): Seeds for LLM sampling; one request per
                seed. Defaults to [123, 123456, 123456789].
        """
        self.main_theme = main_theme
        self.mode = mode
        self.validate_mode()

        # Copy to avoid sharing a mutable default list across instances.
        self.seeds = list(seeds) if seeds is not None else list(DEFAULT_SEEDS)
        if isinstance(llm_model_config, dict):
            self.llm_model_config = LLMConfig(**llm_model_config)
        elif isinstance(llm_model_config, str):
            self.llm_model_config = self.get_default_lexicon_config(llm_model_config)
        else:
            self.llm_model_config = llm_model_config

    def validate_mode(self):
        """Validate the mode, raising ValueError for anything unsupported."""
        if self.mode not in ("keywords", "sentences"):
            raise ValueError("Mode must be either 'keywords' or 'sentences'.")

    def get_default_lexicon_config(self, model) -> LLMConfig:
        """Get default LLM model configuration for lexicon generation.

        Requests deterministic sampling (fixed seed, temperature 0 where
        supported) and a JSON-object response format.
        """
        if any(rm in model for rm in REASONING_MODELS):
            # Reasoning models take a reasoning_effort knob instead of
            # temperature/top_p sampling parameters.
            return LLMConfig(
                model=model,
                reasoning_effort="high",
                seed=42,
                response_format={"type": "json_object"},
            )
        return LLMConfig(
            model=model,
            temperature=0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=42,
            response_format={"type": "json_object"},
        )

    def generate_lexicon(
        self,
        timeout: int | None = 55,
        max_workers: int = 100,
    ) -> list[str]:
        """
        Generate the lexicon by prompting the LLM once per seed and merging results.

        Args:
            timeout: Timeout for each LLM request (concurrent path only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            De-duplicated list of generated lexicon items, order preserved.
        """
        system_prompt = build_lexicon_prompt(self.main_theme, mode=self.mode)
        responses = self._run_lexicon_prompts(
            system_prompt,
            timeout=timeout,
            max_workers=max_workers,
        )
        return self.flatten_responses(responses)

    def _run_lexicon_prompts(
        self,
        system_prompt: str,
        timeout: int | None,
        max_workers: int = 100,
    ) -> list[list[str]]:
        """
        Issue one prompt per seed and collect the parsed responses.

        Args:
            system_prompt: System prompt for the LLM.
            timeout: Timeout for each LLM request (concurrent calls only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            List of parsed responses (one list of items per seed). Previously
            this could fall through and return None (e.g. empty seeds on a
            non-bedrock provider); it now always returns a list.
        """
        if not self.seeds:
            return []

        llm_kwargs = self.llm_model_config.get_llm_kwargs(
            remove_max_tokens=True, remove_timeout=True
        )
        provider, _ = self.llm_model_config.model.split("::")
        prompts = [self.main_theme] * len(self.seeds)

        if provider == "bedrock" or len(self.seeds) == 1:
            # Single request or a provider without async support: run in parallel
            # threads via the synchronous engine.
            llm = LLMEngine(
                model=self.llm_model_config.model,
                **self.llm_model_config.connection_config,
            )
            return run_parallel_prompts(
                llm_engine=llm,
                prompts=prompts,
                system_prompt=system_prompt,
                processing_callbacks=[self.parse_lexicon_response],
                max_workers=max_workers,
                **llm_kwargs,
            )

        # Multiple seeds on an async-capable provider: fan out concurrently.
        llm = AsyncLLMEngine(
            model=self.llm_model_config.model,
            **self.llm_model_config.connection_config,
        )
        return run_concurrent_prompts(
            llm_engine=llm,
            prompts=prompts,
            system_prompt=system_prompt,
            timeout=timeout,
            processing_callbacks=[self.parse_lexicon_response],
            max_workers=max_workers,
            **llm_kwargs,
        )

    def parse_lexicon_response(self, response: str) -> list[str]:
        """
        Parse a single LLM response into a list of lexicon items.

        Strips optional markdown code fences (```/```json) from the edges only —
        the previous ``re.sub(r'json', '', ...)`` also deleted the substring
        "json" inside legitimate items — then reads the list stored under the
        key matching ``self.mode``.

        Args:
            response: Raw response string from the LLM.
        Returns:
            List of stripped, non-empty lexicon items; [] on malformed input.
        """
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", response.strip())
        try:
            parsed = json.loads(cleaned)[self.mode]
            return [item.strip() for item in parsed if item.strip()]
        except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
            # Malformed JSON, missing mode key, or non-string items: log and skip.
            logger.error(f"Error deserializing response: {response}")
            return []

    def flatten_responses(self, responses: list[list[str]]) -> list[str]:
        """
        Flatten the per-seed responses into a unique list of items.

        Args:
            responses: List of lists of lexicon items from the LLM.
        Returns:
            List of unique lexicon items, preserving first-seen order.
        """
        seen: set[str] = set()
        unique_items: list[str] = []
        for response_list in responses:
            for item in response_list:
                if item not in seen:
                    seen.add(item)
                    unique_items.append(item)
        return unique_items
# Prompt templates for lexicon generation, keyed by mode. Placeholders
# {theme} and {mode} are substituted by build_lexicon_prompt.
_KEYWORDS_TEMPLATE = """You are an expert tasked with generating a lexicon of the most important and relevant keywords specific to the {theme}.

Your goal is to compile a list of terms that are critical for understanding and analyzing the {theme}. This lexicon should include only the most essential keywords, phrases, and abbreviations that are directly associated with {theme} topics, analysis, logistics, and industry reporting.

Guidelines:
1. Focus on relevance: Include only the most important and commonly used keywords that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple phrases. Include the main term (e.g., "{theme}") only as a standalone term, and focus on other specific terms without redundant repetition.
3. Strict exclusion of generic terms: Exclude any terms that are generic or broadly used across different fields, such as "Arbitrage," "Hedge," "Liquidity," or "Futures Contract," even if they have a specific meaning within the context of {theme}. Only include terms that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full term and its abbreviation as separate entries. For example, instead of "Zero Lower Bound (ZLB)", list "Zero Lower Bound" and "ZLB" as separate keywords.
5. Ensure clarity: Each keyword should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of keywords. Focus solely on the most crucial terms without padding the list. If fewer keywords meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant keywords related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full terms and abbreviations listed separately.
"""

_SENTENCES_TEMPLATE = """You are an expert tasked with generating a lexicon of the most important and relevant sentences specific to the {theme}.

Your goal is to compile a list of concise, informative sentences that are critical for understanding and analyzing the {theme}. Each sentence should capture a unique aspect, mechanism, or implication of the theme.

Guidelines:
1. Focus on relevance: Include only the most important and commonly discussed sentences that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple sentences. Include the main term (e.g., "{theme}") only as a standalone sentence, and focus on other specific sentences without redundant repetition.
3. Strict exclusion of generic sentences: Exclude any sentences that are generic or broadly used across different fields, even if they have a specific meaning within the context of {theme}. Only include sentences that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full sentence and its abbreviation as separate entries.
5. Ensure clarity: Each sentence should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of sentences. Focus solely on the most crucial sentences without padding the list. If fewer sentences meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant sentences related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full sentences.
"""


def build_lexicon_prompt(theme, mode='keywords'):
    """Build the system prompt for lexicon generation.

    Args:
        theme: Theme the lexicon should describe; substituted for {theme}.
        mode: 'keywords' selects the keyword prompt; any other value
            selects the sentence prompt (matching the original branching).

    Returns:
        The stripped, fully substituted prompt string.
    """
    template = _KEYWORDS_TEMPLATE if mode == "keywords" else _SENTENCES_TEMPLATE
    return template.strip().format(theme=theme, mode=mode)