Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Preparation for a first stable release.
accepts a `fiscal_year` parameter. The parameter can now be a single integer or a list
of integers. When a list is provided, the workflow or function will search for the union
of all values provided.
- Added a LexiconGenerator class to generate a list of keywords or sentences using an LLM.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be moved to a new release, v1.0.0 has already been merged


### Changed
- Refactor the `themes` submodule to `tree`, allowing for a more generic tree structure
Expand Down
74 changes: 74 additions & 0 deletions examples/lexicon_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import logging

from dotenv import load_dotenv

from bigdata_research_tools.lexicon import LexiconGenerator

# Load environment variables for authentication
env_loaded = load_dotenv()
print(f"Environment variables loaded: {env_loaded}")

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def test_keyword_generation(
    main_theme,
    llm_base_config: str = "openai::gpt-4o-mini",
):
    """Smoke-test keyword-mode lexicon generation for the given theme."""
    separator = "=" * 60
    logger.info(separator)
    logger.info("TEST 1: Keyword Generation with LLM")
    logger.info(separator)

    generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode="keywords",
    )
    keyword_results = generator.generate_lexicon()
    logger.info("Results: %s", keyword_results)

def test_sentence_generation(
    main_theme,
    llm_base_config: str = "openai::o3-mini",
):
    """Smoke-test sentence-mode lexicon generation for the given theme."""
    separator = "=" * 60
    logger.info(separator)
    logger.info("TEST 2: Test Sentence Generation with LLM")
    logger.info(separator)

    generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode="sentences",
    )
    sentence_results = generator.generate_lexicon()
    logger.info("Results: %s", sentence_results)

def main(main_theme="Crude Oil Market"):
    """Run all tests."""
    logger.info("Testing Lexicon Generation")
    logger.info("=" * 60)
    try:
        test_keyword_generation(main_theme)
        test_sentence_generation(main_theme)
    except Exception as exc:
        # Log the failure, then propagate it to the caller unchanged.
        logger.error("Error during testing: %s", exc)
        raise
    else:
        logger.info("=" * 60)
        logger.info("All tests completed successfully")


if __name__ == "__main__":
    main()
179 changes: 179 additions & 0 deletions src/bigdata_research_tools/lexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import json
import re
from logging import Logger, getLogger

from bigdata_research_tools.llm.base import (
    REASONING_MODELS,
    AsyncLLMEngine,
    LLMConfig,
    LLMEngine,
)
from bigdata_research_tools.llm.utils import (
    run_concurrent_prompts,
    run_parallel_prompts,
)
from bigdata_research_tools.prompts.lexicon import build_lexicon_prompt

# Module-level logger named after this module. All imports are grouped at the
# top of the file (PEP 8) instead of being interleaved with code.
logger: Logger = getLogger(__name__)

class LexiconGenerator:
    """Generate a lexicon of keywords or sentences for a theme using an LLM.

    The configured LLM is prompted once per seed and the parsed responses are
    merged into a single order-preserving, de-duplicated list of items.
    """

    # Seeds used when the caller does not provide any (tuple so the shared
    # class-level default cannot be mutated).
    DEFAULT_SEEDS = (123, 123456, 123456789)

    def __init__(
        self,
        main_theme: str,
        llm_model_config: "str | LLMConfig | dict",
        mode: str = "keywords",
        seeds: "list | None" = None,
    ):
        """
        Args:
            main_theme: Theme the lexicon should describe (e.g. "Crude Oil Market").
            llm_model_config: LLM configuration: a "provider::model" string,
                an LLMConfig instance, or a dict of LLMConfig kwargs.
            mode: 'keywords' or 'sentences'.
            seeds: Seeds for LLM sampling. Defaults to [123, 123456, 123456789].
                (None in the signature to avoid a mutable default argument.)
        """
        self.main_theme = main_theme
        self.mode = mode
        self.validate_mode()

        # Copy the caller's list so later external mutation cannot change ours.
        self.seeds = list(seeds) if seeds is not None else list(self.DEFAULT_SEEDS)

        if isinstance(llm_model_config, dict):
            self.llm_model_config = LLMConfig(**llm_model_config)
        elif isinstance(llm_model_config, str):
            self.llm_model_config = self.get_default_lexicon_config(llm_model_config)
        else:
            self.llm_model_config = llm_model_config

    def validate_mode(self) -> None:
        """Raise ValueError unless mode is 'keywords' or 'sentences'."""
        if self.mode not in ['keywords', 'sentences']:
            raise ValueError("Mode must be either 'keywords' or 'sentences'.")

    def get_default_lexicon_config(self, model: str) -> "LLMConfig":
        """Build the default LLMConfig for lexicon generation.

        Reasoning models get a high reasoning-effort setting; all other models
        get deterministic sampling parameters (temperature 0, top_p 1). Both
        request a JSON-object response so the output can be parsed directly.
        """
        if any(rm in model for rm in REASONING_MODELS):
            return LLMConfig(
                model=model,
                reasoning_effort="high",
                seed=42,
                response_format={"type": "json_object"},
            )
        return LLMConfig(
            model=model,
            temperature=0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=42,
            response_format={"type": "json_object"},
        )

    def generate_lexicon(
        self,
        timeout: "int | None" = 55,
        max_workers: int = 100,
    ) -> "list[str]":
        """
        Generate the lexicon by prompting the LLM once per seed.

        Args:
            timeout: Timeout for each LLM request.
            max_workers: Maximum number of concurrent workers.

        Returns:
            Unique generated lexicon items, in first-seen order.
        """
        system_prompt = build_lexicon_prompt(self.main_theme, mode=self.mode)
        responses = self._run_lexicon_prompts(
            system_prompt,
            timeout=timeout,
            max_workers=max_workers,
        )
        return self.flatten_responses(responses)

    def _run_lexicon_prompts(
        self,
        system_prompt: str,
        timeout: "int | None",
        max_workers: int = 100,
    ):
        """
        Run one LLM request per seed and return the parsed responses.

        Args:
            system_prompt: System prompt for the LLM.
            timeout: Timeout per request (used on the concurrent path only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            Parsed responses, one per seed, produced by parse_lexicon_response.
            NOTE(review): the original hints disagreed (list vs dict) — confirm
            against run_concurrent_prompts / run_parallel_prompts.
        """
        llm_kwargs = self.llm_model_config.get_llm_kwargs(
            remove_max_tokens=True, remove_timeout=True
        )

        # Provider prefix of "provider::model". maxsplit + indexing avoids an
        # unpacking ValueError when the separator is missing.
        provider = self.llm_model_config.model.split("::", 1)[0]
        prompts = [self.main_theme] * len(self.seeds)

        if provider != 'bedrock' and len(self.seeds) > 1:
            # Fan the seeded requests out concurrently.
            llm = AsyncLLMEngine(
                model=self.llm_model_config.model,
                **self.llm_model_config.connection_config,
            )
            return run_concurrent_prompts(
                llm_engine=llm,
                prompts=prompts,
                system_prompt=system_prompt,
                timeout=timeout,
                processing_callbacks=[self.parse_lexicon_response],
                max_workers=max_workers,
                **llm_kwargs,
            )

        # Bedrock (or a single seed) runs through the synchronous engine.
        # Previously an empty seed list fell through both branches and
        # implicitly returned None; it now takes this path instead.
        llm = LLMEngine(
            model=self.llm_model_config.model,
            **self.llm_model_config.connection_config,
        )
        return run_parallel_prompts(
            llm_engine=llm,
            prompts=prompts,
            system_prompt=system_prompt,
            processing_callbacks=[self.parse_lexicon_response],
            max_workers=max_workers,
            **llm_kwargs,
        )

    def parse_lexicon_response(self, response: str) -> "list[str]":
        """
        Parse a JSON LLM response into a list of lexicon items.

        Optional markdown code fences (``` or ```json) are stripped before
        decoding; items are whitespace-trimmed and empty items dropped.

        Args:
            response: Response string from the LLM.
        Returns:
            List of lexicon items, or [] when the payload cannot be parsed.
        """
        # Remove only the fence markers. The previous approach deleted every
        # occurrence of the substring "json", corrupting items containing it.
        cleaned = response.strip()
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned)
        try:
            items = json.loads(cleaned)[self.mode]
            return [item.strip() for item in items if item.strip()]
        except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
            # Missing mode key / non-list payload / non-string items are as
            # likely as malformed JSON; treat them all as a parse failure.
            logger.error(f"Error deserializing response: {response}")
            return []

    def flatten_responses(self, responses: "list[list[str]]") -> "list[str]":
        """
        Flatten the responses from the LLM into a unique list of items.

        Args:
            responses: List of lists of lexicon items from the LLM.
        Returns:
            List of unique lexicon items, preserving first-seen order.
        """
        seen = set()
        unique_items = []
        for response_list in responses:
            for item in response_list:
                if item not in seen:
                    seen.add(item)
                    unique_items.append(item)
        return unique_items




32 changes: 32 additions & 0 deletions src/bigdata_research_tools/prompts/lexicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
def build_lexicon_prompt(theme: str, mode: str = 'keywords') -> str:
    """Build the system prompt used for lexicon generation.

    Args:
        theme: The main theme the lexicon should cover (e.g. "Crude Oil Market").
        mode: Either 'keywords' or 'sentences'; selects the prompt template.

    Returns:
        The prompt template with `theme` and `mode` substituted.

    Raises:
        ValueError: If `mode` is neither 'keywords' nor 'sentences'.
    """
    # Fail fast on an unknown mode instead of silently falling back to the
    # sentence template, consistent with LexiconGenerator.validate_mode().
    if mode not in ('keywords', 'sentences'):
        raise ValueError("Mode must be either 'keywords' or 'sentences'.")
    if mode == "keywords":
        prompt = """You are an expert tasked with generating a lexicon of the most important and relevant keywords specific to the {theme}.

Your goal is to compile a list of terms that are critical for understanding and analyzing the {theme}. This lexicon should include only the most essential keywords, phrases, and abbreviations that are directly associated with {theme} topics, analysis, logistics, and industry reporting.

Guidelines:
1. Focus on relevance: Include only the most important and commonly used keywords that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple phrases. Include the main term (e.g., "{theme}") only as a standalone term, and focus on other specific terms without redundant repetition.
3. Strict exclusion of generic terms: Exclude any terms that are generic or broadly used across different fields, such as "Arbitrage," "Hedge," "Liquidity," or "Futures Contract," even if they have a specific meaning within the context of {theme}. Only include terms that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full term and its abbreviation as separate entries. For example, instead of "Zero Lower Bound (ZLB)", list "Zero Lower Bound" and "ZLB" as separate keywords.
5. Ensure clarity: Each keyword should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of keywords. Focus solely on the most crucial terms without padding the list. If fewer keywords meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant keywords related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full terms and abbreviations listed separately.
"""
    else:
        prompt = """You are an expert tasked with generating a lexicon of the most important and relevant sentences specific to the {theme}.

Your goal is to compile a list of concise, informative sentences that are critical for understanding and analyzing the {theme}. Each sentence should capture a unique aspect, mechanism, or implication of the theme.

Guidelines:
1. Focus on relevance: Include only the most important and commonly discussed sentences that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple sentences. Include the main term (e.g., "{theme}") only as a standalone sentence, and focus on other specific sentences without redundant repetition.
3. Strict exclusion of generic sentences: Exclude any sentences that are generic or broadly used across different fields, even if they have a specific meaning within the context of {theme}. Only include sentences that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full sentence and its abbreviation as separate entries.
5. Ensure clarity: Each sentence should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of sentences. Focus solely on the most crucial sentences without padding the list. If fewer sentences meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant sentences related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full sentences.
"""
    return prompt.strip().format(theme=theme, mode=mode)
Loading