lexicon generator class #30
Open

abouchs-ravenpack wants to merge 1 commit into main from lexicon-generator
@@ -0,0 +1,74 @@ (new file: test script for LexiconGenerator)

```python
import logging

from dotenv import load_dotenv

from bigdata_research_tools.lexicon import LexiconGenerator

# Load environment variables for authentication
print(f"Environment variables loaded: {load_dotenv()}")

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def test_keyword_generation(
    main_theme,
    llm_base_config: str = "openai::gpt-4o-mini",
):
    logger.info("=" * 60)
    logger.info("TEST 1: Keyword Generation with LLM")
    logger.info("=" * 60)
    lexicon_generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode="keywords",
    )

    results = lexicon_generator.generate_lexicon()

    logger.info("Results: %s", results)


def test_sentence_generation(
    main_theme,
    llm_base_config: str = "openai::o3-mini",
):
    logger.info("=" * 60)
    logger.info("TEST 2: Sentence Generation with LLM")
    logger.info("=" * 60)
    lexicon_generator = LexiconGenerator(
        main_theme=main_theme,
        llm_model_config=llm_base_config,
        mode="sentences",
    )

    results = lexicon_generator.generate_lexicon()

    logger.info("Results: %s", results)


def main(main_theme="Crude Oil Market"):
    """Run all tests."""
    logger.info("Testing Lexicon Generation")
    logger.info("=" * 60)

    try:
        test_keyword_generation(main_theme)
        test_sentence_generation(main_theme)

        logger.info("=" * 60)
        logger.info("All tests completed successfully")
    except Exception as e:
        logger.error("Error during testing: %s", e)
        raise


if __name__ == "__main__":
    main()
```
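For reference, the `llm_base_config` strings above follow a `provider::model` convention; `LexiconGenerator` splits on `::` to choose an engine (see `_run_lexicon_prompts` in the class below). A minimal sketch of that parsing, using one of the config strings from this script:

```python
# The split below mirrors what LexiconGenerator._run_lexicon_prompts does
# with a "provider::model" config string.
provider, model_name = "openai::gpt-4o-mini".split("::")
assert provider == "openai"
assert model_name == "gpt-4o-mini"
```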
@@ -0,0 +1,179 @@ (new file: the LexiconGenerator class)

```python
import json
import re
from logging import Logger, getLogger

from bigdata_research_tools.llm.base import (
    REASONING_MODELS,
    AsyncLLMEngine,
    LLMConfig,
    LLMEngine,
)
from bigdata_research_tools.llm.utils import run_concurrent_prompts, run_parallel_prompts
from bigdata_research_tools.prompts.lexicon import build_lexicon_prompt

logger: Logger = getLogger(__name__)


class LexiconGenerator:

    def __init__(
        self,
        main_theme: str,
        llm_model_config: str | LLMConfig | dict,
        mode: str = "keywords",
        seeds: list | None = None,
    ):
        """
        Args:
            main_theme (str): Theme to build the lexicon for.
            llm_model_config (str | LLMConfig | dict): LLM configuration
                (model, api_key, etc.), or a "provider::model" string.
            mode (str): 'keywords' or 'sentences'.
            seeds (list[int], optional): Seeds for LLM sampling.
                Defaults to [123, 123456, 123456789].
        """
        self.main_theme = main_theme
        self.mode = mode
        self.validate_mode()

        # Resolve the seed list here to avoid a mutable default argument.
        self.seeds = seeds if seeds is not None else [123, 123456, 123456789]
        if isinstance(llm_model_config, dict):
            self.llm_model_config = LLMConfig(**llm_model_config)
        elif isinstance(llm_model_config, str):
            self.llm_model_config = self.get_default_lexicon_config(llm_model_config)
        else:
            self.llm_model_config = llm_model_config

    def validate_mode(self):
        """Validate the mode."""
        if self.mode not in ("keywords", "sentences"):
            raise ValueError("Mode must be either 'keywords' or 'sentences'.")

    def get_default_lexicon_config(self, model: str) -> LLMConfig:
        """Get the default LLM model configuration for lexicon generation."""
        if any(rm in model for rm in REASONING_MODELS):
            return LLMConfig(
                model=model,
                reasoning_effort="high",
                seed=42,
                response_format={"type": "json_object"},
            )
        return LLMConfig(
            model=model,
            temperature=0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            seed=42,
            response_format={"type": "json_object"},
        )

    def generate_lexicon(
        self,
        timeout: int | None = 55,
        max_workers: int = 100,
    ) -> list[str]:
        """
        Generate a lexicon using LLM prompts.

        Args:
            timeout: Timeout for each LLM request.
            max_workers: Maximum number of concurrent workers.

        Returns:
            List of unique generated lexicon items.
        """
        system_prompt = build_lexicon_prompt(self.main_theme, mode=self.mode)
        responses = self._run_lexicon_prompts(
            system_prompt,
            timeout=timeout,
            max_workers=max_workers,
        )
        return self.flatten_responses(responses)

    def _run_lexicon_prompts(
        self,
        system_prompt: str,
        timeout: int | None,
        max_workers: int = 100,
    ) -> list[list[str]]:
        """
        Run the lexicon prompts and collect the parsed responses.

        Args:
            system_prompt: System prompt for the LLM.
            timeout: Timeout for each LLM request (concurrent calls only).
            max_workers: Maximum number of concurrent workers.

        Returns:
            List of parsed responses from the LLM, one list per seed.
        """
        llm_kwargs = self.llm_model_config.get_llm_kwargs(
            remove_max_tokens=True, remove_timeout=True
        )

        provider, _ = self.llm_model_config.model.split("::")

        # Bedrock (or a single seed) uses the synchronous engine in parallel;
        # every other case fans out concurrently via the async engine.
        if provider == "bedrock" or len(self.seeds) == 1:
            llm = LLMEngine(
                model=self.llm_model_config.model,
                **self.llm_model_config.connection_config,
            )
            return run_parallel_prompts(
                llm_engine=llm,
                prompts=[self.main_theme] * len(self.seeds),
                system_prompt=system_prompt,
                processing_callbacks=[self.parse_lexicon_response],
                max_workers=max_workers,
                **llm_kwargs,
            )
        llm = AsyncLLMEngine(
            model=self.llm_model_config.model,
            **self.llm_model_config.connection_config,
        )
        return run_concurrent_prompts(
            llm_engine=llm,
            prompts=[self.main_theme] * len(self.seeds),
            system_prompt=system_prompt,
            timeout=timeout,
            processing_callbacks=[self.parse_lexicon_response],
            max_workers=max_workers,
            **llm_kwargs,
        )

    def parse_lexicon_response(self, response: str) -> list[str]:
        """
        Parse the response from the LLM model used for lexicon generation.

        Args:
            response: Response string from the LLM.
        Returns:
            List of lexicon items.
        """
        try:
            # Strip Markdown code fences (with or without a "json" language
            # tag) without touching occurrences of "json" in the payload.
            response = re.sub(r"```(?:json)?", "", response)
            parsed = json.loads(response)[self.mode]
            return [item.strip() for item in parsed if item.strip()]
        except (json.JSONDecodeError, KeyError):
            logger.error(f"Error deserializing response: {response}")
            return []

    def flatten_responses(self, responses: list[list[str]]) -> list[str]:
        """
        Flatten the responses from the LLM into a unique list of items.

        Args:
            responses: List of lists of lexicon items from the LLM.
        Returns:
            List of unique lexicon items, preserving order.
        """
        seen = set()
        unique_items = []
        for response_list in responses:
            for item in response_list:
                if item not in seen:
                    seen.add(item)
                    unique_items.append(item)
        return unique_items
```
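To make the parsing and deduplication contract concrete, here is a minimal sketch with a hypothetical raw LLM response. It assumes `bigdata_research_tools` is importable; constructing the generator only builds a config, and both methods below operate on local data, so no LLM calls are made:

```python
gen = LexiconGenerator(
    main_theme="Crude Oil Market",
    llm_model_config="openai::gpt-4o-mini",
    mode="keywords",
)

# Hypothetical raw response; any Markdown code fences around the payload
# would be stripped by parse_lexicon_response before JSON decoding.
raw = '{"keywords": ["Brent Crude", "WTI", "OPEC", "WTI"]}'
items = gen.parse_lexicon_response(raw)  # ["Brent Crude", "WTI", "OPEC", "WTI"]

# flatten_responses dedupes across the per-seed lists, preserving order.
print(gen.flatten_responses([items]))    # ["Brent Crude", "WTI", "OPEC"]
```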
@@ -0,0 +1,32 @@ (new file: lexicon prompt builder)

```python
def build_lexicon_prompt(theme, mode="keywords"):
    """Build the system prompt for lexicon generation in the given mode."""
    if mode == "keywords":
        prompt = """You are an expert tasked with generating a lexicon of the most important and relevant keywords specific to the {theme}.

Your goal is to compile a list of terms that are critical for understanding and analyzing the {theme}. This lexicon should include only the most essential keywords, phrases, and abbreviations that are directly associated with {theme} topics, analysis, logistics, and industry reporting.

Guidelines:
1. Focus on relevance: Include only the most important and commonly used keywords that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple phrases. Include the main term (e.g., "{theme}") only as a standalone term, and focus on other specific terms without redundant repetition.
3. Strict exclusion of generic terms: Exclude any terms that are generic or broadly used across different fields, such as "Arbitrage," "Hedge," "Liquidity," or "Futures Contract," even if they have a specific meaning within the context of {theme}. Only include terms that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full term and its abbreviation as separate entries. For example, instead of "Zero Lower Bound (ZLB)", list "Zero Lower Bound" and "ZLB" as separate keywords.
5. Ensure clarity: Each keyword should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of keywords. Focus solely on the most crucial terms without padding the list. If fewer keywords meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant keywords related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full terms and abbreviations listed separately.
"""
    else:
        prompt = """You are an expert tasked with generating a lexicon of the most important and relevant sentences specific to the {theme}.

Your goal is to compile a list of concise, informative sentences that are critical for understanding and analyzing the {theme}. Each sentence should capture a unique aspect, mechanism, or implication of the theme.

Guidelines:
1. Focus on relevance: Include only the most important and commonly discussed sentences that are uniquely tied to the {theme}. These should reflect key concepts, industry-specific mechanisms, benchmarks, logistical aspects, and terminology that are central to the theme.
2. Avoid redundancy: Do not repeat the primary terms of the theme in multiple sentences. Include the main term (e.g., "{theme}") only as a standalone sentence, and focus on other specific sentences without redundant repetition.
3. Strict exclusion of generic sentences: Exclude any sentences that are generic or broadly used across different fields, even if they have a specific meaning within the context of {theme}. Only include sentences that are uniquely relevant to {theme} and cannot be applied broadly.
4. Include specific variations: Where applicable, provide both the full form and common abbreviations relevant to the {theme}. Present the full sentence and its abbreviation as separate entries.
5. Ensure clarity: Each sentence should be concise, clear, and directly relevant to the {theme}, avoiding any ambiguity.
6. Select only the most critical: There is no need to reach a specific number of sentences. Focus solely on the most crucial sentences without padding the list. If fewer sentences meet the criteria, that is acceptable.

The output should be a lexicon of only the most critical and uniquely relevant sentences related to the {theme}, formatted as a JSON list, accessible with the key {mode}, with full sentences.
"""
    return prompt.strip().format(theme=theme, mode=mode)
```
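As a quick sanity check of the contract between this prompt and `LexiconGenerator.parse_lexicon_response` (hypothetical theme): the prompt asks the model for a JSON object accessible under the mode key, which is exactly the key the parser reads back:

```python
prompt = build_lexicon_prompt("Crude Oil Market", mode="keywords")
# Both placeholders are interpolated into the instructions.
assert "Crude Oil Market" in prompt
assert "accessible with the key keywords" in prompt
```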
Review comment: This should be moved to a new release; v1.0.0 has already been merged.