diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py index 648d26076..135fb6b55 100644 --- a/validmind/ai/utils.py +++ b/validmind/ai/utils.py @@ -15,6 +15,10 @@ __client = None __model = None +__judge_llm = None +__judge_embeddings = None +EMBEDDINGS_MODEL = "text-embedding-3-small" + # can be None, True or False (ternary to represent initial state, ack and failed ack) __ack = None @@ -105,6 +109,91 @@ def get_client_and_model(): return __client, __model +def get_judge_config(judge_llm=None, judge_embeddings=None): + try: + from langchain_core.embeddings import Embeddings + from langchain_core.language_models.chat_models import BaseChatModel + from langchain_openai import ChatOpenAI, OpenAIEmbeddings + + from validmind.models.function import FunctionModel + except ImportError: + raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") + + if judge_llm is not None or judge_embeddings is not None: + if isinstance(judge_llm, FunctionModel): + if isinstance(judge_llm.model, BaseChatModel): + judge_llm = judge_llm.model + else: + raise ValueError( + "The ValidMind FunctionModel provided does not have a LangChain-compatible LLM as its model attribute. " + "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + if isinstance(judge_embeddings, FunctionModel): + if isinstance(judge_embeddings.model, Embeddings): + judge_embeddings = judge_embeddings.model + else: + raise ValueError( + "The ValidMind FunctionModel provided does not have a LangChain-compatible embeddings model as its model attribute. " + "To use the default ValidMind LLM, do not set the judge_embeddings parameter; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + + if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and ( + isinstance(judge_embeddings, Embeddings) or judge_embeddings is None + ): + return judge_llm, judge_embeddings + else: + raise ValueError( + "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM and embeddings are instances of " + "LangChain BaseChatModel and Embeddings respectively. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+ ) + + # fall back to the default ValidMind judge config when none is passed at run time + global __judge_llm, __judge_embeddings + if __judge_llm and __judge_embeddings: + return __judge_llm, __judge_embeddings + + client, model = get_client_and_model() + os.environ["OPENAI_API_BASE"] = str(client.base_url) + + __judge_llm = ChatOpenAI(api_key=client.api_key, model=model) + __judge_embeddings = OpenAIEmbeddings( + api_key=client.api_key, model=EMBEDDINGS_MODEL + ) + + return __judge_llm, __judge_embeddings + + +def set_judge_config(judge_llm, judge_embeddings): + global __judge_llm, __judge_embeddings + try: + from langchain_core.embeddings import Embeddings + from langchain_core.language_models.chat_models import BaseChatModel + + from validmind.models.function import FunctionModel + except ImportError: + raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") + if isinstance(judge_llm, BaseChatModel) and isinstance( + judge_embeddings, Embeddings + ): + __judge_llm = judge_llm + __judge_embeddings = judge_embeddings + # FunctionModel wrappers: unwrap the underlying LangChain model objects + elif isinstance(judge_llm, FunctionModel) and isinstance( + judge_embeddings, FunctionModel + ): + __judge_llm = judge_llm.model + __judge_embeddings = judge_embeddings.model + else: + raise ValueError( + "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM and embeddings are instances of " + "LangChain BaseChatModel and Embeddings respectively. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + + def is_configured(): global __ack diff --git a/validmind/tests/model_validation/ragas/AnswerCorrectness.py b/validmind/tests/model_validation/ragas/AnswerCorrectness.py index 6352bf990..51936ab41 100644 --- a/validmind/tests/model_validation/ragas/AnswerCorrectness.py +++ b/validmind/tests/model_validation/ragas/AnswerCorrectness.py @@ -34,6 +34,8 @@ def AnswerCorrectness( user_input_column="user_input", response_column="response", reference_column="reference", + judge_llm=None, + judge_embeddings=None, ): """ Evaluates the correctness of answers in a dataset with respect to the provided ground @@ -118,7 +120,9 @@ def AnswerCorrectness( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[answer_correctness()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "answer_correctness" diff --git a/validmind/tests/model_validation/ragas/AspectCritic.py b/validmind/tests/model_validation/ragas/AspectCritic.py index 3f9858c39..8eb3e921b 100644 --- a/validmind/tests/model_validation/ragas/AspectCritic.py +++ b/validmind/tests/model_validation/ragas/AspectCritic.py @@ -51,6 +51,8 @@ def AspectCritic( "maliciousness", ], additional_aspects: list = None, + judge_llm=None, + judge_embeddings=None, ): """ Evaluates generations against the following aspects: harmfulness, maliciousness, @@ -158,7 +160,9 @@ def AspectCritic( all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects result_df = evaluate( - Dataset.from_pandas(df), + metrics=all_aspects, + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() # reverse the score for aspects where lower is better diff --git
a/validmind/tests/model_validation/ragas/ContextEntityRecall.py b/validmind/tests/model_validation/ragas/ContextEntityRecall.py index fa5fb3ae9..3f5aa6a35 100644 --- a/validmind/tests/model_validation/ragas/ContextEntityRecall.py +++ b/validmind/tests/model_validation/ragas/ContextEntityRecall.py @@ -33,6 +33,8 @@ def ContextEntityRecall( dataset, retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): """ Evaluates the context entity recall for dataset entries and visualizes the results. @@ -113,7 +115,9 @@ def ContextEntityRecall( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_entity_recall()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "context_entity_recall" diff --git a/validmind/tests/model_validation/ragas/ContextPrecision.py b/validmind/tests/model_validation/ragas/ContextPrecision.py index 035e76f25..9eb455f3e 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecision.py +++ b/validmind/tests/model_validation/ragas/ContextPrecision.py @@ -34,6 +34,8 @@ def ContextPrecision( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): # noqa: B950 """ Context Precision is a metric that evaluates whether all of the ground-truth @@ -109,7 +111,9 @@ def ContextPrecision( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_precision()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "llm_context_precision_with_reference" diff --git a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py index 9b9d18ea5..5ac58cab9 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +++ b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py @@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", response_column: str = "response", + judge_llm=None, + judge_embeddings=None, ): # noqa: B950 """ Context Precision Without Reference is a metric used to evaluate the relevance of @@ -104,7 +106,9 @@ def ContextPrecisionWithoutReference( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_precision()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "llm_context_precision_without_reference" diff --git a/validmind/tests/model_validation/ragas/ContextRecall.py b/validmind/tests/model_validation/ragas/ContextRecall.py index e6b0317f4..094927037 100644 --- a/validmind/tests/model_validation/ragas/ContextRecall.py +++ b/validmind/tests/model_validation/ragas/ContextRecall.py @@ -34,6 +34,8 @@ def ContextRecall( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): """ Context recall measures the 
extent to which the retrieved context aligns with the @@ -109,7 +111,9 @@ def ContextRecall( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_recall()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "context_recall" diff --git a/validmind/tests/model_validation/ragas/Faithfulness.py b/validmind/tests/model_validation/ragas/Faithfulness.py index 034b5fb61..6b83f26c8 100644 --- a/validmind/tests/model_validation/ragas/Faithfulness.py +++ b/validmind/tests/model_validation/ragas/Faithfulness.py @@ -34,6 +34,8 @@ def Faithfulness( user_input_column="user_input", response_column="response", retrieved_contexts_column="retrieved_contexts", + judge_llm=None, + judge_embeddings=None, ): # noqa """ Evaluates the faithfulness of the generated answers with respect to retrieved contexts. @@ -114,7 +116,9 @@ def Faithfulness( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[faithfulness()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "faithfulness" diff --git a/validmind/tests/model_validation/ragas/NoiseSensitivity.py b/validmind/tests/model_validation/ragas/NoiseSensitivity.py index 17cb838cd..9a5d6e218 100644 --- a/validmind/tests/model_validation/ragas/NoiseSensitivity.py +++ b/validmind/tests/model_validation/ragas/NoiseSensitivity.py @@ -38,6 +38,8 @@ def NoiseSensitivity( reference_column="reference", focus="relevant", user_input_column="user_input", + judge_llm=None, + judge_embeddings=None, ): """ Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it @@ -149,7 +151,7 @@ def NoiseSensitivity( result_df = evaluate( Dataset.from_pandas(df), metrics=[noise_sensitivity(focus=focus)], - **get_ragas_config(), + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = f"noise_sensitivity_{focus}" diff --git a/validmind/tests/model_validation/ragas/ResponseRelevancy.py b/validmind/tests/model_validation/ragas/ResponseRelevancy.py index a7eabd1db..63d633355 100644 --- a/validmind/tests/model_validation/ragas/ResponseRelevancy.py +++ b/validmind/tests/model_validation/ragas/ResponseRelevancy.py @@ -34,6 +34,8 @@ def ResponseRelevancy( user_input_column="user_input", retrieved_contexts_column=None, response_column="response", + judge_llm=None, + judge_embeddings=None, ): """ Assesses how pertinent the generated answer is to the given prompt. @@ -44,8 +46,8 @@ def ResponseRelevancy( relevancy. This metric is computed using the `user_input`, the `retrieved_contexts` and the `response`. - The Response Relevancy is defined as the mean cosine similartiy of the original - `user_input` to a number of artifical questions, which are generated (reverse-engineered) + The Response Relevancy is defined as the mean cosine similarity of the original + `user_input` to a number of artificial questions, which are generated (reverse-engineered) based on the `response`: $$ @@ -62,7 +64,7 @@ def ResponseRelevancy( **Note**: *This is a reference-free metric, meaning that it does not require a `ground_truth` answer to compare against. 
A similar metric that does evaluate the - correctness of a generated answser with respect to a `ground_truth` answer is + correctness of a generated answers with respect to a `ground_truth` answer is `validmind.model_validation.ragas.AnswerCorrectness`.* ### Configuring Columns @@ -128,7 +130,7 @@ def ResponseRelevancy( result_df = evaluate( Dataset.from_pandas(df), metrics=metrics, - **get_ragas_config(), + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "answer_relevancy" diff --git a/validmind/tests/model_validation/ragas/SemanticSimilarity.py b/validmind/tests/model_validation/ragas/SemanticSimilarity.py index 42d62a877..c963dec5a 100644 --- a/validmind/tests/model_validation/ragas/SemanticSimilarity.py +++ b/validmind/tests/model_validation/ragas/SemanticSimilarity.py @@ -33,6 +33,8 @@ def SemanticSimilarity( dataset, response_column="response", reference_column="reference", + judge_llm=None, + judge_embeddings=None, ): """ Calculates the semantic similarity between generated responses and ground truths @@ -107,7 +109,9 @@ def SemanticSimilarity( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[semantic_similarity()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "semantic_similarity" diff --git a/validmind/tests/model_validation/ragas/utils.py b/validmind/tests/model_validation/ragas/utils.py index 01426a6f8..b6dfca6ad 100644 --- a/validmind/tests/model_validation/ragas/utils.py +++ b/validmind/tests/model_validation/ragas/utils.py @@ -2,34 +2,14 @@ # See the LICENSE file in the root of this repository for details. # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial -import os - -from validmind.ai.utils import get_client_and_model, is_configured +from validmind.ai.utils import get_judge_config EMBEDDINGS_MODEL = "text-embedding-3-small" -def get_ragas_config(): - # import here since its an optional dependency - try: - from langchain_openai import ChatOpenAI, OpenAIEmbeddings - except ImportError: - raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") - - if not is_configured(): - raise ValueError( - "LLM is not configured. Please set an `OPENAI_API_KEY` environment variable " - "or ensure that you are connected to the ValidMind API and ValidMind AI is " - "enabled for your account." - ) - - client, model = get_client_and_model() - os.environ["OPENAI_API_BASE"] = str(client.base_url) - - return { - "llm": ChatOpenAI(api_key=client.api_key, model=model), - "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL), - } +def get_ragas_config(judge_llm=None, judge_embeddings=None): + judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings) + return {"llm": judge_llm, "embeddings": judge_embeddings} def make_sub_col_udf(root_col, sub_col): diff --git a/validmind/tests/prompt_validation/Bias.py b/validmind/tests/prompt_validation/Bias.py index 07040426a..275894ae3 100644 --- a/validmind/tests/prompt_validation/Bias.py +++ b/validmind/tests/prompt_validation/Bias.py @@ -45,7 +45,7 @@ @tags("llm", "few_shot") @tasks("text_classification", "text_summarization") -def Bias(model, min_threshold=7): +def Bias(model, min_threshold=7, judge_llm=None): """ Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the prompt. 
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) diff --git a/validmind/tests/prompt_validation/Clarity.py b/validmind/tests/prompt_validation/Clarity.py index c6c5c6cc0..63acfd5bf 100644 --- a/validmind/tests/prompt_validation/Clarity.py +++ b/validmind/tests/prompt_validation/Clarity.py @@ -46,7 +46,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Clarity(model, min_threshold=7): +def Clarity(model, min_threshold=7, judge_llm=None): """ Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines. @@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) diff --git a/validmind/tests/prompt_validation/Conciseness.py b/validmind/tests/prompt_validation/Conciseness.py index e2c295a3c..cc26d1769 100644 --- a/validmind/tests/prompt_validation/Conciseness.py +++ b/validmind/tests/prompt_validation/Conciseness.py @@ -54,7 +54,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Conciseness(model, min_threshold=7): +def Conciseness(model, min_threshold=7, judge_llm=None): """ Analyzes and grades the conciseness of prompts provided to a Large Language Model. @@ -97,6 +97,7 @@ def Conciseness(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/Delimitation.py b/validmind/tests/prompt_validation/Delimitation.py index a91884092..87ae8e260 100644 --- a/validmind/tests/prompt_validation/Delimitation.py +++ b/validmind/tests/prompt_validation/Delimitation.py @@ -39,7 +39,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Delimitation(model, min_threshold=7): +def Delimitation(model, min_threshold=7, judge_llm=None): """ Evaluates the proper use of delimiters in prompts provided to Large Language Models. @@ -83,6 +83,7 @@ def Delimitation(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/NegativeInstruction.py b/validmind/tests/prompt_validation/NegativeInstruction.py index b2d62e4c9..64c4a2338 100644 --- a/validmind/tests/prompt_validation/NegativeInstruction.py +++ b/validmind/tests/prompt_validation/NegativeInstruction.py @@ -52,7 +52,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def NegativeInstruction(model, min_threshold=7): +def NegativeInstruction(model, min_threshold=7, judge_llm=None): """ Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts. 
@@ -101,6 +101,7 @@ def NegativeInstruction(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/Robustness.py b/validmind/tests/prompt_validation/Robustness.py index 627554e28..d1e306f5d 100644 --- a/validmind/tests/prompt_validation/Robustness.py +++ b/validmind/tests/prompt_validation/Robustness.py @@ -25,7 +25,7 @@ Be creative and think step-by-step how you would break the prompt. Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt. Each input should be different from the others. -Each input should be retured as a new line in your response. +Each input should be returned as a new line in your response. Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text. Example: @@ -56,7 +56,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Robustness(model, dataset, num_tests=10): +def Robustness(model, dataset, num_tests=10, judge_llm=None): """ Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test specifically measures the model's ability to generate correct classifications with the given prompt even when the @@ -112,6 +112,7 @@ def Robustness(model, dataset, num_tests=10): generated_inputs = call_model( system_prompt=SYSTEM.format(num_tests=num_tests), user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ).split("\n") responses = model.predict( diff --git a/validmind/tests/prompt_validation/Specificity.py b/validmind/tests/prompt_validation/Specificity.py index 0a2dfb700..10cf4ee5a 100644 --- a/validmind/tests/prompt_validation/Specificity.py +++ b/validmind/tests/prompt_validation/Specificity.py @@ -52,7 +52,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Specificity(model, min_threshold=7): +def Specificity(model, min_threshold=7, judge_llm=None): """ Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail, and relevance. @@ -97,6 +97,7 @@ def Specificity(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/ai_powered_test.py b/validmind/tests/prompt_validation/ai_powered_test.py index 49d604f40..03ce32cfa 100644 --- a/validmind/tests/prompt_validation/ai_powered_test.py +++ b/validmind/tests/prompt_validation/ai_powered_test.py @@ -4,7 +4,7 @@ import re -from validmind.ai.utils import get_client_and_model, is_configured +from validmind.ai.utils import get_judge_config, is_configured missing_prompt_message = """ Cannot run prompt validation tests on a model with no prompt. @@ -21,7 +21,12 @@ def call_model( - system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42 + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + seed: int = 42, + judge_llm=None, + judge_embeddings=None, ): """Call LLM with the given prompts and return the response""" if not is_configured(): @@ -31,21 +36,17 @@ def call_model( "enabled for your account." 
) - client, model = get_client_and_model() - - return ( - client.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": system_prompt.strip("\n").strip()}, - {"role": "user", "content": user_prompt.strip("\n").strip()}, - ], - temperature=temperature, - seed=seed, - ) - .choices[0] - .message.content - ) + judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings) + messages = [ + ("system", system_prompt.strip("\n").strip()), + ("user", user_prompt.strip("\n").strip()), + ] + + return judge_llm.invoke( + messages, + temperature=temperature, + seed=seed, + ).content def get_score(response: str):
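Example usage (editor's sketch, not part of the diff): the snippet below assumes a LangChain chat model and embeddings object from langchain_openai, and assumes that judge_llm/judge_embeddings reach the test functions through the params argument of validmind.tests.run_test; the model names, test ID, and my_dataset variable are placeholders.

# Minimal sketch of the two ways to supply a custom judge (assumptions noted above).
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import set_judge_config
from validmind.tests import run_test

# Any LangChain BaseChatModel / Embeddings pair should satisfy get_judge_config's checks.
judge_llm = ChatOpenAI(model="gpt-4o-mini")  # placeholder model name
judge_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Option 1: register process-wide defaults that get_judge_config() will return.
set_judge_config(judge_llm, judge_embeddings)

# Option 2: pass the judge explicitly for a single test run.
run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs={"dataset": my_dataset},  # hypothetical dataset object
    params={"judge_llm": judge_llm, "judge_embeddings": judge_embeddings},
)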