89 changes: 89 additions & 0 deletions validmind/ai/utils.py
@@ -15,6 +15,10 @@

__client = None
__model = None
__judge_llm = None
__judge_embeddings = None
EMBEDDINGS_MODEL = "text-embedding-3-small"

# can be None, True or False (ternary to represent initial state, ack and failed ack)
__ack = None

@@ -105,6 +109,91 @@ def get_client_and_model():
return __client, __model


def get_judge_config(judge_llm=None, judge_embeddings=None):
    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        from validmind.models.function import FunctionModel
    except ImportError:
        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

    if judge_llm is not None or judge_embeddings is not None:
        # Unwrap ValidMind FunctionModel wrappers to get the underlying LangChain objects
        if isinstance(judge_llm, FunctionModel):
            if isinstance(judge_llm.model, BaseChatModel):
                judge_llm = judge_llm.model
            else:
                raise ValueError(
                    "The ValidMind FunctionModel provided does not have a LangChain-compatible LLM as its `model` attribute. "
                    "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
                )
        if isinstance(judge_embeddings, FunctionModel):
            if isinstance(judge_embeddings.model, Embeddings):
                judge_embeddings = judge_embeddings.model
            else:
                raise ValueError(
                    "The ValidMind FunctionModel provided does not have a LangChain-compatible embeddings model as its `model` attribute. "
                    "To use the default ValidMind embeddings, do not set the judge_embeddings parameter, "
                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
                )

        if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and (
            isinstance(judge_embeddings, Embeddings) or judge_embeddings is None
        ):
            return judge_llm, judge_embeddings
        else:
            raise ValueError(
                "The provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM is an instance of "
                "LangChain's BaseChatModel and the judge embeddings an instance of LangChain's Embeddings. "
                "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
                "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
            )

    # fall back to defaults if nothing was passed at run time
    global __judge_llm, __judge_embeddings
    if __judge_llm and __judge_embeddings:
        return __judge_llm, __judge_embeddings

    client, model = get_client_and_model()
    os.environ["OPENAI_API_BASE"] = str(client.base_url)

    __judge_llm = ChatOpenAI(api_key=client.api_key, model=model)
    __judge_embeddings = OpenAIEmbeddings(
        api_key=client.api_key, model=EMBEDDINGS_MODEL
    )

    return __judge_llm, __judge_embeddings


def set_judge_config(judge_llm, judge_embeddings):
    global __judge_llm, __judge_embeddings
    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel

        from validmind.models.function import FunctionModel
    except ImportError:
        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

    if isinstance(judge_llm, BaseChatModel) and isinstance(
        judge_embeddings, Embeddings
    ):
        __judge_llm = judge_llm
        __judge_embeddings = judge_embeddings
    # ValidMind FunctionModel wrappers: store the underlying LangChain objects
    elif isinstance(judge_llm, FunctionModel) and isinstance(
        judge_embeddings, FunctionModel
    ):
        __judge_llm = judge_llm.model
        __judge_embeddings = judge_embeddings.model
    else:
        raise ValueError(
            "The provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM is an instance of "
            "LangChain's BaseChatModel and the judge embeddings an instance of LangChain's Embeddings. "
            "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
            "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
        )


def is_configured():
global __ack

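Taken together, `get_judge_config` and `set_judge_config` let a custom judge be registered once and reused by every LLM-backed test. A minimal usage sketch, assuming the `validmind[llm]` extras are installed and an OpenAI-compatible API key is available in the environment; the model names are illustrative only, not part of this change:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import get_judge_config, set_judge_config

# Register a judge pair once; both arguments must be LangChain objects
# (or ValidMind FunctionModels that wrap them).
set_judge_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),  # illustrative model choice
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)

# Later calls without explicit overrides resolve to the registered pair;
# with nothing registered, they fall back to the ValidMind-provided client.
judge_llm, judge_embeddings = get_judge_config()
```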
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -34,6 +34,8 @@ def AnswerCorrectness(
user_input_column="user_input",
response_column="response",
reference_column="reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -118,7 +120,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[answer_correctness()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "answer_correctness"
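With `judge_llm` and `judge_embeddings` now exposed as test parameters, a custom judge can also be supplied per run. A hedged sketch, assuming the standard `run_test` entry point and an already-initialized ValidMind dataset (`vm_dataset`) exposing the expected user_input/response/reference columns; the model names are illustrative:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.tests import run_test

# `vm_dataset` is assumed to be the result of an earlier vm.init_dataset(...) call
run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": vm_dataset},
    params={
        "judge_llm": ChatOpenAI(model="gpt-4o-mini"),
        "judge_embeddings": OpenAIEmbeddings(model="text-embedding-3-small"),
    },
)
```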
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/AspectCritic.py
@@ -51,6 +51,8 @@ def AspectCritic(
"maliciousness",
],
additional_aspects: list = None,
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -158,7 +160,9 @@
all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects

result_df = evaluate(
Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
Dataset.from_pandas(df),
metrics=all_aspects,
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

# reverse the score for aspects where lower is better
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -33,6 +33,8 @@ def ContextEntityRecall(
dataset,
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -113,7 +115,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_entity_recall()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "context_entity_recall"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -34,6 +34,8 @@ def ContextPrecision(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
): # noqa: B950
"""
Context Precision is a metric that evaluates whether all of the ground-truth
@@ -109,7 +111,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_precision()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "llm_context_precision_with_reference"
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
@@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
response_column: str = "response",
judge_llm=None,
judge_embeddings=None,
): # noqa: B950
"""
Context Precision Without Reference is a metric used to evaluate the relevance of
@@ -104,7 +106,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_precision()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "llm_context_precision_without_reference"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/ContextRecall.py
@@ -34,6 +34,8 @@ def ContextRecall(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Context recall measures the extent to which the retrieved context aligns with the
@@ -109,7 +111,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_recall()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "context_recall"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/Faithfulness.py
@@ -34,6 +34,8 @@ def Faithfulness(
user_input_column="user_input",
response_column="response",
retrieved_contexts_column="retrieved_contexts",
judge_llm=None,
judge_embeddings=None,
): # noqa
"""
Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -114,7 +116,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[faithfulness()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "faithfulness"
4 changes: 3 additions & 1 deletion validmind/tests/model_validation/ragas/NoiseSensitivity.py
@@ -38,6 +38,8 @@ def NoiseSensitivity(
reference_column="reference",
focus="relevant",
user_input_column="user_input",
judge_llm=None,
judge_embeddings=None,
):
"""
Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -149,7 +151,7 @@
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[noise_sensitivity(focus=focus)],
**get_ragas_config(),
**get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()

score_column = f"noise_sensitivity_{focus}"
10 changes: 6 additions & 4 deletions validmind/tests/model_validation/ragas/ResponseRelevancy.py
@@ -34,6 +34,8 @@ def ResponseRelevancy(
user_input_column="user_input",
retrieved_contexts_column=None,
response_column="response",
judge_llm=None,
judge_embeddings=None,
):
"""
Assesses how pertinent the generated answer is to the given prompt.
@@ -44,8 +46,8 @@
relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
and the `response`.

The Response Relevancy is defined as the mean cosine similartiy of the original
`user_input` to a number of artifical questions, which are generated (reverse-engineered)
The Response Relevancy is defined as the mean cosine similarity of the original
`user_input` to a number of artificial questions, which are generated (reverse-engineered)
based on the `response`:

$$
Expand All @@ -62,7 +64,7 @@ def ResponseRelevancy(

**Note**: *This is a reference-free metric, meaning that it does not require a
`ground_truth` answer to compare against. A similar metric that does evaluate the
correctness of a generated answser with respect to a `ground_truth` answer is
correctness of a generated answer with respect to a `ground_truth` answer is
`validmind.model_validation.ragas.AnswerCorrectness`.*

### Configuring Columns
@@ -128,7 +130,7 @@
result_df = evaluate(
Dataset.from_pandas(df),
metrics=metrics,
**get_ragas_config(),
**get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()

score_column = "answer_relevancy"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/SemanticSimilarity.py
@@ -33,6 +33,8 @@ def SemanticSimilarity(
dataset,
response_column="response",
reference_column="reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Calculates the semantic similarity between generated responses and ground truths
@@ -107,7 +109,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[semantic_similarity()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "semantic_similarity"
28 changes: 4 additions & 24 deletions validmind/tests/model_validation/ragas/utils.py
@@ -2,34 +2,14 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import os

from validmind.ai.utils import get_client_and_model, is_configured
from validmind.ai.utils import get_judge_config

EMBEDDINGS_MODEL = "text-embedding-3-small"


def get_ragas_config():
# import here since its an optional dependency
try:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
except ImportError:
raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

if not is_configured():
raise ValueError(
"LLM is not configured. Please set an `OPENAI_API_KEY` environment variable "
"or ensure that you are connected to the ValidMind API and ValidMind AI is "
"enabled for your account."
)

client, model = get_client_and_model()
os.environ["OPENAI_API_BASE"] = str(client.base_url)

return {
"llm": ChatOpenAI(api_key=client.api_key, model=model),
"embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
}
def get_ragas_config(judge_llm=None, judge_embeddings=None):
judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
return {"llm": judge_llm, "embeddings": judge_embeddings}


def make_sub_col_udf(root_col, sub_col):
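The slimmed-down helper now just resolves the judge pair and returns it in the shape ragas expects. A small sketch of the returned kwargs, assuming LangChain-compatible judge objects (model names illustrative):

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.tests.model_validation.ragas.utils import get_ragas_config

ragas_kwargs = get_ragas_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)
# ragas_kwargs == {"llm": <ChatOpenAI>, "embeddings": <OpenAIEmbeddings>}
# Each test unpacks it into ragas' evaluate():
#   evaluate(Dataset.from_pandas(df), metrics=[...], **ragas_kwargs)
```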
3 changes: 2 additions & 1 deletion validmind/tests/prompt_validation/Bias.py
@@ -45,7 +45,7 @@

@tags("llm", "few_shot")
@tasks("text_classification", "text_summarization")
def Bias(model, min_threshold=7):
def Bias(model, min_threshold=7, judge_llm=None):
"""
Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
prompt.
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7):
response = call_model(
system_prompt=SYSTEM,
user_prompt=USER.format(prompt_to_test=model.prompt.template),
judge_llm=judge_llm,
)

score = get_score(response)
3 changes: 2 additions & 1 deletion validmind/tests/prompt_validation/Clarity.py
@@ -46,7 +46,7 @@

@tags("llm", "zero_shot", "few_shot")
@tasks("text_classification", "text_summarization")
def Clarity(model, min_threshold=7):
def Clarity(model, min_threshold=7, judge_llm=None):
"""
Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7):
response = call_model(
system_prompt=SYSTEM,
user_prompt=USER.format(prompt_to_test=model.prompt.template),
judge_llm=judge_llm,
)

score = get_score(response)
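The prompt-validation tests take only a `judge_llm`, since no embeddings are needed to score a prompt. A hedged sketch, assuming `run_test` and a ValidMind model wrapper (`vm_model`) whose `prompt.template` holds the prompt under test:

```python
from langchain_openai import ChatOpenAI

from validmind.tests import run_test

# `vm_model` is assumed to be a ValidMind model initialized with a prompt template
run_test(
    "validmind.prompt_validation.Clarity",
    inputs={"model": vm_model},
    params={"min_threshold": 7, "judge_llm": ChatOpenAI(model="gpt-4o-mini")},
)
```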