89 changes: 89 additions & 0 deletions validmind/ai/utils.py
@@ -15,6 +15,10 @@

__client = None
__model = None
__judge_llm = None
__judge_embeddings = None
EMBEDDINGS_MODEL = "text-embedding-3-small"

# can be None, True or False (ternary to represent initial state, ack and failed ack)
__ack = None

@@ -105,6 +109,91 @@ def get_client_and_model():
return __client, __model


def get_judge_config(judge_llm=None, judge_embeddings=None):
    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        from validmind.models.function import FunctionModel
    except ImportError:
        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

    if judge_llm is not None or judge_embeddings is not None:
        # Unwrap ValidMind FunctionModel wrappers to get the underlying LangChain objects
        if isinstance(judge_llm, FunctionModel):
            if isinstance(judge_llm.model, BaseChatModel):
                judge_llm = judge_llm.model
            else:
                raise ValueError(
                    "The ValidMind FunctionModel provided does not have a LangChain-compatible LLM as its `model` attribute. "
                    "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
                )
        if isinstance(judge_embeddings, FunctionModel):
            if isinstance(judge_embeddings.model, Embeddings):
                judge_embeddings = judge_embeddings.model
            else:
                raise ValueError(
                    "The ValidMind FunctionModel provided does not have a LangChain-compatible embeddings model as its `model` attribute. "
                    "To use the default ValidMind embeddings, do not set the judge_embeddings parameter, "
                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
                )

        if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and (
            isinstance(judge_embeddings, Embeddings) or judge_embeddings is None
        ):
            return judge_llm, judge_embeddings
        else:
            raise ValueError(
                "The provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM is an instance of "
                "LangChain's BaseChatModel and the judge embeddings an instance of LangChain's Embeddings. "
                "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
                "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
            )

    # fall back to defaults if nothing was passed at run time
    global __judge_llm, __judge_embeddings
    if __judge_llm and __judge_embeddings:
        return __judge_llm, __judge_embeddings

    client, model = get_client_and_model()
    os.environ["OPENAI_API_BASE"] = str(client.base_url)

    __judge_llm = ChatOpenAI(api_key=client.api_key, model=model)
    __judge_embeddings = OpenAIEmbeddings(
        api_key=client.api_key, model=EMBEDDINGS_MODEL
    )

    return __judge_llm, __judge_embeddings


def set_judge_config(judge_llm, judge_embeddings):
    global __judge_llm, __judge_embeddings
    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel

        from validmind.models.function import FunctionModel
    except ImportError:
        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

    if isinstance(judge_llm, BaseChatModel) and isinstance(
        judge_embeddings, Embeddings
    ):
        __judge_llm = judge_llm
        __judge_embeddings = judge_embeddings
    # ValidMind FunctionModel wrappers: store the underlying LangChain objects
    elif isinstance(judge_llm, FunctionModel) and isinstance(
        judge_embeddings, FunctionModel
    ):
        __judge_llm = judge_llm.model
        __judge_embeddings = judge_embeddings.model
    else:
        raise ValueError(
            "The provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM is an instance of "
            "LangChain's BaseChatModel and the judge embeddings an instance of LangChain's Embeddings. "
            "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
            "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
        )


def is_configured():
global __ack

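Taken together, `get_judge_config` and `set_judge_config` let a custom judge be registered once and reused by every LLM-backed test. A minimal usage sketch, assuming the `validmind[llm]` extras are installed and an OpenAI-compatible API key is available in the environment; the model names are illustrative only, not part of this change:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import get_judge_config, set_judge_config

# Register a judge pair once; both arguments must be LangChain objects
# (or ValidMind FunctionModels that wrap them).
set_judge_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),  # illustrative model choice
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)

# Later calls without explicit overrides resolve to the registered pair;
# with nothing registered, they fall back to the ValidMind-provided client.
judge_llm, judge_embeddings = get_judge_config()
```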
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -34,6 +34,8 @@ def AnswerCorrectness(
user_input_column="user_input",
response_column="response",
reference_column="reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -118,7 +120,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[answer_correctness()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "answer_correctness"
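With `judge_llm` and `judge_embeddings` now exposed as test parameters, a custom judge can also be supplied per run. A hedged sketch, assuming the standard `run_test` entry point and an already-initialized ValidMind dataset (`vm_dataset`) exposing the expected user_input/response/reference columns; the model names are illustrative:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.tests import run_test

# `vm_dataset` is assumed to be the result of an earlier vm.init_dataset(...) call
run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": vm_dataset},
    params={
        "judge_llm": ChatOpenAI(model="gpt-4o-mini"),
        "judge_embeddings": OpenAIEmbeddings(model="text-embedding-3-small"),
    },
)
```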
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/AspectCritic.py
@@ -51,6 +51,8 @@ def AspectCritic(
"maliciousness",
],
additional_aspects: list = None,
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -158,7 +160,9 @@
all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects

result_df = evaluate(
Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
Dataset.from_pandas(df),
metrics=all_aspects,
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

# reverse the score for aspects where lower is better
validmind/tests/model_validation/ragas/ContextEntityRecall.py
@@ -33,6 +33,8 @@ def ContextEntityRecall(
dataset,
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -113,7 +115,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_entity_recall()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "context_entity_recall"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/ContextPrecision.py
@@ -34,6 +34,8 @@ def ContextPrecision(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
): # noqa: B950
"""
Context Precision is a metric that evaluates whether all of the ground-truth
@@ -109,7 +111,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_precision()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "llm_context_precision_with_reference"
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
@@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
response_column: str = "response",
judge_llm=None,
judge_embeddings=None,
): # noqa: B950
"""
Context Precision Without Reference is a metric used to evaluate the relevance of
@@ -104,7 +106,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_precision()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "llm_context_precision_without_reference"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/ContextRecall.py
@@ -34,6 +34,8 @@ def ContextRecall(
user_input_column: str = "user_input",
retrieved_contexts_column: str = "retrieved_contexts",
reference_column: str = "reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Context recall measures the extent to which the retrieved context aligns with the
@@ -109,7 +111,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[context_recall()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "context_recall"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/Faithfulness.py
@@ -34,6 +34,8 @@ def Faithfulness(
user_input_column="user_input",
response_column="response",
retrieved_contexts_column="retrieved_contexts",
judge_llm=None,
judge_embeddings=None,
): # noqa
"""
Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -114,7 +116,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[faithfulness()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "faithfulness"
4 changes: 3 additions & 1 deletion validmind/tests/model_validation/ragas/NoiseSensitivity.py
@@ -38,6 +38,8 @@ def NoiseSensitivity(
reference_column="reference",
focus="relevant",
user_input_column="user_input",
judge_llm=None,
judge_embeddings=None,
):
"""
Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -149,7 +151,7 @@
result_df = evaluate(
Dataset.from_pandas(df),
metrics=[noise_sensitivity(focus=focus)],
**get_ragas_config(),
**get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()

score_column = f"noise_sensitivity_{focus}"
10 changes: 6 additions & 4 deletions validmind/tests/model_validation/ragas/ResponseRelevancy.py
@@ -34,6 +34,8 @@ def ResponseRelevancy(
user_input_column="user_input",
retrieved_contexts_column=None,
response_column="response",
judge_llm=None,
judge_embeddings=None,
):
"""
Assesses how pertinent the generated answer is to the given prompt.
@@ -44,8 +46,8 @@
relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
and the `response`.

The Response Relevancy is defined as the mean cosine similartiy of the original
`user_input` to a number of artifical questions, which are generated (reverse-engineered)
The Response Relevancy is defined as the mean cosine similarity of the original
`user_input` to a number of artificial questions, which are generated (reverse-engineered)
based on the `response`:

$$
Expand All @@ -62,7 +64,7 @@ def ResponseRelevancy(

**Note**: *This is a reference-free metric, meaning that it does not require a
`ground_truth` answer to compare against. A similar metric that does evaluate the
correctness of a generated answser with respect to a `ground_truth` answer is
correctness of a generated answer with respect to a `ground_truth` answer is
`validmind.model_validation.ragas.AnswerCorrectness`.*

### Configuring Columns
@@ -128,7 +130,7 @@
result_df = evaluate(
Dataset.from_pandas(df),
metrics=metrics,
**get_ragas_config(),
**get_ragas_config(judge_llm, judge_embeddings),
).to_pandas()

score_column = "answer_relevancy"
6 changes: 5 additions & 1 deletion validmind/tests/model_validation/ragas/SemanticSimilarity.py
@@ -33,6 +33,8 @@ def SemanticSimilarity(
dataset,
response_column="response",
reference_column="reference",
judge_llm=None,
judge_embeddings=None,
):
"""
Calculates the semantic similarity between generated responses and ground truths
@@ -107,7 +109,9 @@
df = get_renamed_columns(dataset._df, required_columns)

result_df = evaluate(
Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config()
Dataset.from_pandas(df),
metrics=[semantic_similarity()],
**get_ragas_config(judge_llm, judge_embeddings)
).to_pandas()

score_column = "semantic_similarity"
28 changes: 4 additions & 24 deletions validmind/tests/model_validation/ragas/utils.py
@@ -2,34 +2,14 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import os

from validmind.ai.utils import get_client_and_model, is_configured
from validmind.ai.utils import get_judge_config

EMBEDDINGS_MODEL = "text-embedding-3-small"


def get_ragas_config():
# import here since its an optional dependency
try:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
except ImportError:
raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")

if not is_configured():
raise ValueError(
"LLM is not configured. Please set an `OPENAI_API_KEY` environment variable "
"or ensure that you are connected to the ValidMind API and ValidMind AI is "
"enabled for your account."
)

client, model = get_client_and_model()
os.environ["OPENAI_API_BASE"] = str(client.base_url)

return {
"llm": ChatOpenAI(api_key=client.api_key, model=model),
"embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
}
def get_ragas_config(judge_llm=None, judge_embeddings=None):
judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
return {"llm": judge_llm, "embeddings": judge_embeddings}


def make_sub_col_udf(root_col, sub_col):
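The slimmed-down helper now just resolves the judge pair and returns it in the shape ragas expects. A small sketch of the returned kwargs, assuming LangChain-compatible judge objects (model names illustrative):

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.tests.model_validation.ragas.utils import get_ragas_config

ragas_kwargs = get_ragas_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)
# ragas_kwargs == {"llm": <ChatOpenAI>, "embeddings": <OpenAIEmbeddings>}
# Each test unpacks it into ragas' evaluate():
#   evaluate(Dataset.from_pandas(df), metrics=[...], **ragas_kwargs)
```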
3 changes: 2 additions & 1 deletion validmind/tests/prompt_validation/Bias.py
@@ -45,7 +45,7 @@

@tags("llm", "few_shot")
@tasks("text_classification", "text_summarization")
def Bias(model, min_threshold=7):
def Bias(model, min_threshold=7, judge_llm=None):
"""
Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
prompt.
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7):
response = call_model(
system_prompt=SYSTEM,
user_prompt=USER.format(prompt_to_test=model.prompt.template),
judge_llm=judge_llm,
)

score = get_score(response)
3 changes: 2 additions & 1 deletion validmind/tests/prompt_validation/Clarity.py
@@ -46,7 +46,7 @@

@tags("llm", "zero_shot", "few_shot")
@tasks("text_classification", "text_summarization")
def Clarity(model, min_threshold=7):
def Clarity(model, min_threshold=7, judge_llm=None):
"""
Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7):
response = call_model(
system_prompt=SYSTEM,
user_prompt=USER.format(prompt_to_test=model.prompt.template),
judge_llm=judge_llm,
)

score = get_score(response)
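The prompt-validation tests take only a `judge_llm`, since no embeddings are needed to score a prompt. A hedged sketch, assuming `run_test` and a ValidMind model wrapper (`vm_model`) whose `prompt.template` holds the prompt under test:

```python
from langchain_openai import ChatOpenAI

from validmind.tests import run_test

# `vm_model` is assumed to be a ValidMind model initialized with a prompt template
run_test(
    "validmind.prompt_validation.Clarity",
    inputs={"model": vm_model},
    params={"min_threshold": 7, "judge_llm": ChatOpenAI(model="gpt-4o-mini")},
)
```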