diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py index 648d26076..135fb6b55 100644 --- a/validmind/ai/utils.py +++ b/validmind/ai/utils.py @@ -15,6 +15,10 @@ __client = None __model = None +__judge_llm = None +__judge_embeddings = None +EMBEDDINGS_MODEL = "text-embedding-3-small" + # can be None, True or False (ternary to represent initial state, ack and failed ack) __ack = None @@ -105,6 +109,91 @@ def get_client_and_model(): return __client, __model +def get_judge_config(judge_llm=None, judge_embeddings=None): + try: + from langchain_core.embeddings import Embeddings + from langchain_core.language_models.chat_models import BaseChatModel + from langchain_openai import ChatOpenAI, OpenAIEmbeddings + + from validmind.models.function import FunctionModel + except ImportError: + raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") + + if judge_llm is not None or judge_embeddings is not None: + if isinstance(judge_llm, FunctionModel): + if isinstance(judge_llm.model, BaseChatModel): + judge_llm = judge_llm.model + else: + raise ValueError( + "The ValidMind FunctionModel provided does not have a LangChain-compatible LLM as its model attribute. " + "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + if isinstance(judge_embeddings, FunctionModel): + if isinstance(judge_embeddings.model, Embeddings): + judge_embeddings = judge_embeddings.model + else: + raise ValueError( + "The ValidMind FunctionModel provided does not have a LangChain-compatible embeddings model as its model attribute. " + "To use the default ValidMind LLM, do not set the judge_embeddings parameter; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + + if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and ( + isinstance(judge_embeddings, Embeddings) or judge_embeddings is None + ): + return judge_llm, judge_embeddings + else: + raise ValueError( + "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM and embeddings are instances of " + "LangChain BaseChatModel and Embeddings respectively. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+ ) + + # fall back to the default ValidMind judge config when none is passed at run time + global __judge_llm, __judge_embeddings + if __judge_llm and __judge_embeddings: + return __judge_llm, __judge_embeddings + + client, model = get_client_and_model() + os.environ["OPENAI_API_BASE"] = str(client.base_url) + + __judge_llm = ChatOpenAI(api_key=client.api_key, model=model) + __judge_embeddings = OpenAIEmbeddings( + api_key=client.api_key, model=EMBEDDINGS_MODEL + ) + + return __judge_llm, __judge_embeddings + + +def set_judge_config(judge_llm, judge_embeddings): + global __judge_llm, __judge_embeddings + try: + from langchain_core.embeddings import Embeddings + from langchain_core.language_models.chat_models import BaseChatModel + + from validmind.models.function import FunctionModel + except ImportError: + raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") + if isinstance(judge_llm, BaseChatModel) and isinstance( + judge_embeddings, Embeddings + ): + __judge_llm = judge_llm + __judge_embeddings = judge_embeddings + # FunctionModel wrappers: unwrap the underlying LangChain model objects + elif isinstance(judge_llm, FunctionModel) and isinstance( + judge_embeddings, FunctionModel + ): + __judge_llm = judge_llm.model + __judge_embeddings = judge_embeddings.model + else: + raise ValueError( + "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM and embeddings are instances of " + "LangChain BaseChatModel and Embeddings respectively. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters; instead, " + "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account." + ) + + def is_configured(): global __ack diff --git a/validmind/tests/model_validation/ragas/AnswerCorrectness.py b/validmind/tests/model_validation/ragas/AnswerCorrectness.py index 6352bf990..51936ab41 100644 --- a/validmind/tests/model_validation/ragas/AnswerCorrectness.py +++ b/validmind/tests/model_validation/ragas/AnswerCorrectness.py @@ -34,6 +34,8 @@ def AnswerCorrectness( user_input_column="user_input", response_column="response", reference_column="reference", + judge_llm=None, + judge_embeddings=None, ): """ Evaluates the correctness of answers in a dataset with respect to the provided ground @@ -118,7 +120,9 @@ def AnswerCorrectness( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[answer_correctness()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "answer_correctness" diff --git a/validmind/tests/model_validation/ragas/AspectCritic.py b/validmind/tests/model_validation/ragas/AspectCritic.py index 3f9858c39..8eb3e921b 100644 --- a/validmind/tests/model_validation/ragas/AspectCritic.py +++ b/validmind/tests/model_validation/ragas/AspectCritic.py @@ -51,6 +51,8 @@ def AspectCritic( "maliciousness", ], additional_aspects: list = None, + judge_llm=None, + judge_embeddings=None, ): """ Evaluates generations against the following aspects: harmfulness, maliciousness, @@ -158,7 +160,9 @@ def AspectCritic( all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects result_df = evaluate( - Dataset.from_pandas(df), + metrics=all_aspects, + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() # reverse the score for aspects where lower is better diff --git
a/validmind/tests/model_validation/ragas/ContextEntityRecall.py b/validmind/tests/model_validation/ragas/ContextEntityRecall.py index fa5fb3ae9..3f5aa6a35 100644 --- a/validmind/tests/model_validation/ragas/ContextEntityRecall.py +++ b/validmind/tests/model_validation/ragas/ContextEntityRecall.py @@ -33,6 +33,8 @@ def ContextEntityRecall( dataset, retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): """ Evaluates the context entity recall for dataset entries and visualizes the results. @@ -113,7 +115,9 @@ def ContextEntityRecall( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_entity_recall()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_entity_recall()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "context_entity_recall" diff --git a/validmind/tests/model_validation/ragas/ContextPrecision.py b/validmind/tests/model_validation/ragas/ContextPrecision.py index 035e76f25..9eb455f3e 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecision.py +++ b/validmind/tests/model_validation/ragas/ContextPrecision.py @@ -34,6 +34,8 @@ def ContextPrecision( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): # noqa: B950 """ Context Precision is a metric that evaluates whether all of the ground-truth @@ -109,7 +111,9 @@ def ContextPrecision( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_precision()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "llm_context_precision_with_reference" diff --git a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py index 9b9d18ea5..5ac58cab9 100644 --- a/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +++ b/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py @@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", response_column: str = "response", + judge_llm=None, + judge_embeddings=None, ): # noqa: B950 """ Context Precision Without Reference is a metric used to evaluate the relevance of @@ -104,7 +106,9 @@ def ContextPrecisionWithoutReference( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_precision()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_precision()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "llm_context_precision_without_reference" diff --git a/validmind/tests/model_validation/ragas/ContextRecall.py b/validmind/tests/model_validation/ragas/ContextRecall.py index e6b0317f4..094927037 100644 --- a/validmind/tests/model_validation/ragas/ContextRecall.py +++ b/validmind/tests/model_validation/ragas/ContextRecall.py @@ -34,6 +34,8 @@ def ContextRecall( user_input_column: str = "user_input", retrieved_contexts_column: str = "retrieved_contexts", reference_column: str = "reference", + judge_llm=None, + judge_embeddings=None, ): """ Context recall measures the 
extent to which the retrieved context aligns with the @@ -109,7 +111,9 @@ def ContextRecall( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[context_recall()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[context_recall()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "context_recall" diff --git a/validmind/tests/model_validation/ragas/Faithfulness.py b/validmind/tests/model_validation/ragas/Faithfulness.py index 034b5fb61..6b83f26c8 100644 --- a/validmind/tests/model_validation/ragas/Faithfulness.py +++ b/validmind/tests/model_validation/ragas/Faithfulness.py @@ -34,6 +34,8 @@ def Faithfulness( user_input_column="user_input", response_column="response", retrieved_contexts_column="retrieved_contexts", + judge_llm=None, + judge_embeddings=None, ): # noqa """ Evaluates the faithfulness of the generated answers with respect to retrieved contexts. @@ -114,7 +116,9 @@ def Faithfulness( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[faithfulness()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[faithfulness()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "faithfulness" diff --git a/validmind/tests/model_validation/ragas/NoiseSensitivity.py b/validmind/tests/model_validation/ragas/NoiseSensitivity.py index 17cb838cd..9a5d6e218 100644 --- a/validmind/tests/model_validation/ragas/NoiseSensitivity.py +++ b/validmind/tests/model_validation/ragas/NoiseSensitivity.py @@ -38,6 +38,8 @@ def NoiseSensitivity( reference_column="reference", focus="relevant", user_input_column="user_input", + judge_llm=None, + judge_embeddings=None, ): """ Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it @@ -149,7 +151,7 @@ def NoiseSensitivity( result_df = evaluate( Dataset.from_pandas(df), metrics=[noise_sensitivity(focus=focus)], - **get_ragas_config(), + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = f"noise_sensitivity_{focus}" diff --git a/validmind/tests/model_validation/ragas/ResponseRelevancy.py b/validmind/tests/model_validation/ragas/ResponseRelevancy.py index a7eabd1db..63d633355 100644 --- a/validmind/tests/model_validation/ragas/ResponseRelevancy.py +++ b/validmind/tests/model_validation/ragas/ResponseRelevancy.py @@ -34,6 +34,8 @@ def ResponseRelevancy( user_input_column="user_input", retrieved_contexts_column=None, response_column="response", + judge_llm=None, + judge_embeddings=None, ): """ Assesses how pertinent the generated answer is to the given prompt. @@ -44,8 +46,8 @@ def ResponseRelevancy( relevancy. This metric is computed using the `user_input`, the `retrieved_contexts` and the `response`. - The Response Relevancy is defined as the mean cosine similartiy of the original - `user_input` to a number of artifical questions, which are generated (reverse-engineered) + The Response Relevancy is defined as the mean cosine similarity of the original + `user_input` to a number of artificial questions, which are generated (reverse-engineered) based on the `response`: $$ @@ -62,7 +64,7 @@ def ResponseRelevancy( **Note**: *This is a reference-free metric, meaning that it does not require a `ground_truth` answer to compare against. 
A similar metric that does evaluate the - correctness of a generated answser with respect to a `ground_truth` answer is + correctness of a generated answers with respect to a `ground_truth` answer is `validmind.model_validation.ragas.AnswerCorrectness`.* ### Configuring Columns @@ -128,7 +130,7 @@ def ResponseRelevancy( result_df = evaluate( Dataset.from_pandas(df), metrics=metrics, - **get_ragas_config(), + **get_ragas_config(judge_llm, judge_embeddings), ).to_pandas() score_column = "answer_relevancy" diff --git a/validmind/tests/model_validation/ragas/SemanticSimilarity.py b/validmind/tests/model_validation/ragas/SemanticSimilarity.py index 42d62a877..c963dec5a 100644 --- a/validmind/tests/model_validation/ragas/SemanticSimilarity.py +++ b/validmind/tests/model_validation/ragas/SemanticSimilarity.py @@ -33,6 +33,8 @@ def SemanticSimilarity( dataset, response_column="response", reference_column="reference", + judge_llm=None, + judge_embeddings=None, ): """ Calculates the semantic similarity between generated responses and ground truths @@ -107,7 +109,9 @@ def SemanticSimilarity( df = get_renamed_columns(dataset._df, required_columns) result_df = evaluate( - Dataset.from_pandas(df), metrics=[semantic_similarity()], **get_ragas_config() + Dataset.from_pandas(df), + metrics=[semantic_similarity()], + **get_ragas_config(judge_llm, judge_embeddings) ).to_pandas() score_column = "semantic_similarity" diff --git a/validmind/tests/model_validation/ragas/utils.py b/validmind/tests/model_validation/ragas/utils.py index 01426a6f8..b6dfca6ad 100644 --- a/validmind/tests/model_validation/ragas/utils.py +++ b/validmind/tests/model_validation/ragas/utils.py @@ -2,34 +2,14 @@ # See the LICENSE file in the root of this repository for details. # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial -import os - -from validmind.ai.utils import get_client_and_model, is_configured +from validmind.ai.utils import get_judge_config EMBEDDINGS_MODEL = "text-embedding-3-small" -def get_ragas_config(): - # import here since its an optional dependency - try: - from langchain_openai import ChatOpenAI, OpenAIEmbeddings - except ImportError: - raise ImportError("Please run `pip install validmind[llm]` to use LLM tests") - - if not is_configured(): - raise ValueError( - "LLM is not configured. Please set an `OPENAI_API_KEY` environment variable " - "or ensure that you are connected to the ValidMind API and ValidMind AI is " - "enabled for your account." - ) - - client, model = get_client_and_model() - os.environ["OPENAI_API_BASE"] = str(client.base_url) - - return { - "llm": ChatOpenAI(api_key=client.api_key, model=model), - "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL), - } +def get_ragas_config(judge_llm=None, judge_embeddings=None): + judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings) + return {"llm": judge_llm, "embeddings": judge_embeddings} def make_sub_col_udf(root_col, sub_col): diff --git a/validmind/tests/prompt_validation/Bias.py b/validmind/tests/prompt_validation/Bias.py index 07040426a..275894ae3 100644 --- a/validmind/tests/prompt_validation/Bias.py +++ b/validmind/tests/prompt_validation/Bias.py @@ -45,7 +45,7 @@ @tags("llm", "few_shot") @tasks("text_classification", "text_summarization") -def Bias(model, min_threshold=7): +def Bias(model, min_threshold=7, judge_llm=None): """ Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the prompt. 
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) diff --git a/validmind/tests/prompt_validation/Clarity.py b/validmind/tests/prompt_validation/Clarity.py index c6c5c6cc0..63acfd5bf 100644 --- a/validmind/tests/prompt_validation/Clarity.py +++ b/validmind/tests/prompt_validation/Clarity.py @@ -46,7 +46,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Clarity(model, min_threshold=7): +def Clarity(model, min_threshold=7, judge_llm=None): """ Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines. @@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) diff --git a/validmind/tests/prompt_validation/Conciseness.py b/validmind/tests/prompt_validation/Conciseness.py index e2c295a3c..cc26d1769 100644 --- a/validmind/tests/prompt_validation/Conciseness.py +++ b/validmind/tests/prompt_validation/Conciseness.py @@ -54,7 +54,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Conciseness(model, min_threshold=7): +def Conciseness(model, min_threshold=7, judge_llm=None): """ Analyzes and grades the conciseness of prompts provided to a Large Language Model. @@ -97,6 +97,7 @@ def Conciseness(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/Delimitation.py b/validmind/tests/prompt_validation/Delimitation.py index a91884092..87ae8e260 100644 --- a/validmind/tests/prompt_validation/Delimitation.py +++ b/validmind/tests/prompt_validation/Delimitation.py @@ -39,7 +39,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Delimitation(model, min_threshold=7): +def Delimitation(model, min_threshold=7, judge_llm=None): """ Evaluates the proper use of delimiters in prompts provided to Large Language Models. @@ -83,6 +83,7 @@ def Delimitation(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/NegativeInstruction.py b/validmind/tests/prompt_validation/NegativeInstruction.py index b2d62e4c9..64c4a2338 100644 --- a/validmind/tests/prompt_validation/NegativeInstruction.py +++ b/validmind/tests/prompt_validation/NegativeInstruction.py @@ -52,7 +52,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def NegativeInstruction(model, min_threshold=7): +def NegativeInstruction(model, min_threshold=7, judge_llm=None): """ Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts. 
@@ -101,6 +101,7 @@ def NegativeInstruction(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/Robustness.py b/validmind/tests/prompt_validation/Robustness.py index 627554e28..d1e306f5d 100644 --- a/validmind/tests/prompt_validation/Robustness.py +++ b/validmind/tests/prompt_validation/Robustness.py @@ -25,7 +25,7 @@ Be creative and think step-by-step how you would break the prompt. Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt. Each input should be different from the others. -Each input should be retured as a new line in your response. +Each input should be returned as a new line in your response. Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text. Example: @@ -56,7 +56,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Robustness(model, dataset, num_tests=10): +def Robustness(model, dataset, num_tests=10, judge_llm=None): """ Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test specifically measures the model's ability to generate correct classifications with the given prompt even when the @@ -112,6 +112,7 @@ def Robustness(model, dataset, num_tests=10): generated_inputs = call_model( system_prompt=SYSTEM.format(num_tests=num_tests), user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ).split("\n") responses = model.predict( diff --git a/validmind/tests/prompt_validation/Specificity.py b/validmind/tests/prompt_validation/Specificity.py index 0a2dfb700..10cf4ee5a 100644 --- a/validmind/tests/prompt_validation/Specificity.py +++ b/validmind/tests/prompt_validation/Specificity.py @@ -52,7 +52,7 @@ @tags("llm", "zero_shot", "few_shot") @tasks("text_classification", "text_summarization") -def Specificity(model, min_threshold=7): +def Specificity(model, min_threshold=7, judge_llm=None): """ Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail, and relevance. @@ -97,6 +97,7 @@ def Specificity(model, min_threshold=7): response = call_model( system_prompt=SYSTEM, user_prompt=USER.format(prompt_to_test=model.prompt.template), + judge_llm=judge_llm, ) score = get_score(response) explanation = get_explanation(response) diff --git a/validmind/tests/prompt_validation/ai_powered_test.py b/validmind/tests/prompt_validation/ai_powered_test.py index 49d604f40..03ce32cfa 100644 --- a/validmind/tests/prompt_validation/ai_powered_test.py +++ b/validmind/tests/prompt_validation/ai_powered_test.py @@ -4,7 +4,7 @@ import re -from validmind.ai.utils import get_client_and_model, is_configured +from validmind.ai.utils import get_judge_config, is_configured missing_prompt_message = """ Cannot run prompt validation tests on a model with no prompt. @@ -21,7 +21,12 @@ def call_model( - system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42 + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + seed: int = 42, + judge_llm=None, + judge_embeddings=None, ): """Call LLM with the given prompts and return the response""" if not is_configured(): @@ -31,21 +36,17 @@ def call_model( "enabled for your account." 
) - client, model = get_client_and_model() - - return ( - client.chat.completions.create( - model=model, - messages=[ - {"role": "system", "content": system_prompt.strip("\n").strip()}, - {"role": "user", "content": user_prompt.strip("\n").strip()}, - ], - temperature=temperature, - seed=seed, - ) - .choices[0] - .message.content - ) + judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings) + messages = [ + ("system", system_prompt.strip("\n").strip()), + ("user", user_prompt.strip("\n").strip()), + ] + + return judge_llm.invoke( + messages, + temperature=temperature, + seed=seed, + ).content def get_score(response: str):
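Example usage (editor's sketch, not part of the diff): the snippet below assumes a LangChain chat model and embeddings object from langchain_openai, and assumes that judge_llm/judge_embeddings reach the test functions through the params argument of validmind.tests.run_test; the model names, test ID, and my_dataset variable are placeholders.

# Minimal sketch of the two ways to supply a custom judge (assumptions noted above).
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import set_judge_config
from validmind.tests import run_test

# Any LangChain BaseChatModel / Embeddings pair should satisfy get_judge_config's checks.
judge_llm = ChatOpenAI(model="gpt-4o-mini")  # placeholder model name
judge_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Option 1: register process-wide defaults that get_judge_config() will return.
set_judge_config(judge_llm, judge_embeddings)

# Option 2: pass the judge explicitly for a single test run.
run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs={"dataset": my_dataset},  # hypothetical dataset object
    params={"judge_llm": judge_llm, "judge_embeddings": judge_embeddings},
)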