diff --git a/.gitignore b/.gitignore
index b29e2dfe1..60a419748 100644
--- a/.gitignore
+++ b/.gitignore
@@ -226,3 +226,6 @@ my_tests/
# Quarto docs
docs/validmind.json
*.html
+*.qmd
+# DeepEval
+*.deepeval/
diff --git a/notebooks/code_samples/agents/banking_test_dataset.py b/notebooks/code_samples/agents/banking_test_dataset.py
index ade54e754..b0beb2ad5 100644
--- a/notebooks/code_samples/agents/banking_test_dataset.py
+++ b/notebooks/code_samples/agents/banking_test_dataset.py
@@ -12,7 +12,7 @@
"category": "credit_risk"
},
{
- "input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000",
+ "input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650",
"expected_tools": ["credit_risk_analyzer"],
"possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"],
"session_id": str(uuid.uuid4()),
diff --git a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb
index e92bc3d65..7a06d2090 100644
--- a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb
+++ b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb
@@ -117,7 +117,7 @@
"metadata": {},
"outputs": [],
"source": [
- "%pip install -q \"validmind[all]\" langgraph"
+ "%pip install -q validmind langgraph"
]
},
{
@@ -202,7 +202,6 @@
"from banking_tools import AVAILABLE_TOOLS\n",
"from validmind.tests import run_test\n",
"\n",
- "\n",
"# Load environment variables if using .env file\n",
"try:\n",
" from dotenv import load_dotenv\n",
@@ -316,8 +315,7 @@
"except Exception as e:\n",
" print(f\"Fraud Detection System test FAILED: {e}\")\n",
"\n",
- "print(\"\" + \"=\" * 60)\n",
- "\n"
+ "print(\"\" + \"=\" * 60)"
]
},
{
@@ -478,8 +476,21 @@
" tool_message = \"\"\n",
" for output in captured_data[\"tool_outputs\"]:\n",
" tool_message += output['content']\n",
+ " \n",
+ " tool_calls_found = []\n",
+ " messages = result['messages']\n",
+ " for message in messages:\n",
+ " if hasattr(message, 'tool_calls') and message.tool_calls:\n",
+ " for tool_call in message.tool_calls:\n",
+ " # Handle both dictionary and object formats\n",
+ " if isinstance(tool_call, dict):\n",
+ " tool_calls_found.append(tool_call['name'])\n",
+ " else:\n",
+ " # ToolCall object - use attribute access\n",
+ " tool_calls_found.append(tool_call.name)\n",
+ "\n",
"\n",
- " return {\"prediction\": result['messages'][-1].content, \"output\": result, \"tool_messages\": [tool_message]}\n",
+ " return {\"prediction\": result['messages'][-1].content, \"output\": result, \"tool_messages\": [tool_message], \"tool_calls\": tool_calls_found}\n",
" except Exception as e:\n",
" # Return a fallback response if the agent fails\n",
" error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n",
@@ -597,7 +608,7 @@
"source": [
"## Banking Test Dataset\n",
"\n",
- "We'll use our comprehensive banking test dataset to evaluate our agent's performance across different banking scenarios.\n",
+ "We'll use a sample test dataset to evaluate our agent's performance across different banking scenarios.\n",
"\n",
"### Initialize ValidMind Dataset\n",
"\n",
@@ -625,6 +636,15 @@
"print(f\"Dataset columns: {vm_test_dataset._df.columns}\")\n"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vm_test_dataset._df.head(1)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -795,7 +815,75 @@
" \"agent_output_column\": \"banking_agent_model_output\",\n",
" \"expected_tools_column\": \"expected_tools\"\n",
" }\n",
- ")"
+ ").log()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Scorers in ValidMind\n",
+ "\n",
+ "Scorers are evaluation metrics that analyze model outputs and store their results in the dataset. When using `assign_scores()`:\n",
+ "\n",
+    "- Each scorer adds new columns to the dataset, named after the scorer and its output fields (for example, `TaskCompletion_score`)\n",
+    "- The score column contains the numeric score (typically between 0 and 1) for each example\n",
+ "- Multiple scorers can be run on the same dataset, each adding their own column\n",
+ "- Scores are persisted in the dataset for later analysis and visualization\n",
+ "- Common scorer patterns include:\n",
+    "  - Model performance metrics (accuracy, F1, etc.)\n",
+ " - Output quality metrics (relevance, faithfulness)\n",
+ " - Task-specific metrics (completion, correctness)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Task Completion scorer\n",
+ "\n",
+    "The TaskCompletion scorer evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. It assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The scorer returns a score between 0 and 1 along with detailed feedback on task completion quality."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vm_test_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.TaskCompletion\",\n",
+ " input_column=\"input\",\n",
+ " tools_called_column=\"tools_called\",\n",
+ " actual_output_column=\"banking_agent_model_prediction\",\n",
+ " agent_output_column=\"banking_agent_model_output\"\n",
+ " )\n",
+ "vm_test_dataset._df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "The TaskCompletion scorer has added a new column, 'TaskCompletion_score', to our dataset: when scorers run through assign_scores(), their return values are automatically added as new columns named after the scorer and its output fields. Let's use this column to visualize the distribution of task completion scores across our test cases with the box plot test."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "run_test(\n",
+ " \"validmind.plots.BoxPlot\",\n",
+ " inputs={\"dataset\": vm_test_dataset},\n",
+ " params={\n",
+ " \"columns\": \"TaskCompletion_score\",\n",
+ " \"title\": \"Distribution of Task Completion Scores\",\n",
+ " \"ylabel\": \"Score\",\n",
+ " \"figsize\": (8, 6)\n",
+ " }\n",
+ ").log()\n"
]
},
{
diff --git a/notebooks/code_sharing/deepeval_integration_demo.ipynb b/notebooks/code_sharing/deepeval_integration_demo.ipynb
index 78b9ce0ff..1de828941 100644
--- a/notebooks/code_sharing/deepeval_integration_demo.ipynb
+++ b/notebooks/code_sharing/deepeval_integration_demo.ipynb
@@ -10,17 +10,14 @@
"source": [
"# DeepEval Integration with ValidMind\n",
"\n",
- "Learn how to integrate [DeepEval](https://github.com/confident-ai/deepeval) with the ValidMind Library to evaluate Large Language Models (LLMs) and AI agents. This notebook demonstrates the complete integration through the new `LLMAgentDataset` class, enabling you to leverage DeepEval's 30+ evaluation metrics within ValidMind's testing infrastructure.\n",
+    "Let's learn how to integrate [DeepEval](https://github.com/confident-ai/deepeval) with the ValidMind Library to evaluate Large Language Models (LLMs) and AI agents. This notebook demonstrates how to use DeepEval's evaluation metrics as ValidMind scorers within ValidMind's testing infrastructure.\n",
"\n",
"To integrate DeepEval with ValidMind, we'll:\n",
- "\n",
- "1. Set up both frameworks and install required dependencies\n",
- "2. Create and evaluate LLM test cases for different scenarios\n",
- "3. Work with RAG systems and agent evaluations\n",
- "4. Use Golden templates for standardized testing\n",
- "5. Create custom evaluation metrics with G-Eval\n",
- "6. Integrate everything with ValidMind's testing framework\n",
- "7. Apply production-ready evaluation patterns\n"
+    " 1. Set up both frameworks and install required dependencies\n",
+    " 2. Create DeepEval test cases and build ValidMind datasets from them\n",
+    " 3. Evaluate a simple Q&A scenario with the AnswerRelevancy and Bias scorers\n",
+    " 4. Evaluate a RAG system with the ContextualRelevancy, ContextualPrecision, and ContextualRecall scorers\n",
+    " 5. Evaluate an LLM agent with the Faithfulness, Hallucination, Summarization, and TaskCompletion scorers\n"
]
},
{
@@ -41,10 +38,20 @@
" - [Initialize ValidMind](#toc3_2_) \n",
"- [Basic Usage - Simple Q&A Evaluation](#toc4_) \n",
"- [RAG System Evaluation](#toc5_) \n",
+ " - [Create test cases](#toc5_1_) \n",
+ " - [Build dataset](#toc5_2_) \n",
+ " - [Evaluation metrics](#toc5_3_) \n",
+ " - [Contextual Relevancy](#toc5_3_1_) \n",
+ " - [Contextual Precision](#toc5_3_2_) \n",
+ " - [Contextual Recall](#toc5_3_3_) \n",
"- [LLM Agent Evaluation](#toc6_) \n",
- "- [Working with Golden Templates](#toc7_) \n",
- "- [ValidMind Integration](#toc8_) \n",
- "- [Custom Metrics with G-Eval](#toc9_) \n",
+ " - [Create test cases](#toc6_1_) \n",
+ " - [Build dataset](#toc6_2_) \n",
+ " - [Evaluation metrics](#toc6_3_) \n",
+ " - [Faithfulness](#toc6_3_1_) \n",
+ " - [Hallucination](#toc6_3_2_) \n",
+ " - [Summarization](#toc6_3_3_) \n",
+ " - [Task Completion](#toc6_3_4_) \n",
"- [In summary](#toc10_) \n",
"- [Next steps](#toc11_) \n",
"\n"
@@ -119,10 +126,6 @@
"\n",
"**LLMTestCase**: A DeepEval object that represents a single test case with input, expected output, actual output, and optional context.\n",
"\n",
- "**Golden Templates**: Pre-defined test templates with inputs and expected outputs that can be converted to test cases by generating actual outputs.\n",
- "\n",
- "**G-Eval**: Generative evaluation using LLMs to assess response quality based on custom criteria.\n",
- "\n",
"**LLMAgentDataset**: A ValidMind dataset class that bridges DeepEval test cases with ValidMind's testing infrastructure.\n",
"\n",
"**RAG Evaluation**: Testing retrieval-augmented generation systems that combine document retrieval with generation.\n",
@@ -216,12 +219,10 @@
"# Core imports\n",
"import pandas as pd\n",
"import warnings\n",
- "from deepeval.test_case import LLMTestCase, ToolCall, LLMTestCaseParams\n",
- "from deepeval.dataset import Golden\n",
- "from deepeval.metrics import GEval\n",
+ "from deepeval.test_case import LLMTestCase, ToolCall\n",
"from validmind.datasets.llm import LLMAgentDataset\n",
"\n",
- "warnings.filterwarnings('ignore')\n"
+ "warnings.filterwarnings('ignore')"
]
},
{
@@ -239,15 +240,19 @@
"Let's start with the simplest use case: evaluating a basic question-and-answer interaction with an LLM. This demonstrates how to create LLMTestCase objects and integrate them with ValidMind's dataset infrastructure.\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create a simple LLM test case"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# Step 1: Create a simple LLM test case\n",
- "print(\"Creating a simple Q&A test case...\")\n",
- "\n",
"simple_test_cases = [\n",
"LLMTestCase(\n",
" input=\"What is machine learning?\",\n",
@@ -258,7 +263,14 @@
" model building. It uses algorithms that iteratively learn from data, allowing computers to find \n",
" hidden insights without being explicitly programmed where to look.\"\"\",\n",
" context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n",
- " retrieval_context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"]\n",
+ " retrieval_context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n",
+ " tools_called=[\n",
+ " ToolCall(\n",
+ " name=\"search_docs\",\n",
+ " args={\"query\": \"machine learning definition\"},\n",
+ " response=\"Found definition of machine learning in documentation.\"\n",
+ " )\n",
+ " ]\n",
"),\n",
"LLMTestCase(\n",
" input=\"What is deep learning?\",\n",
@@ -269,11 +281,31 @@
" with many layers to automatically learn representations of data with multiple levels of abstraction.\n",
" It has enabled major breakthroughs in AI applications.\"\"\",\n",
" context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n",
- " retrieval_context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"]\n",
- ")]\n",
- "\n",
- "\n",
- "# Step 2: Create LLMAgentDataset from the test case\n",
+ " retrieval_context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n",
+ " tools_called=[\n",
+ " ToolCall(\n",
+ " name=\"search_docs\", \n",
+ " args={\"query\": \"deep learning definition\"},\n",
+ " response=\"Found definition of deep learning in documentation.\"\n",
+ " )\n",
+ " ]\n",
+ ")]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create LLMAgentDataset from the test case\n",
+    "Let's create a ValidMind dataset from DeepEval's test cases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"print(\"\\nCreating ValidMind dataset...\")\n",
"\n",
"simple_dataset = LLMAgentDataset.from_test_cases(\n",
@@ -281,38 +313,22 @@
" input_id=\"simple_qa_dataset\"\n",
")\n",
"\n",
+ "\n",
"# Display the dataset\n",
+    "pd.set_option('display.width', 120)\n",
+    "pd.set_option('display.max_colwidth', None)\n",
"print(\"\\nDataset preview:\")\n",
- "display(simple_dataset.df)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def agent_fn(input):\n",
- " \"\"\"\n",
- " Invoke the simplified agent with the given input.\n",
- " \"\"\"\n",
- " \n",
- " return 1.23\n",
- "\n",
- " \n",
- "vm_model = vm.init_model(\n",
- " predict_fn=agent_fn,\n",
- " input_id=\"test_model\",\n",
- ")"
+ "display(simple_dataset.df)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "simple_dataset._df"
+    "### Compute metrics using the ValidMind scorer interface\n",
+ "\n",
+ "Now we'll compute metrics on our dataset using ValidMind's scorer interface. This will help us evaluate how well our model is performing by calculating various metrics like answer relevancy. The scorer interface provides a standardized way to assess model outputs against expected results.\n"
]
},
{
@@ -321,7 +337,12 @@
"metadata": {},
"outputs": [],
"source": [
- "simple_dataset.assign_scores(metrics = \"validmind.scorer.llm.deepeval.AnswerRelevancy\")"
+ "simple_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.AnswerRelevancy\",\n",
+ " input_column = \"input\",\n",
+ " actual_output_column = \"actual_output\",\n",
+ ")\n",
+ "simple_dataset._df.head()"
]
},
{
@@ -330,6 +351,11 @@
"metadata": {},
"outputs": [],
"source": [
+ "simple_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.Bias\",\n",
+ " input_column = \"input\",\n",
+ " actual_output_column = \"actual_output\",\n",
+ ")\n",
"simple_dataset._df.head()"
]
},
@@ -348,15 +374,23 @@
"Now let's evaluate a more complex use case: a Retrieval-Augmented Generation (RAG) system that retrieves relevant documents and generates responses based on them. RAG systems combine document retrieval with text generation, requiring specialized evaluation approaches.\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Create test cases"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# Create multiple RAG test cases\n",
- "print(\"Creating RAG evaluation test cases...\")\n",
"\n",
+ "print(\"Creating RAG evaluation test cases...\")\n",
"rag_test_cases = [\n",
" LLMTestCase(\n",
" input=\"How do I return a product that doesn't fit?\",\n",
@@ -402,9 +436,38 @@
" )\n",
"]\n",
"\n",
- "print(f\"Created {len(rag_test_cases)} RAG test cases\")\n",
+ "print(f\"Created {len(rag_test_cases)} RAG test cases\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Build dataset\n",
+ "\n",
+    "In this section, we'll convert our DeepEval LLMTestCase objects into a ValidMind dataset.\n",
+    "This allows us to leverage ValidMind's evaluation capabilities while maintaining\n",
+    "compatibility with DeepEval's test case structure.\n",
"\n",
- "# Create RAG dataset\n",
+ "The dataset will contain:\n",
+ "- Input queries\n",
+ "- Actual model outputs \n",
+ "- Expected outputs\n",
+ "- Context information\n",
+ "- Retrieved context passages\n",
+ "\n",
+ "This structured format enables detailed analysis of the RAG system's performance\n",
+ "across multiple evaluation dimensions.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"rag_dataset = LLMAgentDataset.from_test_cases(\n",
" test_cases=rag_test_cases,\n",
" input_id=\"rag_evaluation_dataset\"\n",
@@ -418,6 +481,63 @@
"display(rag_dataset.df[['input', 'actual_output', 'context', 'retrieval_context']].head())\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "### Evaluation metrics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "#### Contextual Relevancy\n",
+ "The Contextual Relevancy metric evaluates how well the retrieved context aligns with the input query.\n",
+    "It measures how much of the retrieved context is actually relevant to the query, rather than off-topic or extraneous.\n",
+    "A high relevancy score indicates that the retrieved statements focus on the information needed to answer the query.\n",
+ "This helps validate that the RAG system is retrieving appropriate context for the given queries.\n",
+ "\n",
+ "\n",
+ "\n",
+ "#### Contextual Precision\n",
+ "The Contextual Precision metric evaluates how well a RAG system ranks retrieved context nodes by relevance to the input query. \n",
+ "It checks if the most relevant nodes are ranked at the top of the retrieval results.\n",
+ "A high precision score indicates that the retrieved context is highly relevant to the query and properly ranked.\n",
+ "This is particularly useful for evaluating RAG systems and ensuring they surface the most relevant information first.\n",
+ "\n",
+ "\n",
+ "\n",
+ "#### Contextual Recall\n",
+ "The Contextual Recall metric evaluates how well the retrieved context covers all the information needed to generate the expected output.\n",
+ "It extracts statements from the expected output and checks how many of them can be attributed to the retrieved context.\n",
+ "A high recall score indicates that the retrieved context contains all the key information needed to generate the expected response.\n",
+ "This helps ensure the RAG system retrieves comprehensive context that covers all aspects of the expected answer.\n",
+ "\n",
+ "Now we'll evaluate the RAG system's performance using multiple metrics at once. The `assign_scores()` method accepts a list of metrics to evaluate different aspects of the system's behavior. The metrics will add score and reason columns to the dataset, providing quantitative and qualitative feedback on the system's performance. This multi-metric evaluation gives us comprehensive insights into the strengths and potential areas for improvement.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rag_dataset.assign_scores(\n",
+ " metrics = [\"validmind.scorer.llm.deepeval.ContextualRelevancy\",\n",
+ " \"validmind.scorer.llm.deepeval.ContextualPrecision\",\n",
+ " \"validmind.scorer.llm.deepeval.ContextualRecall\"],\n",
+ " input_column = \"input\",\n",
+ " expected_output_column = \"expected_output\",\n",
+ " retrieval_context_column = \"retrieval_context\",\n",
+ ")\n",
+ "display(rag_dataset._df.head(2))"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -430,7 +550,15 @@
"\n",
"## LLM Agent Evaluation\n",
"\n",
- "Let's evaluate LLM agents that can use tools to accomplish tasks. This is one of the most advanced evaluation scenarios, requiring assessment of both response quality and tool usage appropriateness.\n"
+ "Let's evaluate LLM agents that can use tools to accomplish tasks. This is one of the most advanced evaluation scenarios, requiring assessment of both response quality and tool usage appropriateness."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Create test cases\n"
]
},
{
@@ -442,6 +570,7 @@
"# Create LLM Agent test cases with tool usage\n",
"print(\"Creating Agent evaluation test cases...\")\n",
"\n",
+ "# Create test cases\n",
"agent_test_cases = [\n",
" LLMTestCase(\n",
" input=\"What's the weather like in New York City today?\",\n",
@@ -470,6 +599,11 @@
" description=\"Should fetch weather information for New York City\",\n",
" input_parameters={\"city\": \"New York City\"}\n",
" )\n",
+ " ],\n",
+ " retrieval_context=[\n",
+ " \"Temperature: 72°F, Condition: Partly Cloudy, Humidity: 60%, Wind: 8mph from west\",\n",
+ " \"No precipitation in forecast for today\",\n",
+ " \"Historical average temperature for this date: 70°F\"\n",
" ]\n",
" ),\n",
" LLMTestCase(\n",
@@ -487,13 +621,18 @@
" reasoning=\"Need to calculate compound interest using the standard formula\"\n",
" )\n",
" ],\n",
- " expected_tools=[\n",
- " ToolCall(\n",
- " name=\"Calculator\", \n",
- " description=\"Should perform compound interest calculation\",\n",
- " input_parameters={\"calculation_type\": \"compound_interest\"}\n",
- " )\n",
- " ]\n",
+ " expected_tools=[\n",
+ " ToolCall(\n",
+ " name=\"Calculator\", \n",
+ " description=\"Should perform compound interest calculation\",\n",
+ " input_parameters={\"calculation_type\": \"compound_interest\"}\n",
+ " )\n",
+ " ],\n",
+ " retrieval_context=[\n",
+ " \"Calculation result: $1,157.63\",\n",
+ " \"Formula used: A = P(1 + r)^t\",\n",
+ " \"Parameters: Principal=$1000, Rate=5%, Time=3 years\"\n",
+ " ]\n",
" ),\n",
" LLMTestCase(\n",
" input=\"Send an email to john@example.com about our meeting tomorrow at 2 PM\",\n",
@@ -514,18 +653,39 @@
" reasoning=\"User requested to send email, so I need to use the email tool with appropriate content\"\n",
" )\n",
" ],\n",
- " expected_tools=[\n",
- " ToolCall(\n",
- " name=\"EmailSender\",\n",
- " description=\"Should send an email about the meeting\",\n",
- " input_parameters={\"recipient\": \"john@example.com\"}\n",
- " )\n",
- " ]\n",
+ " expected_tools=[\n",
+ " ToolCall(\n",
+ " name=\"EmailSender\",\n",
+ " description=\"Should send an email about the meeting\",\n",
+ " input_parameters={\"recipient\": \"john@example.com\"}\n",
+ " )\n",
+ " ],\n",
+ " retrieval_context=[\n",
+ " \"Email sent successfully (msg_12345)\",\n",
+ " \"Recipient: john@example.com\",\n",
+ " \"Subject: Meeting Reminder - Tomorrow at 2 PM\",\n",
+ " \"Timestamp: 2024-01-15T10:30:00Z\"\n",
+ " ]\n",
" )\n",
"]\n",
+ "print(f\"Created {len(agent_test_cases)} Agent test cases\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
"\n",
- "print(f\"Created {len(agent_test_cases)} Agent test cases\")\n",
- "\n",
+ "### Build dataset\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"# Create Agent dataset\n",
"agent_dataset = LLMAgentDataset.from_test_cases(\n",
" test_cases=agent_test_cases,\n",
@@ -547,22 +707,20 @@
" print(f\" - {tool}: {count} times\")\n",
"\n",
"print(\"\\nAgent Dataset Preview:\")\n",
- "display(agent_dataset.df[['input', 'actual_output', 'tools_called']].head())\n"
+ "display(agent_dataset.df[['input', 'actual_output', 'tools_called']].head())"
]
},
{
"cell_type": "markdown",
- "metadata": {
- "vscode": {
- "languageId": "raw"
- }
- },
+ "metadata": {},
"source": [
- "\n",
+ "\n",
"\n",
- "## Working with Golden Templates\n",
+ "### Evaluation metrics\n",
+ "\n",
"\n",
- "Golden templates are a powerful feature of DeepEval that allow you to define test inputs and expected outputs, then generate actual outputs at evaluation time. This approach enables systematic testing across multiple scenarios.\n"
+ "#### Faithfulness\n",
+ "The Faithfulness metric evaluates whether the model's output contains any contradictions or hallucinations compared to the provided context. It ensures that the model's response is grounded in and consistent with the given information, rather than making up facts or contradicting the context. A high faithfulness score indicates that the model's output aligns well with the source material.\n"
]
},
{
@@ -571,84 +729,23 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create Golden templates\n",
- "print(\"Creating Golden templates...\")\n",
- "\n",
- "goldens = [\n",
- " Golden(\n",
- " input=\"Explain the concept of neural networks in simple terms\",\n",
- " expected_output=\"Neural networks are computing systems inspired by biological neural networks that constitute animal brains.\",\n",
- " context=[\"Neural networks are a key component of machine learning and artificial intelligence.\"]\n",
- " ),\n",
- " Golden(\n",
- " input=\"What are the main benefits of cloud computing for businesses?\", \n",
- " expected_output=\"Cloud computing offers scalability, cost-effectiveness, accessibility, and reduced infrastructure maintenance.\",\n",
- " context=[\"Cloud computing provides on-demand access to computing resources over the internet.\"]\n",
- " ),\n",
- " Golden(\n",
- " input=\"How does password encryption protect user data?\",\n",
- " expected_output=\"Password encryption converts passwords into unreadable formats using cryptographic algorithms, protecting against unauthorized access.\",\n",
- " context=[\"Encryption is a fundamental security technique used to protect sensitive information.\"]\n",
- " ),\n",
- " Golden(\n",
- " input=\"What is the difference between machine learning and deep learning?\",\n",
- " expected_output=\"Machine learning is a broad field of AI, while deep learning is a subset that uses neural networks with multiple layers.\",\n",
- " context=[\"Both are important areas of artificial intelligence with different approaches and applications.\"]\n",
+ "agent_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.Faithfulness\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"actual_output\",\n",
+    "    retrieval_context_column = \"retrieval_context\",\n",
" )\n",
- "]\n",
- "\n",
- "print(f\"Created {len(goldens)} Golden templates\")\n",
- "\n",
- "# Create dataset from goldens\n",
- "golden_dataset = LLMAgentDataset.from_goldens(\n",
- " goldens=goldens,\n",
- " input_id=\"golden_templates_dataset\"\n",
- ")\n",
+ "agent_dataset._df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
"\n",
- "print(f\"Golden Dataset: {golden_dataset}\")\n",
- "print(f\"Shape: {golden_dataset.df.shape}\")\n",
- "\n",
- "print(\"\\nGolden Templates Preview:\")\n",
- "display(golden_dataset.df[['input', 'expected_output', 'context', 'type']].head())\n",
- "\n",
- "# Mock LLM application function for demonstration\n",
- "def mock_llm_application(input_text: str) -> str:\n",
- " \"\"\"\n",
- " Simulate an LLM application generating responses.\n",
- " In production, this would be your actual LLM application.\n",
- " \"\"\"\n",
- " \n",
- " responses = {\n",
- " \"neural networks\": \"\"\"Neural networks are computational models inspired by the human brain. \n",
- " They consist of interconnected nodes (neurons) that process information by learning patterns from data. \n",
- " These networks can recognize complex patterns and make predictions, making them useful for tasks like \n",
- " image recognition, natural language processing, and decision-making.\"\"\",\n",
- " \n",
- " \"cloud computing\": \"\"\"Cloud computing provides businesses with flexible, scalable access to computing resources \n",
- " over the internet. Key benefits include reduced upfront costs, automatic scaling based on demand, \n",
- " improved collaboration through shared access, enhanced security through professional data centers, \n",
- " and reduced need for internal IT maintenance.\"\"\",\n",
- " \n",
- " \"password encryption\": \"\"\"Password encryption protects user data by converting passwords into complex, \n",
- " unreadable strings using mathematical algorithms. When you enter your password, it's immediately encrypted \n",
- " before storage or transmission. Even if data is intercepted, the encrypted password appears as random characters, \n",
- " making it virtually impossible for attackers to determine the original password.\"\"\",\n",
- " \n",
- " \"machine learning\": \"\"\"Machine learning is a broad approach to artificial intelligence where computers learn \n",
- " to make predictions or decisions by finding patterns in data. Deep learning is a specialized subset that uses \n",
- " artificial neural networks with multiple layers (hence 'deep') to process information in ways that mimic \n",
- " human brain function, enabling more sophisticated pattern recognition and decision-making.\"\"\"\n",
- " }\n",
- " \n",
- " # Simple keyword matching for demonstration\n",
- " input_lower = input_text.lower()\n",
- " for keyword, response in responses.items():\n",
- " if keyword in input_lower:\n",
- " return response.strip()\n",
- " \n",
- " return f\"Thank you for your question about: {input_text}. I'd be happy to provide a comprehensive answer based on current knowledge and best practices.\"\n",
- "\n",
- "print(f\"\\nMock LLM application ready - will generate responses for {len(goldens)} templates\")\n"
+ "#### Hallucination\n",
+ "The Hallucination metric evaluates whether the model's output contains information that is not supported by or contradicts the provided context. It helps identify cases where the model makes up facts or includes details that aren't grounded in the source material. A low hallucination score indicates that the model's response stays faithful to the given context without introducing unsupported information.\n"
]
},
{
@@ -657,57 +754,23 @@
"metadata": {},
"outputs": [],
"source": [
- "# Convert goldens to test cases by generating actual outputs\n",
- "print(\"Converting Golden templates to test cases...\")\n",
- "\n",
- "print(\"Before conversion:\")\n",
- "print(f\" - Test cases: {len(golden_dataset.test_cases)}\")\n",
- "print(f\" - Goldens: {len(golden_dataset.goldens)}\")\n",
- "\n",
- "# Convert goldens to test cases using our mock LLM\n",
- "golden_dataset.convert_goldens_to_test_cases(mock_llm_application)\n",
- "\n",
- "print(\"\\nAfter conversion:\")\n",
- "print(f\" - Test cases: {len(golden_dataset.test_cases)}\")\n",
- "print(f\" - Goldens: {len(golden_dataset.goldens)}\")\n",
- "\n",
- "print(\"\\nConversion completed!\")\n",
- "\n",
- "# Show the updated dataset\n",
- "print(\"\\nUpdated Dataset with Generated Outputs:\")\n",
- "dataset_df = golden_dataset.df\n",
- "# Filter for rows with actual output\n",
- "mask = pd.notna(dataset_df['actual_output']) & (dataset_df['actual_output'] != '')\n",
- "converted_df = dataset_df[mask]\n",
- "\n",
- "if not converted_df.empty:\n",
- " display(converted_df[['input', 'actual_output', 'expected_output']])\n",
- " \n",
- " # Analyze output lengths using pandas string methods\n",
- " actual_lengths = pd.Series([len(str(x)) for x in converted_df['actual_output']])\n",
- " expected_lengths = pd.Series([len(str(x)) for x in converted_df['expected_output']])\n",
- "else:\n",
- " print(\"No converted test cases found\")\n",
- "\n",
- "print(f\"\\nOutput Analysis:\")\n",
- "print(f\"Average actual output length: {actual_lengths.mean():.0f} characters\")\n",
- "print(f\"Average expected output length: {expected_lengths.mean():.0f} characters\")\n",
- "print(f\"Ratio (actual/expected): {(actual_lengths.mean() / expected_lengths.mean()):.2f}x\")\n"
+ "agent_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.Hallucination\",\n",
+ " input_column = \"input\",\n",
+ " actual_output_column = \"actual_output\",\n",
+ " context_column = \"retrieval_context\",\n",
+ ")\n",
+ "agent_dataset._df.head()"
]
},
{
"cell_type": "markdown",
- "metadata": {
- "vscode": {
- "languageId": "raw"
- }
- },
+ "metadata": {},
"source": [
- "\n",
- "\n",
- "## ValidMind Integration\n",
+ "\n",
"\n",
- "Now let's demonstrate how to integrate our LLMAgentDataset with ValidMind's testing framework, enabling comprehensive documentation and compliance features.\n"
+ "#### Summarization\n",
+    "The Summarization metric evaluates how well a model's output summarizes the source text provided as input, by generating assessment questions that check whether the summary is factually aligned with and sufficiently covers that text. It helps ensure that summaries are accurate, complete, and maintain the key information from the original content without introducing unsupported details or omitting critical points. A high summarization score indicates that the model effectively condenses the source material while preserving its essential meaning.\n"
]
},
{
@@ -716,71 +779,22 @@
"metadata": {},
"outputs": [],
"source": [
- "# Initialize ValidMind\n",
- "print(\"Integrating with ValidMind framework...\")\n",
- "\n",
- "try:\n",
- " # Initialize ValidMind\n",
- " vm.init()\n",
- " print(\"ValidMind initialized\")\n",
- " \n",
- " # Register our datasets with ValidMind\n",
- " datasets_to_register = [\n",
- " (simple_dataset, \"simple_qa_dataset\"),\n",
- " (rag_dataset, \"rag_evaluation_dataset\"),\n",
- " (agent_dataset, \"agent_evaluation_dataset\"),\n",
- " (golden_dataset, \"golden_templates_dataset\")\n",
- " ]\n",
- " \n",
- " for dataset, dataset_id in datasets_to_register:\n",
- " try:\n",
- " vm.init_dataset(\n",
- " dataset=dataset.df,\n",
- " input_id=dataset_id,\n",
- " text_column=\"input\",\n",
- " target_column=\"expected_output\"\n",
- " )\n",
- " print(f\"Registered: {dataset_id}\")\n",
- " except Exception as e:\n",
- " print(f\"WARNING: Failed to register {dataset_id}: {e}\")\n",
- " \n",
- " # Note: ValidMind datasets are now registered and can be used in test suites\n",
- " print(\"\\nValidMind Integration Complete:\")\n",
- " print(\" - Datasets registered successfully\")\n",
- " print(\" - Ready for use in ValidMind test suites\")\n",
- " print(\" - Can be referenced by their input_id in test configurations\")\n",
- " \n",
- "except Exception as e:\n",
- " print(f\"ERROR: ValidMind integration failed: {e}\")\n",
- " print(\"Note: Some ValidMind features may require additional setup\")\n",
- "\n",
- "# Demonstrate dataset compatibility\n",
- "print(f\"\\nDataset Compatibility Check:\")\n",
- "print(f\"All datasets inherit from VMDataset: SUCCESS\")\n",
- "\n",
- "for dataset, name in [(simple_dataset, \"Simple Q&A\"), (rag_dataset, \"RAG\"), (agent_dataset, \"Agent\"), (golden_dataset, \"Golden\")]:\n",
- " print(f\"\\n{name} Dataset:\")\n",
- " print(f\" - Type: {type(dataset).__name__}\")\n",
- " print(f\" - Inherits VMDataset: {hasattr(dataset, 'df')}\")\n",
- " print(f\" - Has text_column: {hasattr(dataset, 'text_column')}\")\n",
- " print(f\" - Has target_column: {hasattr(dataset, 'target_column')}\")\n",
- " print(f\" - DataFrame shape: {dataset.df.shape}\")\n",
- " print(f\" - Columns: {len(dataset.columns)}\")\n"
+ "agent_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.Summarization\",\n",
+ " input_column = \"input\",\n",
+ " actual_output_column = \"actual_output\",\n",
+ ")\n",
+ "agent_dataset._df.head()"
]
},
{
"cell_type": "markdown",
- "metadata": {
- "vscode": {
- "languageId": "raw"
- }
- },
+ "metadata": {},
"source": [
- "\n",
+ "\n",
"\n",
- "## Custom Metrics with G-Eval\n",
- "\n",
- "One of DeepEval's most powerful features is the ability to create custom evaluation metrics using G-Eval (Generative Evaluation). This enables domain-specific evaluation criteria tailored to your use case.\n"
+ "#### Task Completion\n",
+ "The Task Completion metric evaluates whether the model's output successfully accomplishes the intended task or goal specified in the input prompt. It assesses if the model has properly understood the task requirements and provided a complete and appropriate response. A high task completion score indicates that the model has effectively addressed the core objective of the prompt and delivered a satisfactory solution.\n"
]
},
{
@@ -789,93 +803,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# Create custom evaluation metrics using G-Eval\n",
- "print(\"Creating custom evaluation metrics...\")\n",
- "\n",
- "# Custom metric 1: Technical Accuracy\n",
- "technical_accuracy_metric = GEval(\n",
- " name=\"Technical Accuracy\",\n",
- " criteria=\"\"\"Evaluate whether the response is technically accurate and uses appropriate \n",
- " terminology for the domain. Consider if the explanations are scientifically sound \n",
- " and if technical concepts are explained correctly.\"\"\",\n",
- " evaluation_params=[\n",
- " LLMTestCaseParams.INPUT,\n",
- " LLMTestCaseParams.ACTUAL_OUTPUT,\n",
- " LLMTestCaseParams.CONTEXT\n",
- " ],\n",
- " threshold=0.8\n",
- ")\n",
- "\n",
- "# Custom metric 2: Clarity and Comprehensiveness \n",
- "clarity_metric = GEval(\n",
- " name=\"Clarity and Comprehensiveness\",\n",
- " criteria=\"\"\"Assess whether the response is clear, well-structured, and comprehensive. \n",
- " The response should be easy to understand, logically organized, and address all \n",
- " aspects of the user's question without being overly verbose.\"\"\",\n",
- " evaluation_params=[\n",
- " LLMTestCaseParams.INPUT,\n",
- " LLMTestCaseParams.ACTUAL_OUTPUT\n",
- " ],\n",
- " threshold=0.75\n",
- ")\n",
- "\n",
- "# Custom metric 3: Business Context Appropriateness\n",
- "business_context_metric = GEval(\n",
- " name=\"Business Context Appropriateness\", \n",
- " criteria=\"\"\"Evaluate whether the response is appropriate for a business context. \n",
- " Consider if the tone is professional, if the content is relevant to business needs, \n",
- " and if it provides actionable information that would be valuable to a business user.\"\"\",\n",
- " evaluation_params=[\n",
- " LLMTestCaseParams.INPUT,\n",
- " LLMTestCaseParams.ACTUAL_OUTPUT,\n",
- " LLMTestCaseParams.EXPECTED_OUTPUT\n",
- " ],\n",
- " threshold=0.7\n",
- ")\n",
+ "agent_dataset.assign_scores(\n",
+ " metrics = \"validmind.scorer.llm.deepeval.TaskCompletion\",\n",
+ " input_column = \"input\",\n",
+ " actual_output_column = \"actual_output\",\n",
+ " agent_output_column = \"agent_output\",\n",
+ " tools_called_column = \"tools_called\",\n",
"\n",
- "# Custom metric 4: Tool Usage Appropriateness (for agents)\n",
- "tool_usage_metric = GEval(\n",
- " name=\"Tool Usage Appropriateness\",\n",
- " criteria=\"\"\"Evaluate whether the agent used appropriate tools for the given task. \n",
- " Consider if the tools were necessary, if they were used correctly, and if the \n",
- " agent's reasoning for tool selection was sound.\"\"\",\n",
- " evaluation_params=[\n",
- " LLMTestCaseParams.INPUT,\n",
- " LLMTestCaseParams.ACTUAL_OUTPUT\n",
- " ],\n",
- " threshold=0.8\n",
")\n",
- "\n",
- "custom_metrics = [\n",
- " technical_accuracy_metric,\n",
- " clarity_metric, \n",
- " business_context_metric,\n",
- " tool_usage_metric\n",
- "]\n",
- "\n",
- "print(\"Custom metrics created:\")\n",
- "for metric in custom_metrics:\n",
- " print(f\" - {metric.name}: threshold {metric.threshold}\")\n",
- "\n",
- "# Demonstrate metric application to different dataset types\n",
- "print(f\"\\nMetric-Dataset Matching:\")\n",
- "metric_dataset_pairs = [\n",
- " (\"Technical Accuracy\", \"golden_templates_dataset (tech questions)\"),\n",
- " (\"Clarity and Comprehensiveness\", \"simple_qa_dataset (general Q&A)\"),\n",
- " (\"Business Context Appropriateness\", \"rag_evaluation_dataset (business support)\"),\n",
- " (\"Tool Usage Appropriateness\", \"agent_evaluation_dataset (agent actions)\")\n",
- "]\n",
- "\n",
- "for metric_name, dataset_name in metric_dataset_pairs:\n",
- " print(f\" - {metric_name} → {dataset_name}\")\n",
- "\n",
- "print(f\"\\nEvaluation Setup (Demo Mode):\")\n",
- "print(\"Note: Actual evaluation requires OpenAI API key\")\n",
- "print(\"These metrics would evaluate:\")\n",
- "print(\" - Technical accuracy of AI/ML explanations\") \n",
- "print(\" - Clarity of business support responses\")\n",
- "print(\" - Appropriateness of agent tool usage\")\n",
- "print(\" - Overall comprehensiveness across all domains\")\n"
+ "agent_dataset._df.head()"
]
},
{
@@ -895,8 +831,6 @@
"**Key Achievements:**\n",
"- Successfully created and evaluated different types of LLM test cases (Q&A, RAG, Agents)\n",
"- Integrated DeepEval metrics with ValidMind's testing infrastructure\n",
- "- Demonstrated Golden template workflows for systematic testing\n",
- "- Created custom evaluation metrics using G-Eval\n",
"- Showed how to handle complex agent scenarios with tool usage\n",
"\n",
"**Integration Benefits:**\n",
diff --git a/pyproject.toml b/pyproject.toml
index e49687e8a..ff8a7f5bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "validmind"
-version = "2.10.0"
+version = "2.10.1"
description = "ValidMind Library"
readme = "README.pypi.md"
requires-python = ">=3.9,<3.13"
diff --git a/validmind/__version__.py b/validmind/__version__.py
index 1c622223b..565443f86 100644
--- a/validmind/__version__.py
+++ b/validmind/__version__.py
@@ -1 +1 @@
-__version__ = "2.10.0"
+__version__ = "2.10.1"
diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py
index 5441c80de..abb4335af 100644
--- a/validmind/datasets/llm/agent_dataset.py
+++ b/validmind/datasets/llm/agent_dataset.py
@@ -141,9 +141,7 @@ def _convert_to_dataframe(self) -> pd.DataFrame:
"retrieval_context": self._serialize_list_field(
getattr(test_case, "retrieval_context", None)
),
- "tools_called": self._serialize_tools_field(
- getattr(test_case, "tools_called", None)
- ),
+ "tools_called": getattr(test_case, "tools_called", None),
"expected_tools": self._serialize_tools_field(
getattr(test_case, "expected_tools", None)
),
diff --git a/validmind/scorer/llm/deepeval/AnswerRelevancy.py b/validmind/scorer/llm/deepeval/AnswerRelevancy.py
index 86addeb88..784203f76 100644
--- a/validmind/scorer/llm/deepeval/AnswerRelevancy.py
+++ b/validmind/scorer/llm/deepeval/AnswerRelevancy.py
@@ -76,8 +76,8 @@ def AnswerRelevancy(
)
results = []
for _, test_case in dataset.df.iterrows():
- input = test_case["input"]
- actual_output = test_case["actual_output"]
+ input = test_case[input_column]
+ actual_output = test_case[actual_output_column]
test_case = LLMTestCase(
input=input,
diff --git a/validmind/scorer/llm/deepeval/Bias.py b/validmind/scorer/llm/deepeval/Bias.py
new file mode 100644
index 000000000..12b2fe671
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/Bias.py
@@ -0,0 +1,103 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import BiasMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for Bias. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create custom ValidMind tests for DeepEval metrics
+@scorer()
+@tags("llm", "Bias", "deepeval")
+@tasks("llm")
+def Bias(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ actual_output_column: str = "actual_output",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Detects bias in LLM outputs using deepeval's BiasMetric.
+
+ This scorer evaluates whether an LLM's output contains biased opinions (e.g.,
+ gender, racial, political). It returns a list of dictionaries per row with the
+ bias score and the reason behind the evaluation.
+
+ Args:
+ dataset: The dataset containing input prompts and LLM outputs
+ threshold: Maximum passing threshold for bias (default: 0.5)
+ input_column: Column name for the input prompts (default: "input")
+ actual_output_column: Column name for the model outputs (default: "actual_output")
+ strict_mode: If True, enforces a binary score (0 for perfect, 1 otherwise)
+
+ Returns:
+ List[Dict[str, Any]]: Per-row bias scores and reasons as a list of dictionaries.
+ Each dictionary contains:
+ - "score": float - The bias score (0.0 to 1.0)
+ - "reason": str - Explanation of why the score was assigned
+
+ Raises:
+ ValueError: If required columns are not found in the dataset
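+
+    Example:
+        Illustrative usage via `assign_scores` (dataset and column names depend on your data):
+
+            dataset.assign_scores(
+                metrics="validmind.scorer.llm.deepeval.Bias",
+                input_column="input",
+                actual_output_column="actual_output",
+            )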
+ """
+
+ # Validate required columns exist in dataset
+ if input_column not in dataset.df.columns:
+ raise ValueError(
+ f"Input column '{input_column}' not found in dataset. Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ if actual_output_column not in dataset.df.columns:
+ raise ValueError(
+ f"Actual output column '{actual_output_column}' not found in dataset. Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = BiasMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ actual_output_value = row[actual_output_column]
+
+ test_case = LLMTestCase(
+ input=input_value,
+ actual_output=actual_output_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+
+ # Extract score and reason from the metric result
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/ContextualPrecision.py b/validmind/scorer/llm/deepeval/ContextualPrecision.py
new file mode 100644
index 000000000..45959ee37
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/ContextualPrecision.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import ContextualPrecisionMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for ContextualPrecision. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create custom ValidMind tests for DeepEval metrics
+@scorer()
+@tags("llm", "ContextualPrecision", "deepeval")
+@tasks("llm")
+def ContextualPrecision(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ expected_output_column: str = "expected_output",
+ retrieval_context_column: str = "retrieval_context",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates RAG retriever ranking using deepeval's ContextualPrecisionMetric.
+
+ The metric checks whether retrieved nodes are correctly ranked by relevance to the
+ query-only input and returns per-row score and reason.
+
+ Args:
+ dataset: Dataset containing query, expected_output, and retrieval_context
+ threshold: Minimum passing threshold (default: 0.5)
+ input_column: Column name for the query-only input (default: "input")
+ expected_output_column: Column for the reference output (default: "expected_output")
+ retrieval_context_column: Column with ranked retrieved nodes list (default: "retrieval_context")
+    strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
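+
+    Example:
+        Illustrative usage via `assign_scores` (dataset and column names depend on your data):
+
+            rag_dataset.assign_scores(
+                metrics="validmind.scorer.llm.deepeval.ContextualPrecision",
+                input_column="input",
+                expected_output_column="expected_output",
+                retrieval_context_column="retrieval_context",
+            )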
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, expected_output_column, retrieval_context_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = ContextualPrecisionMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ expected_output_value = row[expected_output_column]
+ retrieval_context_value = (
+ [row[retrieval_context_column]]
+ if not isinstance(row[retrieval_context_column], list)
+ else row[retrieval_context_column]
+ )
+
+ # Ensure retrieval_context is a list of strings
+ if not isinstance(retrieval_context_value, list):
+ raise ValueError(
+ f"Value in '{retrieval_context_column}' must be a list of strings; got {type(retrieval_context_value)}"
+ )
+
+ test_case = LLMTestCase(
+ input=input_value,
+ expected_output=expected_output_value,
+ retrieval_context=retrieval_context_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/ContextualRecall.py b/validmind/scorer/llm/deepeval/ContextualRecall.py
new file mode 100644
index 000000000..ee6df890f
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/ContextualRecall.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import ContextualRecallMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for ContextualRecall. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create custom ValidMind tests for DeepEval metrics
+@scorer()
+@tags("llm", "ContextualRecall", "deepeval")
+@tasks("llm")
+def ContextualRecall(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ expected_output_column: str = "expected_output",
+ retrieval_context_column: str = "retrieval_context",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates RAG retriever coverage using deepeval's ContextualRecallMetric.
+
+ The metric extracts statements from the expected output and checks how many are
+ attributable to the retrieved context. Returns per-row score and reason.
+
+ Args:
+ dataset: Dataset containing query, expected_output, and retrieval_context
+ threshold: Minimum passing threshold (default: 0.5)
+ input_column: Column name for the query-only input (default: "input")
+ expected_output_column: Column for the reference output (default: "expected_output")
+ retrieval_context_column: Column with ranked retrieved nodes list (default: "retrieval_context")
+    strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
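+
+    Example:
+        Illustrative usage via `assign_scores` (dataset and column names depend on your data):
+
+            rag_dataset.assign_scores(
+                metrics="validmind.scorer.llm.deepeval.ContextualRecall",
+                input_column="input",
+                expected_output_column="expected_output",
+                retrieval_context_column="retrieval_context",
+            )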
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, expected_output_column, retrieval_context_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = ContextualRecallMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ expected_output_value = row[expected_output_column]
+ retrieval_context_value = (
+ [row[retrieval_context_column]]
+ if not isinstance(row[retrieval_context_column], list)
+ else row[retrieval_context_column]
+ )
+
+ # Ensure retrieval_context is a list of strings
+ if not isinstance(retrieval_context_value, list):
+ raise ValueError(
+ f"Value in '{retrieval_context_column}' must be a list of strings; got {type(retrieval_context_value)}"
+ )
+
+ test_case = LLMTestCase(
+ input=input_value,
+ expected_output=expected_output_value,
+ retrieval_context=retrieval_context_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/ContextualRelevancy.py b/validmind/scorer/llm/deepeval/ContextualRelevancy.py
new file mode 100644
index 000000000..1e0c7708c
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/ContextualRelevancy.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import ContextualRelevancyMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+            "Missing required package `deepeval` for ContextualRelevancy. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create custom ValidMind tests for DeepEval metrics
+@scorer()
+@tags("llm", "ContextualRelevancy", "deepeval")
+@tasks("llm")
+def ContextualRelevancy(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ expected_output_column: str = "expected_output",
+ retrieval_context_column: str = "retrieval_context",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates RAG retriever relevancy using deepeval's ContextualRelevancyMetric.
+
+ This metric checks whether statements in the retrieved context are relevant to the
+ query-only input. Returns per-row score and reason.
+
+ Args:
+ dataset: Dataset containing query, expected_output, and retrieval_context
+ threshold: Minimum passing threshold (default: 0.5)
+ input_column: Column name for the query-only input (default: "input")
+ expected_output_column: Column for the reference output (default: "expected_output")
+ retrieval_context_column: Column with ranked retrieved nodes list (default: "retrieval_context")
+    strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
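+
+    Example:
+        Illustrative usage via `assign_scores` (dataset and column names depend on your data):
+
+            rag_dataset.assign_scores(
+                metrics="validmind.scorer.llm.deepeval.ContextualRelevancy",
+                input_column="input",
+                expected_output_column="expected_output",
+                retrieval_context_column="retrieval_context",
+            )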
+ """
+
+ # Validate required columns
+ missing_columns: List[str] = []
+ for col in [input_column, expected_output_column, retrieval_context_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = ContextualRelevancyMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ expected_output_value = row[expected_output_column]
+ retrieval_context_value = (
+ [row[retrieval_context_column]]
+ if not isinstance(row[retrieval_context_column], list)
+ else row[retrieval_context_column]
+ )
+
+        # Ensure retrieval_context contains only strings
+        if not all(isinstance(item, str) for item in retrieval_context_value):
+            raise ValueError(
+                f"Value in '{retrieval_context_column}' must be a list of strings; got {type(row[retrieval_context_column])}"
+            )
+
+ test_case = LLMTestCase(
+ input=input_value,
+ expected_output=expected_output_value,
+ retrieval_context=retrieval_context_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/Faithfulness.py b/validmind/scorer/llm/deepeval/Faithfulness.py
new file mode 100644
index 000000000..b37d32cc3
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/Faithfulness.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import FaithfulnessMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for Faithfulness. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create a custom ValidMind scorer wrapping this DeepEval metric
+@scorer()
+@tags("llm", "Faithfulness", "deepeval")
+@tasks("llm")
+def Faithfulness(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ actual_output_column: str = "actual_output",
+ retrieval_context_column: str = "retrieval_context",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates RAG generator faithfulness using deepeval's FaithfulnessMetric.
+
+ The metric extracts claims from the actual output and checks how many are
+ supported by the retrieved context. Returns per-row score and reason.
+
+ Args:
+ dataset: Dataset containing query, actual_output, and retrieval_context
+ threshold: Minimum passing threshold (default: 0.5)
+ input_column: Column name for the query-only input (default: "input")
+ actual_output_column: Column for the generator output (default: "actual_output")
+ retrieval_context_column: Column with ranked retrieved nodes list (default: "retrieval_context")
+        strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
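+
+    Example:
+        A minimal usage sketch (assuming `vm_dataset` is a `VMDataset` with the
+        default "input", "actual_output", and "retrieval_context" columns, and
+        that the `@scorer()` decorator keeps the function directly callable):
+
+            rows = Faithfulness(dataset=vm_dataset, threshold=0.8)
+            unfaithful = [r for r in rows if r["score"] < 0.8]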
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, actual_output_column, retrieval_context_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = FaithfulnessMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ actual_output_value = row[actual_output_column]
+ retrieval_context_value = (
+ [row[retrieval_context_column]]
+ if not isinstance(row[retrieval_context_column], list)
+ else row[retrieval_context_column]
+ )
+
+        # Ensure retrieval_context contains only strings
+        if not all(isinstance(item, str) for item in retrieval_context_value):
+            raise ValueError(
+                f"Value in '{retrieval_context_column}' must be a list of strings; got {type(row[retrieval_context_column])}"
+            )
+
+ test_case = LLMTestCase(
+ input=input_value,
+ actual_output=actual_output_value,
+ retrieval_context=retrieval_context_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/Hallucination.py b/validmind/scorer/llm/deepeval/Hallucination.py
new file mode 100644
index 000000000..ace0f37b6
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/Hallucination.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import HallucinationMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for Hallucination. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create a custom ValidMind scorer wrapping this DeepEval metric
+@scorer()
+@tags("llm", "Hallucination", "deepeval")
+@tasks("llm")
+def Hallucination(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ actual_output_column: str = "actual_output",
+ context_column: str = "context",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Detects hallucinations in LLM outputs using deepeval's HallucinationMetric.
+
+ The metric checks whether the actual output contradicts the provided context,
+ treating the context as ground truth. Returns per-row score and reason.
+
+ Args:
+ dataset: Dataset containing input, actual_output, and context
+ threshold: Maximum passing threshold (default: 0.5)
+ input_column: Column name for the input (default: "input")
+ actual_output_column: Column for the model output (default: "actual_output")
+ context_column: Column with context list (default: "context")
+ strict_mode: If True, enforces a binary score (0 for perfect, 1 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
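+
+    Example:
+        A minimal usage sketch (assuming `vm_dataset` is a `VMDataset` with
+        "input", "actual_output", and "context" columns, and that the
+        `@scorer()` decorator keeps the function directly callable). Lower
+        scores are better for this metric:
+
+            rows = Hallucination(dataset=vm_dataset, threshold=0.3)
+            flagged = [r for r in rows if r["score"] > 0.3]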
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, actual_output_column, context_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = HallucinationMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ actual_output_value = row[actual_output_column]
+ context_value = (
+ [row[context_column]]
+ if not isinstance(row[context_column], list)
+ else row[context_column]
+ )
+
+        # Ensure context contains only strings
+        if not all(isinstance(item, str) for item in context_value):
+            raise ValueError(
+                f"Value in '{context_column}' must be a list of strings; got {type(row[context_column])}"
+            )
+
+ test_case = LLMTestCase(
+ input=input_value,
+ actual_output=actual_output_value,
+ context=context_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/Summarization.py b/validmind/scorer/llm/deepeval/Summarization.py
new file mode 100644
index 000000000..809d038e6
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/Summarization.py
@@ -0,0 +1,111 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List, Optional
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import SummarizationMetric
+ from deepeval.test_case import LLMTestCase
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for Summarization. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+# Create a custom ValidMind scorer wrapping this DeepEval metric
+@scorer()
+@tags("llm", "Summarization", "deepeval")
+@tasks("llm")
+def Summarization(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ actual_output_column: str = "actual_output",
+ assessment_questions: Optional[List[str]] = None,
+ n: int = 5,
+ truths_extraction_limit: Optional[int] = None,
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates summary quality using deepeval's SummarizationMetric.
+
+ The metric generates or uses provided close-ended questions to assess if the
+ summary is factually aligned with and sufficiently covers the source text.
+
+ Args:
+ dataset: Dataset containing original text and generated summary
+ threshold: Minimum passing threshold (default: 0.5)
+ input_column: Column name for the original text (default: "input")
+ actual_output_column: Column for the generated summary (default: "actual_output")
+ assessment_questions: Optional list of yes/no questions to assess the summary
+ n: Number of assessment questions to generate when not provided (default: 5)
+ truths_extraction_limit: Optional cap for number of truths extracted from input
+        strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
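+
+    Example:
+        A minimal usage sketch (assuming `vm_dataset` is a `VMDataset` whose
+        "input" column holds source texts and whose "actual_output" column
+        holds generated summaries; the assessment questions are illustrative):
+
+            rows = Summarization(
+                dataset=vm_dataset,
+                threshold=0.6,
+                assessment_questions=[
+                    "Does the summary state the loan amount?",
+                    "Does the summary preserve the stated risk level?",
+                ],
+            )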
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, actual_output_column]:
+ if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ # Build metric with optional parameters
+ metric_kwargs: Dict[str, Any] = dict(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+ if assessment_questions is not None:
+ metric_kwargs["assessment_questions"] = assessment_questions
+ else:
+ metric_kwargs["n"] = n
+ if truths_extraction_limit is not None:
+ metric_kwargs["truths_extraction_limit"] = truths_extraction_limit
+
+ metric = SummarizationMetric(**metric_kwargs)
+
+ results: List[Dict[str, Any]] = []
+ for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ actual_output_value = row[actual_output_column]
+
+ test_case = LLMTestCase(
+ input=input_value,
+ actual_output=actual_output_value,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/scorer/llm/deepeval/TaskCompletion.py b/validmind/scorer/llm/deepeval/TaskCompletion.py
new file mode 100644
index 000000000..9599b49d0
--- /dev/null
+++ b/validmind/scorer/llm/deepeval/TaskCompletion.py
@@ -0,0 +1,208 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+ from deepeval import evaluate
+ from deepeval.metrics import TaskCompletionMetric
+ from deepeval.test_case import LLMTestCase, ToolCall
+except ImportError as e:
+ if "deepeval" in str(e):
+ raise MissingDependencyError(
+ "Missing required package `deepeval` for TaskCompletion. "
+ "Please run `pip install validmind[llm]` to use LLM tests",
+ required_dependencies=["deepeval"],
+ extra="llm",
+ ) from e
+
+ raise e
+
+
+def _extract_tool_responses(messages: List[Any]) -> Dict[str, str]:
+ """Extract tool responses from messages."""
+ tool_responses = {}
+
+ for message in messages:
+ # Handle both object and dictionary formats
+ if isinstance(message, dict):
+ # Dictionary format
+ if (
+ message.get("name")
+ and message.get("content")
+ and message.get("tool_call_id")
+ ):
+ tool_responses[message["tool_call_id"]] = message["content"]
+ else:
+ # Object format
+ if hasattr(message, "name") and hasattr(message, "content"):
+ if hasattr(message, "tool_call_id"):
+ tool_responses[message.tool_call_id] = message.content
+
+ return tool_responses
+
+
+def _extract_tool_calls_from_message(
+ message: Any, tool_responses: Dict[str, str]
+) -> List[ToolCall]:
+ """Extract tool calls from a single message."""
+ tool_calls = []
+
+ # Handle both object and dictionary formats
+ if isinstance(message, dict):
+ # Dictionary format
+ if message.get("tool_calls"):
+ for tool_call in message["tool_calls"]:
+ tool_name = tool_call.get("name")
+ tool_args = tool_call.get("args", {})
+ tool_id = tool_call.get("id")
+
+ if tool_name and tool_id:
+ # Get the response for this tool call
+ response = tool_responses.get(tool_id, "")
+
+ # Create ToolCall object
+ tool_call_obj = ToolCall(
+ name=tool_name, input_parameters=tool_args, output=response
+ )
+ tool_calls.append(tool_call_obj)
+ else:
+ # Object format
+ if hasattr(message, "tool_calls") and message.tool_calls:
+ for tool_call in message.tool_calls:
+ # Handle both dictionary and object formats
+ if isinstance(tool_call, dict):
+ tool_name = tool_call.get("name")
+ tool_args = tool_call.get("args", {})
+ tool_id = tool_call.get("id")
+ else:
+ # ToolCall object
+ tool_name = getattr(tool_call, "name", None)
+ tool_args = getattr(tool_call, "args", {})
+ tool_id = getattr(tool_call, "id", None)
+
+ if tool_name and tool_id:
+ # Get the response for this tool call
+ response = tool_responses.get(tool_id, "")
+
+ # Create ToolCall object
+ tool_call_obj = ToolCall(
+ name=tool_name, input_parameters=tool_args, output=response
+ )
+ tool_calls.append(tool_call_obj)
+
+ return tool_calls
+
+
+def extract_tool_calls_from_agent_output(
+ agent_output: Dict[str, Any]
+) -> List[ToolCall]:
+ """
+ Extract tool calls from the banking_agent_model_output column.
+
+ Args:
+ agent_output: The dictionary from banking_agent_model_output column
+
+ Returns:
+        List of deepeval ToolCall objects with name, input_parameters, and output
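+
+    Example:
+        An illustrative (hypothetical) input shape, mirroring a LangGraph-style
+        message list where one message requests a tool and a later message
+        carries its response:
+
+            agent_output = {
+                "messages": [
+                    {"tool_calls": [{"name": "credit_risk_analyzer",
+                                     "args": {"loan_amount": 250000}, "id": "call_1"}]},
+                    {"name": "credit_risk_analyzer", "content": "MEDIUM RISK",
+                     "tool_call_id": "call_1"},
+                ]
+            }
+            tool_calls = extract_tool_calls_from_agent_output(agent_output)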
+ """
+ tool_calls = []
+
+ if not isinstance(agent_output, dict) or "messages" not in agent_output:
+ return tool_calls
+
+ messages = agent_output["messages"]
+
+ # First pass: collect tool responses
+ tool_responses = _extract_tool_responses(messages)
+
+ # Second pass: extract tool calls and match with responses
+ for message in messages:
+ message_tool_calls = _extract_tool_calls_from_message(message, tool_responses)
+ tool_calls.extend(message_tool_calls)
+
+ return tool_calls
+
+
+# Create a custom ValidMind scorer wrapping this DeepEval metric
+@scorer()
+@tags("llm", "TaskCompletion", "deepeval")
+@tasks("llm")
+def TaskCompletion(
+ dataset: VMDataset,
+ threshold: float = 0.5,
+ input_column: str = "input",
+ actual_output_column: str = "actual_output",
+ agent_output_column: str = "agent_output",
+ tools_called_column: str = "tools_called",
+ strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+ """Evaluates agent task completion using deepeval's TaskCompletionMetric.
+
+    This metric assesses whether the agent's final output and tool calls
+    complete the requested task. Returns per-row score and reason.
+
+ Args:
+        dataset: Dataset containing the agent input and final output
+        threshold: Minimum passing threshold (default: 0.5)
+        input_column: Column name for the task input (default: "input")
+        actual_output_column: Column for the agent's final output (default: "actual_output")
+        agent_output_column: Column with the raw agent output dict used to extract tool calls (default: "agent_output")
+        tools_called_column: Column with pre-extracted tool calls; takes precedence over agent_output_column when present (default: "tools_called")
+        strict_mode: If True, enforces a binary score (1 for perfect, 0 otherwise)
+
+ Returns:
+ List[Dict[str, Any]] with keys "score" and "reason" for each row.
+
+ Raises:
+ ValueError: If required columns are missing
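+
+    Example:
+        A minimal usage sketch (assuming `vm_dataset` is a `VMDataset` with
+        "input" and "actual_output" columns plus either a "tools_called" or an
+        "agent_output" column, and that the `@scorer()` decorator keeps the
+        function directly callable):
+
+            rows = TaskCompletion(
+                dataset=vm_dataset,
+                agent_output_column="agent_output",
+            )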
+ """
+
+ # Validate required columns exist in dataset
+ missing_columns: List[str] = []
+ for col in [input_column, actual_output_column]:
+        if col not in dataset.df.columns:
+ missing_columns.append(col)
+ if missing_columns:
+ raise ValueError(
+ f"Required columns {missing_columns} not found in dataset. "
+ f"Available columns: {dataset.df.columns.tolist()}"
+ )
+
+ _, model = get_client_and_model()
+
+ metric = TaskCompletionMetric(
+ threshold=threshold,
+ model=model,
+ include_reason=True,
+ strict_mode=strict_mode,
+ verbose_mode=False,
+ )
+
+ results: List[Dict[str, Any]] = []
+    for _, row in dataset.df.iterrows():
+ input_value = row[input_column]
+ actual_output_value = row[actual_output_column]
+        if tools_called_column in dataset.df.columns:
+ all_tool_calls = row[tools_called_column]
+ else:
+ agent_output = row.get(agent_output_column, {})
+ all_tool_calls = extract_tool_calls_from_agent_output(agent_output)
+
+ test_case = LLMTestCase(
+ input=input_value,
+ actual_output=actual_output_value,
+ tools_called=all_tool_calls,
+ )
+
+ result = evaluate(test_cases=[test_case], metrics=[metric])
+ metric_data = result.test_results[0].metrics_data[0]
+ score = metric_data.score
+ reason = getattr(metric_data, "reason", "No reason provided")
+ results.append({"score": score, "reason": reason})
+
+ return results
diff --git a/validmind/tests/plots/BoxPlot.py b/validmind/tests/plots/BoxPlot.py
index 9074237d8..cd0b1b4a1 100644
--- a/validmind/tests/plots/BoxPlot.py
+++ b/validmind/tests/plots/BoxPlot.py
@@ -4,6 +4,7 @@
from typing import List, Optional
+import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
@@ -16,17 +17,29 @@ def _validate_inputs(
dataset: VMDataset, columns: Optional[List[str]], group_by: Optional[str]
):
"""Validate inputs and return validated columns."""
+
+    # Determine numeric columns from dtypes only, without loading data into memory;
+    # when no columns are given, inspect every column in the dataset
+    if columns is not None and not isinstance(columns, list):
+        columns = [columns]
+
+    columns_dtypes = (
+        dataset._df[columns].dtypes if columns is not None else dataset._df.dtypes
+    )
+    columns_numeric = columns_dtypes[
+        columns_dtypes.apply(pd.api.types.is_numeric_dtype)
+    ].index.tolist()
if columns is None:
- columns = dataset.feature_columns_numeric
+ columns = columns_numeric
else:
- available_columns = set(dataset.feature_columns_numeric)
+ available_columns = set(columns_numeric)
columns = [col for col in columns if col in available_columns]
if not columns:
raise SkipTestError("No numerical columns found for box plotting")
if group_by is not None:
- if group_by not in dataset.df.columns:
+ if group_by not in dataset._df.columns:
raise SkipTestError(f"Group column '{group_by}' not found in dataset")
if group_by in columns:
columns.remove(group_by)