From a740d278fc203bf61744928d98aff374dfea01b8 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Wed, 17 Dec 2025 12:59:27 +0000 Subject: [PATCH 1/6] add agetic evaluation tests --- .gitignore | 3 + .../deepeval_integration_demo.ipynb | 267 ++++++++++++--- poetry.lock | 306 +----------------- pyproject.toml | 2 +- validmind/datasets/llm/agent_dataset.py | 20 +- .../llm/deepeval/ArgumentCorrectness.py | 118 +++++++ .../scorer/llm/deepeval/PlanAdherence.py | 150 +++++++++ validmind/scorer/llm/deepeval/PlanQuality.py | 105 ++++++ .../scorer/llm/deepeval/ToolCorrectness.py | 130 ++++++++ validmind/scorer/llm/deepeval/__init__.py | 159 ++++++++- 10 files changed, 920 insertions(+), 340 deletions(-) create mode 100644 validmind/scorer/llm/deepeval/ArgumentCorrectness.py create mode 100644 validmind/scorer/llm/deepeval/PlanAdherence.py create mode 100644 validmind/scorer/llm/deepeval/PlanQuality.py create mode 100644 validmind/scorer/llm/deepeval/ToolCorrectness.py diff --git a/.gitignore b/.gitignore index 539d53570..42e56c29e 100644 --- a/.gitignore +++ b/.gitignore @@ -230,3 +230,6 @@ docs/validmind.json # DeepEval *.deepeval/ + +# Python cache +__pycache__/ diff --git a/notebooks/code_sharing/deepeval_integration_demo.ipynb b/notebooks/code_sharing/deepeval_integration_demo.ipynb index 1de828941..8ce0a67ad 100644 --- a/notebooks/code_sharing/deepeval_integration_demo.ipynb +++ b/notebooks/code_sharing/deepeval_integration_demo.ipynb @@ -51,7 +51,7 @@ " - [Faithfulness](#toc6_3_1_) \n", " - [Hallucination](#toc6_3_2_) \n", " - [Summarization](#toc6_3_3_) \n", - " - [Task Completion](#toc6_3_4_) \n", + " - [AI Agent Evaluation Metrics](#toc6_3_4_) \n", "- [In summary](#toc10_) \n", "- [Next steps](#toc11_) \n", "\n" @@ -203,11 +203,11 @@ "import validmind as vm\n", "\n", "vm.init(\n", - " api_host=\"...\",\n", - " api_key=\"...\",\n", - " api_secret=\"...\",\n", - " model=\"...\",\n", - ")" + " api_host=\"https://api.prod.validmind.ai/api/v1/tracking\",\n", + " api_key=\"60356f9120477ffca344e945be326ee7\",\n", + " api_secret=\"3d21c73e14c12266a4addf0d9673cec81bc4f0d23b329f6c0e3b599d7bf5052a\",\n", + " model=\"clul6y51o02ct1ojrog2d4dus\",\n", + ")\n" ] }, { @@ -254,42 +254,57 @@ "outputs": [], "source": [ "simple_test_cases = [\n", - "LLMTestCase(\n", - " input=\"What is machine learning?\",\n", - " actual_output=\"\"\"Machine learning is a subset of artificial intelligence (AI) that enables \n", + " LLMTestCase(\n", + " input=\"What is machine learning?\",\n", + " actual_output=\"\"\"Machine learning is a subset of artificial intelligence (AI) that enables \n", " computers to learn and make decisions from data without being explicitly programmed for every task. \n", " It uses algorithms to find patterns in data and make predictions or decisions based on those patterns.\"\"\",\n", - " expected_output=\"\"\"Machine learning is a method of data analysis that automates analytical \n", + " expected_output=\"\"\"Machine learning is a method of data analysis that automates analytical \n", " model building. 
It uses algorithms that iteratively learn from data, allowing computers to find \n", " hidden insights without being explicitly programmed where to look.\"\"\",\n", - " context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n", - " retrieval_context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n", - " tools_called=[\n", - " ToolCall(\n", - " name=\"search_docs\",\n", - " args={\"query\": \"machine learning definition\"},\n", - " response=\"Found definition of machine learning in documentation.\"\n", - " )\n", - " ]\n", - "),\n", - "LLMTestCase(\n", - " input=\"What is deep learning?\",\n", - " actual_output=\"\"\"Bananas are yellow fruits that grow on trees in tropical climates. \n", + " context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n", + " retrieval_context=[\"Machine learning is a branch of AI that focuses on algorithms that can learn from data.\"],\n", + " tools_called=[\n", + " ToolCall(\n", + " name=\"search_docs\",\n", + " input={\"query\": \"machine learning definition\"},\n", + " response=\"Found definition of machine learning in documentation.\"\n", + " )\n", + " ],\n", + " expected_tools=[\n", + " ToolCall(\n", + " name=\"search_docs\",\n", + " input={\"query\": \"machine learning definition\"},\n", + " response=\"Found definition of machine learning in documentation.\"\n", + " )\n", + " ]\n", + " ),\n", + " LLMTestCase(\n", + " input=\"What is deep learning?\",\n", + " actual_output=\"\"\"Bananas are yellow fruits that grow on trees in tropical climates. \n", " They are rich in potassium and make a great healthy snack. You can also use them \n", " in smoothies and baking.\"\"\",\n", - " expected_output=\"\"\"Deep learning is an advanced machine learning technique that uses neural networks\n", + " expected_output=\"\"\"Deep learning is an advanced machine learning technique that uses neural networks\n", " with many layers to automatically learn representations of data with multiple levels of abstraction.\n", " It has enabled major breakthroughs in AI applications.\"\"\",\n", - " context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n", - " retrieval_context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n", - " tools_called=[\n", - " ToolCall(\n", - " name=\"search_docs\", \n", - " args={\"query\": \"deep learning definition\"},\n", - " response=\"Found definition of deep learning in documentation.\"\n", - " )\n", - " ]\n", - ")]\n" + " context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n", + " retrieval_context=[\"Deep learning is a specialized machine learning approach that uses deep neural networks to learn from data.\"],\n", + " tools_called=[\n", + " ToolCall(\n", + " name=\"search_docs\", \n", + " args={\"query\": \"deep learning definition\"},\n", + " response=\"Found definition of deep learning in documentation.\"\n", + " )\n", + " ],\n", + " expected_tools=[\n", + " ToolCall(\n", + " name=\"summarize_docs\", \n", + " args={\"query\": \"deep learning definition\"},\n", + " response=\"Generated summary of deep learning from documentation.\"\n", + " )\n", + " ]\n", + " )\n", + "]\n" ] }, { @@ -313,7 +328,7 @@ " input_id=\"simple_qa_dataset\"\n", ")\n", "\n", - "\n", + "print(simple_dataset)\n", "# Display the 
dataset\n", "pd.set_option('display.max_colwidth', 40)\n", "pd.set_option('display.width', 120)\n", @@ -322,6 +337,15 @@ "display(simple_dataset.df)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "simple_dataset._df.columns" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -710,6 +734,15 @@ "display(agent_dataset.df[['input', 'actual_output', 'tools_called']].head())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent_dataset.df.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -793,8 +826,139 @@ "source": [ "\n", "\n", - "#### Task Completion\n", - "The Task Completion metric evaluates whether the model's output successfully accomplishes the intended task or goal specified in the input prompt. It assesses if the model has properly understood the task requirements and provided a complete and appropriate response. A high task completion score indicates that the model has effectively addressed the core objective of the prompt and delivered a satisfactory solution.\n" + "### AI Agent Evaluation Metrics\n", + "\n", + "AI agent evaluation metrics are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the **full execution trace**—including reasoning steps, tool calls, intermediate decisions, and outcomes—rather than just single input–output pairs.\n", + "\n", + "These metrics are essential because agent failures often occur in ways traditional LLM metrics miss (e.g., choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently).\n", + "\n", + "**DeepEval’s AI agent evaluation framework** breaks evaluation into three layers with corresponding metric categories:\n", + "\n", + "1. **Reasoning Layer** – Evaluates planning and strategy generation:\n", + "\n", + " * *PlanQualityMetric* – how logical, complete, and efficient the agent’s plan is\n", + " * *PlanAdherenceMetric* – whether the agent follows its own plan during execution \n", + "\n", + "2. **Action Layer** – Assesses tool usage and argument generation:\n", + "\n", + " * *ToolCorrectnessMetric* – whether the agent selects and calls the right tools\n", + " * *ArgumentCorrectnessMetric* – whether the agent generates correct tool arguments\n", + "\n", + "3. **Execution Layer** – Measures end-to-end performance:\n", + "\n", + " * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n", + " * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n", + "\n", + "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### **Reasoning Layer**\n", + "#### PlanQualityMetric\n", + "Let's measures how well the agent generates a plan before acting. A high score means the plan is logical, complete, and efficient." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.PlanQuality\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"actual_output\",\n",
+    "    tools_called_column = \"tools_called\",\n",
+    ")\n",
+    "agent_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### PlanAdherenceMetric\n",
+    "Let's check whether the agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.PlanAdherence\",\n",
+    "    input_column = \"input\",\n",
+    "    # actual_output_column = \"actual_output\",\n",
+    "    agent_output_column = \"actual_output\",\n",
+    "    tools_called_column = \"tools_called\",\n",
+    ")\n",
+    "agent_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### **Action Layer**\n",
+    "#### ToolCorrectnessMetric\n",
+    "Let's evaluate whether the agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.ToolCorrectness\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"actual_output\",\n",
+    "    agent_output_column = \"actual_output\",\n",
+    "    tools_called_column = \"tools_called\",\n",
+    "    expected_tools_column = \"expected_tools\",\n",
+    ")\n",
+    "agent_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### ArgumentCorrectnessMetric\n",
+    "Let's assess whether the agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.ArgumentCorrectness\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"actual_output\",\n",
+    "    tools_called_column = \"tools_called\",\n",
+    ")\n",
+    "agent_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### **Execution Layer**\n",
+    "#### TaskCompletionMetric\n",
+    "Let's measure whether the agent successfully completes the overall task. This is the ultimate indicator of success."
   ]
  },
  {
@@ -814,6 +978,31 @@
    "agent_dataset._df.head()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "#### StepEfficiencyMetric\n",
+    "Let's look at how efficiently the agent executes the task, penalizing unnecessary or redundant steps. High efficiency indicates optimized behavior."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# simple_dataset.assign_scores(\n", + "# metrics = \"validmind.scorer.llm.deepeval.StepEfficiency\",\n", + "# input_column = \"input\",\n", + "# actual_output_column = \"actual_output\",\n", + "# agent_output_column = \"actual_output\",\n", + "# tools_called_column = \"tools_called\",\n", + "# )\n", + "# simple_dataset._df.head()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -875,9 +1064,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ValidMind Library", + "display_name": "validmind-1QuffXMV-py3.11", "language": "python", - "name": "validmind" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/poetry.lock b/poetry.lock index 325d47776..90b11a58b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiodns" @@ -194,33 +194,6 @@ files = [ {file = "ansicolors-1.1.8.zip", hash = "sha256:99f94f5e3348a0bcd43c82e5fc4414013ccc19d70bd939ad71e0133ce9c372e0"}, ] -[[package]] -name = "anthropic" -version = "0.64.0" -description = "The official Python library for the anthropic API" -optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "anthropic-0.64.0-py3-none-any.whl", hash = "sha256:6f5f7d913a6a95eb7f8e1bda4e75f76670e8acd8d4cd965e02e2a256b0429dd1"}, - {file = "anthropic-0.64.0.tar.gz", hash = "sha256:3d496c91a63dff64f451b3e8e4b238a9640bf87b0c11d0b74ddc372ba5a3fe58"}, -] - -[package.dependencies] -anyio = ">=3.5.0,<5" -distro = ">=1.7.0,<2" -httpx = ">=0.25.0,<1" -jiter = ">=0.4.0,<1" -pydantic = ">=1.9.0,<3" -sniffio = "*" -typing-extensions = ">=4.10,<5" - -[package.extras] -aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"] -bedrock = ["boto3 (>=1.28.57)", "botocore (>=1.31.57)"] -vertex = ["google-auth[requests] (>=2,<3)"] - [[package]] name = "anyio" version = "4.10.0" @@ -706,10 +679,6 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -722,14 +691,8 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -740,24 +703,8 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, - {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, - {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -767,10 +714,6 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, 
{file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -782,10 +725,6 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -798,10 +737,6 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -814,10 +749,6 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -864,19 +795,6 @@ files = [ [package.dependencies] cffi = ">=1.0.0" -[[package]] -name = "cachetools" -version = "5.5.2" -description = "Extensible memoizing collections and decorators" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, - {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, -] - [[package]] name = "catalogue" version = "2.0.10" @@ -1714,37 +1632,38 @@ files = [ [[package]] name = "deepeval" -version = "3.4.0" +version = "3.7.5" description = "The LLM Evaluation Framework" optional = true python-versions = "<4.0,>=3.9" groups = ["main"] markers = "extra == \"llm\"" files = [ - {file = "deepeval-3.4.0-py3-none-any.whl", hash = "sha256:ae95fd290f47861e004e5174c995dd0902def477d537cb8c80eff4bd9b93b9bd"}, - {file = "deepeval-3.4.0.tar.gz", hash = "sha256:c21af882f078b220e28a4455e3363abc1f57a45a04fcf380e1b3a2d4a526f5ef"}, + {file = "deepeval-3.7.5-py3-none-any.whl", hash = "sha256:13717c085c36487040e2d3e170ed3c327aaad0f9f26a6b78a482ca3b9f0d0522"}, + {file = "deepeval-3.7.5.tar.gz", hash = "sha256:96485d05f36cd5dd0a0220ca17ce834b407b2f3adf367ed3c73b540880b87d22"}, ] [package.dependencies] aiohttp = "*" -anthropic = "*" click = ">=8.0.0,<8.3.0" -google-genai = ">=1.9.0,<2.0.0" grpcio = ">=1.67.1,<2.0.0" +jinja2 = "*" nest_asyncio = "*" -ollama = "*" openai = "*" opentelemetry-api = ">=1.24.0,<2.0.0" opentelemetry-exporter-otlp-proto-grpc = ">=1.24.0,<2.0.0" opentelemetry-sdk = ">=1.24.0,<2.0.0" portalocker = "*" -posthog = ">=6.3.0,<7.0.0" +posthog = ">=5.4.0,<6.0.0" +pydantic = ">=2.11.7,<3.0.0" +pydantic-settings = ">=2.10.1,<3.0.0" pyfiglet = "*" pytest = "*" pytest-asyncio = "*" pytest-repeat = "*" -pytest-rerunfailures = ">=12.0,<13.0" +pytest-rerunfailures = "*" pytest-xdist = "*" +python-dotenv = ">=1.1.1,<2.0.0" requests = ">=2.31.0,<3.0.0" rich = ">=13.6.0,<15.0.0" sentry-sdk = "*" @@ -2229,60 +2148,6 @@ smb = ["smbprotocol"] ssh = ["paramiko"] tqdm = ["tqdm"] -[[package]] -name = "google-auth" -version = "2.40.3" -description = "Google Authentication Library" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca"}, - {file = "google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77"}, -] - -[package.dependencies] -cachetools = 
">=2.0.0,<6.0" -pyasn1-modules = ">=0.2.1" -rsa = ">=3.1.4,<5" - -[package.extras] -aiohttp = ["aiohttp (>=3.6.2,<4.0.0)", "requests (>=2.20.0,<3.0.0)"] -enterprise-cert = ["cryptography", "pyopenssl"] -pyjwt = ["cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "pyjwt (>=2.0)"] -pyopenssl = ["cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] -reauth = ["pyu2f (>=0.1.5)"] -requests = ["requests (>=2.20.0,<3.0.0)"] -testing = ["aiohttp (<3.10.0)", "aiohttp (>=3.6.2,<4.0.0)", "aioresponses", "cryptography (<39.0.0) ; python_version < \"3.8\"", "cryptography (>=38.0.3)", "flask", "freezegun", "grpcio", "mock", "oauth2client", "packaging", "pyjwt (>=2.0)", "pyopenssl (<24.3.0)", "pyopenssl (>=20.0.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-localserver", "pyu2f (>=0.1.5)", "requests (>=2.20.0,<3.0.0)", "responses", "urllib3"] -urllib3 = ["packaging", "urllib3"] - -[[package]] -name = "google-genai" -version = "1.31.0" -description = "GenAI Python SDK" -optional = true -python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "google_genai-1.31.0-py3-none-any.whl", hash = "sha256:5c6959bcf862714e8ed0922db3aaf41885bacf6318751b3421bf1e459f78892f"}, - {file = "google_genai-1.31.0.tar.gz", hash = "sha256:8572b47aa684357c3e5e10d290ec772c65414114939e3ad2955203e27cd2fcbc"}, -] - -[package.dependencies] -anyio = ">=4.8.0,<5.0.0" -google-auth = ">=2.14.1,<3.0.0" -httpx = ">=0.28.1,<1.0.0" -pydantic = ">=2.0.0,<3.0.0" -requests = ">=2.28.1,<3.0.0" -tenacity = ">=8.2.3,<9.2.0" -typing-extensions = ">=4.11.0,<5.0.0" -websockets = ">=13.0.0,<15.1.0" - -[package.extras] -aiohttp = ["aiohttp (<4.0.0)"] - [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -5285,23 +5150,6 @@ files = [ {file = "nvidia_nvtx_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:619c8304aedc69f02ea82dd244541a83c3d9d40993381b3b590f1adaed3db41e"}, ] -[[package]] -name = "ollama" -version = "0.5.3" -description = "The official Python client for Ollama." -optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "ollama-0.5.3-py3-none-any.whl", hash = "sha256:a8303b413d99a9043dbf77ebf11ced672396b59bec27e6d5db67c88f01b279d2"}, - {file = "ollama-0.5.3.tar.gz", hash = "sha256:40b6dff729df3b24e56d4042fd9d37e231cee8e528677e0d085413a1d6692394"}, -] - -[package.dependencies] -httpx = ">=0.27" -pydantic = ">=2.9" - [[package]] name = "openai" version = "1.101.0" @@ -6043,15 +5891,15 @@ tests = ["coverage-conditional-plugin (>=0.9.0)", "portalocker[redis]", "pytest [[package]] name = "posthog" -version = "6.6.1" +version = "5.4.0" description = "Integrate PostHog into any python application." 
optional = true python-versions = ">=3.9" groups = ["main"] markers = "extra == \"llm\"" files = [ - {file = "posthog-6.6.1-py3-none-any.whl", hash = "sha256:cba48af9af1df2a611d08fd10a2014dbee99433118973b8c51881d9ef1aa6667"}, - {file = "posthog-6.6.1.tar.gz", hash = "sha256:87dfc67d48a50eed737b77d6dd306c340f0da2f32101533e8e17b2f22ad572e0"}, + {file = "posthog-5.4.0-py3-none-any.whl", hash = "sha256:284dfa302f64353484420b52d4ad81ff5c2c2d1d607c4e2db602ac72761831bd"}, + {file = "posthog-5.4.0.tar.gz", hash = "sha256:701669261b8d07cdde0276e5bc096b87f9e200e3b9589c5ebff14df658c5893c"}, ] [package.dependencies] @@ -6060,7 +5908,6 @@ distro = ">=1.5.0" python-dateutil = ">=2.2" requests = ">=2.7,<3.0" six = ">=1.5" -typing-extensions = ">=4.2.0" [package.extras] dev = ["django-stubs", "lxml", "mypy", "mypy-baseline", "packaging", "pre-commit", "pydantic", "ruff", "setuptools", "tomli", "tomli_w", "twine", "types-mock", "types-python-dateutil", "types-requests", "types-setuptools", "types-six", "wheel"] @@ -6530,35 +6377,6 @@ files = [ {file = "pyarrow_hotfix-0.7.tar.gz", hash = "sha256:59399cd58bdd978b2e42816a4183a55c6472d4e33d183351b6069f11ed42661d"}, ] -[[package]] -name = "pyasn1" -version = "0.6.1" -description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" -optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, - {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.2" -description = "A collection of ASN.1-based protocols modules" -optional = true -python-versions = ">=3.8" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a"}, - {file = "pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6"}, -] - -[package.dependencies] -pyasn1 = ">=0.6.1,<0.7.0" - [[package]] name = "pycares" version = "4.10.0" @@ -7851,22 +7669,6 @@ files = [ {file = "rpds_py-0.27.0.tar.gz", hash = "sha256:8b23cf252f180cda89220b378d917180f29d313cd6a07b2431c0d3b776aae86f"}, ] -[[package]] -name = "rsa" -version = "4.9.1" -description = "Pure-Python RSA implementation" -optional = true -python-versions = "<4,>=3.6" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, - {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, -] - -[package.dependencies] -pyasn1 = ">=0.1.3" - [[package]] name = "safetensors" version = "0.6.2" @@ -9906,86 +9708,6 @@ docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] -[[package]] -name = "websockets" -version = "15.0.1" -description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" -optional = true -python-versions = ">=3.9" -groups = ["main"] -markers = "extra == \"llm\"" -files = [ - {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b"}, - {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", 
hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205"}, - {file = "websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a"}, - {file = "websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e"}, - {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf"}, - {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb"}, - {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d"}, - {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9"}, - {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c"}, - {file = "websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256"}, - {file = "websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41"}, - {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431"}, - {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57"}, - {file = "websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905"}, - {file = "websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562"}, - {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792"}, - {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413"}, - {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8"}, - {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3"}, - {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf"}, - {file = "websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85"}, - {file = "websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065"}, - {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3"}, - {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665"}, - {file = "websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2"}, - {file = "websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215"}, - {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5"}, - {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65"}, - {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe"}, - {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4"}, - {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597"}, - {file = "websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9"}, - {file = "websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7"}, - {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931"}, - {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675"}, - {file = "websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151"}, - {file = "websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22"}, - {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f"}, - {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8"}, - {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375"}, - {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d"}, - {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4"}, - {file = "websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa"}, - {file = "websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561"}, - {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5f4c04ead5aed67c8a1a20491d54cdfba5884507a48dd798ecaf13c74c4489f5"}, - {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:abdc0c6c8c648b4805c5eacd131910d2a7f6455dfd3becab248ef108e89ab16a"}, - {file = "websockets-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a625e06551975f4b7ea7102bc43895b90742746797e2e14b70ed61c43a90f09b"}, - {file = "websockets-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d591f8de75824cbb7acad4e05d2d710484f15f29d4a915092675ad3456f11770"}, - {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47819cea040f31d670cc8d324bb6435c6f133b8c7a19ec3d61634e62f8d8f9eb"}, - {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac017dd64572e5c3bd01939121e4d16cf30e5d7e110a119399cf3133b63ad054"}, - {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4a9fac8e469d04ce6c25bb2610dc535235bd4aa14996b4e6dbebf5e007eba5ee"}, - {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363c6f671b761efcb30608d24925a382497c12c506b51661883c3e22337265ed"}, - {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2034693ad3097d5355bfdacfffcbd3ef5694f9718ab7f29c29689a9eae841880"}, - {file = "websockets-15.0.1-cp39-cp39-win32.whl", hash = "sha256:3b1ac0d3e594bf121308112697cf4b32be538fb1444468fb0a6ae4feebc83411"}, - {file = "websockets-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7643a03db5c95c799b89b31c036d5f27eeb4d259c798e878d6937d71832b1e4"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04"}, - {file = "websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122"}, - {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7f493881579c90fc262d9cdbaa05a6b54b3811c2f300766748db79f098db9940"}, - {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:47b099e1f4fbc95b701b6e85768e1fcdaf1630f3cbe4765fa216596f12310e2e"}, - {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67f2b6de947f8c757db2db9c71527933ad0019737ec374a8a6be9a956786aaf9"}, - {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d08eb4c2b7d6c41da6ca0600c077e93f5adcfd979cd777d747e9ee624556da4b"}, - {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b826973a4a2ae47ba357e4e82fa44a463b8f168e1ca775ac64521442b19e87f"}, - {file = 
"websockets-15.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:21c1fa28a6a7e3cbdc171c694398b6df4744613ce9b36b1a498e816787e28123"}, - {file = "websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f"}, - {file = "websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee"}, -] - [[package]] name = "wheel" version = "0.45.1" @@ -10558,4 +10280,4 @@ xgboost = ["xgboost"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "025dbfea51cfe322f2f1bfce521c72eb20b8106e5e77221d480a7af9bcaf9f76" +content-hash = "bf37b4b7c44c8878d8e2daf4c13de0207648570de9dccb48e0cc121b0be92a5e" diff --git a/pyproject.toml b/pyproject.toml index 57ea73feb..38febe00d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ llm = [ "ragas (>=0.2.3,<=0.2.7)", "sentencepiece (>=0.2.0,<0.3.0)", "langchain-openai (>=0.1.8)", - "deepeval (>3.3.9)", + "deepeval (>=3.7.0)", ] nlp = [ "langdetect", diff --git a/validmind/datasets/llm/agent_dataset.py b/validmind/datasets/llm/agent_dataset.py index ace2b5066..e196808be 100644 --- a/validmind/datasets/llm/agent_dataset.py +++ b/validmind/datasets/llm/agent_dataset.py @@ -103,6 +103,18 @@ def __init__( # Convert to pandas DataFrame for VMDataset compatibility df = self._convert_to_dataframe() + # Build extra_columns only for columns that exist in the DataFrame + possible_extra_columns = [ + "actual_output", + "context", + "retrieval_context", + "tools_called", + "expected_tools", + ] + extra_columns = { + col: col for col in possible_extra_columns if col in df.columns + } + # Initialize VMDataset with the converted data super().__init__( raw_dataset=df.values, @@ -110,13 +122,7 @@ def __init__( columns=df.columns.tolist(), text_column="input", # The input text for LLM target_column="expected_output", # Expected response - extra_columns={ - "actual_output": "actual_output", - "context": "context", - "retrieval_context": "retrieval_context", - "tools_called": "tools_called", - "expected_tools": "expected_tools", - }, + extra_columns=extra_columns if extra_columns else None, **kwargs, ) diff --git a/validmind/scorer/llm/deepeval/ArgumentCorrectness.py b/validmind/scorer/llm/deepeval/ArgumentCorrectness.py new file mode 100644 index 000000000..c55105604 --- /dev/null +++ b/validmind/scorer/llm/deepeval/ArgumentCorrectness.py @@ -0,0 +1,118 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +from typing import Any, Dict, List + +from validmind import tags, tasks +from validmind.ai.utils import get_client_and_model +from validmind.errors import MissingDependencyError +from validmind.tests.decorator import scorer +from validmind.vm_models.dataset import VMDataset + +try: + from deepeval import evaluate + from deepeval.metrics import ArgumentCorrectnessMetric + from deepeval.test_case import LLMTestCase +except ImportError as e: + if "deepeval" in str(e): + raise MissingDependencyError( + "Missing required package `deepeval` for ArgumentCorrectness. 
" + "Please run `pip install validmind[llm]` to use LLM tests", + required_dependencies=["deepeval"], + extra="llm", + ) from e + + raise e + + +@scorer() +@tags("llm", "ArgumentCorrectness", "deepeval", "agent_evaluation", "action_layer") +@tasks("llm") +def ArgumentCorrectness( + dataset: VMDataset, + threshold: float = 0.7, + input_column: str = "input", + tools_called_column: str = "tools_called", + agent_output_column: str = "agent_output", + actual_output_column: str = "actual_output", + strict_mode: bool = False, +) -> List[Dict[str, Any]]: + """Evaluates agent argument correctness using deepeval's ArgumentCorrectnessMetric. + + This metric evaluates whether your agent generates correct arguments for each tool + call. Selecting the right tool with wrong arguments is as problematic as selecting + the wrong tool entirely. + + Unlike ToolCorrectnessMetric, this metric is fully LLM-based and referenceless—it + evaluates argument correctness based on the input context rather than comparing + against expected values. + + Args: + dataset: Dataset containing the agent input and tool calls + threshold: Minimum passing threshold (default: 0.7) + input_column: Column name for the task input (default: "input") + tools_called_column: Column name for tools called (default: "tools_called") + agent_output_column: Column name for agent output containing tool calls (default: "agent_output") + strict_mode: If True, enforces a binary score (0 or 1) + + Returns: + List[Dict[str, Any]] with keys "score" and "reason" for each row. + + Raises: + ValueError: If required columns are missing + """ + # Validate required columns exist in dataset + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. " + f"Available columns: {dataset._df.columns.tolist()}" + ) + + _, model = get_client_and_model() + + metric = ArgumentCorrectnessMetric( + threshold=threshold, + model=model, + include_reason=True, + strict_mode=strict_mode, + verbose_mode=False, + ) + + # Import helper functions to avoid circular import + from validmind.scorer.llm.deepeval import ( + _convert_to_tool_call_list, + extract_tool_calls_from_agent_output, + ) + + results: List[Dict[str, Any]] = [] + for _, row in dataset._df.iterrows(): + input_value = row[input_column] + + # Extract tools called + if tools_called_column in dataset._df.columns: + tools_called_value = row.get(tools_called_column, []) + else: + agent_output = row.get(agent_output_column, {}) + tools_called_value = extract_tool_calls_from_agent_output(agent_output) + tools_called_list = _convert_to_tool_call_list(tools_called_value) + + actual_output_value = row.get(actual_output_column, "") + + test_case = LLMTestCase( + input=input_value, + tools_called=tools_called_list, + actual_output=actual_output_value, + ) + + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + + return results diff --git a/validmind/scorer/llm/deepeval/PlanAdherence.py b/validmind/scorer/llm/deepeval/PlanAdherence.py new file mode 100644 index 000000000..26b3c327a --- /dev/null +++ b/validmind/scorer/llm/deepeval/PlanAdherence.py @@ -0,0 +1,150 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. 
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from typing import Any, Dict, List
+
+from validmind import tags, tasks
+from validmind.ai.utils import get_client_and_model
+from validmind.errors import MissingDependencyError
+from validmind.tests.decorator import scorer
+from validmind.vm_models.dataset import VMDataset
+
+try:
+    from deepeval import evaluate
+    from deepeval.metrics import PlanAdherenceMetric
+    from deepeval.test_case import LLMTestCase
+except ImportError as e:
+    if "deepeval" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `deepeval` for PlanAdherence. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["deepeval"],
+            extra="llm",
+        ) from e
+
+    raise e
+
+
+# def _extract_plan_value(
+#     row: Any,
+#     has_plan_column: bool,
+#     has_agent_output: bool,
+#     plan_column: str,
+#     agent_output_column: str,
+# ) -> str:
+#     """Extract plan value from row."""
+#     plan_value: Optional[str] = None
+#     if has_plan_column:
+#         plan_value = row.get(plan_column)
+#     elif has_agent_output:
+#         agent_output = row.get(agent_output_column, {})
+#         if isinstance(agent_output, dict):
+#             plan_value = agent_output.get("plan") or agent_output.get("reasoning")
+#     return plan_value or ""
+
+
+# def _extract_execution_steps_value(
+#     row: Any,
+#     has_execution_steps_column: bool,
+#     has_agent_output: bool,
+#     execution_steps_column: str,
+#     agent_output_column: str,
+# ) -> str:
+#     """Extract execution steps value from row."""
+#     execution_steps_value: Optional[str] = None
+#     if has_execution_steps_column:
+#         execution_steps_value = row.get(execution_steps_column)
+#     elif has_agent_output:
+#         agent_output = row.get(agent_output_column, {})
+#         if isinstance(agent_output, dict):
+#             execution_steps_value = agent_output.get("execution_steps") or agent_output.get("steps")
+#     return execution_steps_value or ""
+
+
+@scorer()
+@tags("llm", "PlanAdherence", "deepeval", "agent_evaluation", "reasoning_layer")
+@tasks("llm")
+def PlanAdherence(
+    dataset: VMDataset,
+    threshold: float = 0.7,
+    input_column: str = "input",
+    tools_called_column: str = "tools_called",
+    actual_output_column: str = "actual_output",
+    expected_output_column: str = "expected_output",
+    strict_mode: bool = False,
+) -> List[Dict[str, Any]]:
+    """Evaluates agent plan adherence using deepeval's PlanAdherenceMetric.
+
+    This metric evaluates whether your agent follows its own plan during execution.
+    Creating a good plan is only half the battle—an agent that deviates from its
+    strategy mid-execution undermines its own reasoning.
+
+    Args:
+        dataset: Dataset containing the agent input, outputs, and tool calls
+        threshold: Minimum passing threshold (default: 0.7)
+        input_column: Column name for the task input (default: "input")
+        tools_called_column: Column name for tools called (default: "tools_called")
+        actual_output_column: Column name for the agent's actual output (default: "actual_output")
+        expected_output_column: Column name for the expected output (default: "expected_output")
+        strict_mode: If True, enforces a binary score (0 or 1)
+
+    Returns:
+        List[Dict[str, Any]] with keys "score" and "reason" for each row.
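+
+    Example:
+        Illustrative sketch only; ``agent_dataset`` is assumed to be an agent
+        evaluation dataset like the one in the demo notebook in this patch, and
+        the column names depend on how that dataset was built::
+
+            agent_dataset.assign_scores(
+                metrics="validmind.scorer.llm.deepeval.PlanAdherence",
+                input_column="input",
+                actual_output_column="actual_output",
+                expected_output_column="expected_output",
+                tools_called_column="tools_called",
+            )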
+ + Raises: + ValueError: If required columns are missing + """ + # Validate required columns exist in dataset + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + + if tools_called_column not in dataset._df.columns: + missing_columns.append(tools_called_column) + + if actual_output_column not in dataset._df.columns: + missing_columns.append(actual_output_column) + + if expected_output_column not in dataset._df.columns: + missing_columns.append(expected_output_column) + + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. " + f"Available columns: {dataset._df.columns.tolist()}" + ) + + _, model = get_client_and_model() + + metric = PlanAdherenceMetric( + threshold=threshold, + model=model, + include_reason=True, + strict_mode=strict_mode, + verbose_mode=False, + ) + + results: List[Dict[str, Any]] = [] + for _, row in dataset._df.iterrows(): + input_value = row[input_column] + actual_output_value = row.get(actual_output_column, "") + expected_output_value = row.get(expected_output_column, "") + + tools_called_value = row.get(tools_called_column, []) + + test_case = LLMTestCase( + input=input_value, + actual_output=actual_output_value, + expected_output=expected_output_value, + tools_called=tools_called_value, + ) + + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + + return results diff --git a/validmind/scorer/llm/deepeval/PlanQuality.py b/validmind/scorer/llm/deepeval/PlanQuality.py new file mode 100644 index 000000000..5bbd1e83b --- /dev/null +++ b/validmind/scorer/llm/deepeval/PlanQuality.py @@ -0,0 +1,105 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +from typing import Any, Dict, List + +from validmind import tags, tasks +from validmind.ai.utils import get_client_and_model +from validmind.errors import MissingDependencyError +from validmind.tests.decorator import scorer +from validmind.vm_models.dataset import VMDataset + +try: + from deepeval import evaluate + from deepeval.metrics import PlanQualityMetric + from deepeval.test_case import LLMTestCase +except ImportError as e: + if "deepeval" in str(e): + raise MissingDependencyError( + "Missing required package `deepeval` for PlanQuality. " + "Please run `pip install validmind[llm]` to use LLM tests", + required_dependencies=["deepeval"], + extra="llm", + ) from e + + raise e + + +@scorer() +@tags("llm", "deepeval", "agent_evaluation", "reasoning_layer") +@tasks("llm") +def PlanQuality( + dataset: VMDataset, + threshold: float = 0.7, + input_column: str = "input", + actual_output_column: str = "actual_output", + agent_output_column: str = "agent_output", + tools_called_column: str = "tools_called", + strict_mode: bool = False, +) -> List[Dict[str, Any]]: + """Evaluates agent plan quality using deepeval's PlanQualityMetric. + + This metric evaluates whether the plan your agent generates is logical, complete, + and efficient for accomplishing the given task. It extracts the task and plan from + your agent's trace and uses an LLM judge to assess plan quality. 
+ + Args: + dataset: Dataset containing the agent input and plan + threshold: Minimum passing threshold (default: 0.7) + input_column: Column name for the task input (default: "input") + agent_output_column: Column name for agent output containing plan in trace (default: "agent_output") + tools_called_column: Column name for tools called by the agent (default: "tools_called") + strict_mode: If True, enforces a binary score (0 or 1) + + Returns: + List[Dict[str, Any]] with keys "score" and "reason" for each row. + + Raises: + ValueError: If required columns are missing + """ + # Validate required columns exist in dataset + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + + if tools_called_column not in dataset._df.columns: + missing_columns.append(tools_called_column) + + if actual_output_column not in dataset._df.columns: + missing_columns.append(actual_output_column) + + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. " + f"Available columns: {dataset._df.columns.tolist()}" + ) + + _, model = get_client_and_model() + + metric = PlanQualityMetric( + threshold=threshold, + model=model, + include_reason=True, + strict_mode=strict_mode, + verbose_mode=False, + ) + + results: List[Dict[str, Any]] = [] + for _, row in dataset._df.iterrows(): + input_value = row[input_column] + actual_output_value = row.get(actual_output_column, "") + tools_called_value = row.get(tools_called_column, []) + test_case = LLMTestCase( + input=input_value, + actual_output=actual_output_value, + tools_called=tools_called_value, + ) + + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + + return results diff --git a/validmind/scorer/llm/deepeval/ToolCorrectness.py b/validmind/scorer/llm/deepeval/ToolCorrectness.py new file mode 100644 index 000000000..9b89f905b --- /dev/null +++ b/validmind/scorer/llm/deepeval/ToolCorrectness.py @@ -0,0 +1,130 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +from typing import Any, Dict, List + +from validmind import tags, tasks +from validmind.ai.utils import get_client_and_model +from validmind.errors import MissingDependencyError +from validmind.tests.decorator import scorer +from validmind.vm_models.dataset import VMDataset + +try: + from deepeval import evaluate + from deepeval.metrics import ToolCorrectnessMetric + from deepeval.test_case import LLMTestCase +except ImportError as e: + if "deepeval" in str(e): + raise MissingDependencyError( + "Missing required package `deepeval` for ToolCorrectness. 
" + "Please run `pip install validmind[llm]` to use LLM tests", + required_dependencies=["deepeval"], + extra="llm", + ) from e + + raise e + + +@scorer() +@tags("llm", "ToolCorrectness", "deepeval", "agent_evaluation", "action_layer") +@tasks("llm") +def ToolCorrectness( + dataset: VMDataset, + threshold: float = 0.7, + input_column: str = "input", + expected_tools_column: str = "expected_tools", + tools_called_column: str = "tools_called", + agent_output_column: str = "agent_output", + actual_output_column: str = "actual_output", +) -> List[Dict[str, Any]]: + """Evaluate tool-use correctness for LLM agents using deepeval's ToolCorrectnessMetric. + + This metric assesses whether the agent called the expected tools in a task, and whether + argument and response information matches the ground truth expectations. + The metric compares the tools the agent actually called to the list of expected tools + on a per-row basis. + + Args: + dataset: VMDataset containing the agent input, expected tool calls, and actual tool calls. + threshold: Minimum passing threshold (default: 0.7). + input_column: Column containing the task input for evaluation. + expected_tools_column: Column specifying the expected tools (ToolCall/str/dict or list). + tools_called_column: Column holding the tools actually called by the agent. + If missing, will be populated by parsing agent_output_column. + agent_output_column: Column containing agent output with tool-calling trace (default: "agent_output"). + actual_output_column: Column specifying the ground-truth output string (optional). + + Returns: + List of dicts (one per row) containing: + - "score": Tool correctness score between 0 and 1. + - "reason": ToolCorrectnessMetric's reason or explanation. + + Raises: + ValueError: If required columns are missing from dataset. + + Example: + results = ToolCorrectness(dataset=my_data) + results[0]["score"] # 1.0 if tools called correctly, else <1.0 + + Risks & Limitations: + - Works best if dataset includes high-quality tool call signals & references. + - Comparison logic may be limited for atypically formatted tool call traces. + """ + # Validate required columns exist in dataset + missing_columns: List[str] = [] + if input_column not in dataset._df.columns: + missing_columns.append(input_column) + if expected_tools_column not in dataset._df.columns: + missing_columns.append(expected_tools_column) + + if missing_columns: + raise ValueError( + f"Required columns {missing_columns} not found in dataset. 
" + f"Available columns: {dataset._df.columns.tolist()}" + ) + + # Import helper functions to avoid circular import + from validmind.scorer.llm.deepeval import ( + _convert_to_tool_call_list, + extract_tool_calls_from_agent_output, + ) + + _, model = get_client_and_model() + + metric = ToolCorrectnessMetric( + threshold=threshold, + model=model, + ) + + results: List[Dict[str, Any]] = [] + for _, row in dataset._df.iterrows(): + input_value = row[input_column] + expected_tools_value = row.get(expected_tools_column, []) + + # Extract tools called + if tools_called_column in dataset._df.columns: + tools_called_value = row.get(tools_called_column, []) + else: + agent_output = row.get(agent_output_column, {}) + tools_called_value = extract_tool_calls_from_agent_output(agent_output) + + expected_tools_list = _convert_to_tool_call_list(expected_tools_value) + tools_called_list = _convert_to_tool_call_list(tools_called_value) + + actual_output_value = row.get(actual_output_column, "") + + test_case = LLMTestCase( + input=input_value, + expected_tools=expected_tools_list, + tools_called=tools_called_list, + actual_output=actual_output_value, + ) + + result = evaluate(test_cases=[test_case], metrics=[metric]) + metric_data = result.test_results[0].metrics_data[0] + score = metric_data.score + reason = getattr(metric_data, "reason", "No reason provided") + results.append({"score": score, "reason": reason}) + + return results diff --git a/validmind/scorer/llm/deepeval/__init__.py b/validmind/scorer/llm/deepeval/__init__.py index 0b0547949..e3fd32fd4 100644 --- a/validmind/scorer/llm/deepeval/__init__.py +++ b/validmind/scorer/llm/deepeval/__init__.py @@ -2,6 +2,163 @@ # See the LICENSE file in the root of this repository for details. # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial +from typing import Any, Dict, List + +try: + from deepeval.test_case import ToolCall +except ImportError: + ToolCall = None # type: ignore + from .AnswerRelevancy import AnswerRelevancy +from .ArgumentCorrectness import ArgumentCorrectness +from .PlanAdherence import PlanAdherence +from .PlanQuality import PlanQuality +from .StepEfficiency import StepEfficiency +from .ToolCorrectness import ToolCorrectness + +__all__ = [ + "AnswerRelevancy", + "ArgumentCorrectness", + "PlanAdherence", + "PlanQuality", + "StepEfficiency", + "ToolCorrectness", + "_extract_tool_responses", + "_extract_tool_calls_from_message", + "extract_tool_calls_from_agent_output", + "_convert_to_tool_call_list", +] + + +def _extract_tool_responses(messages: List[Any]) -> Dict[str, str]: + """Extract tool responses from the provided message list. + + Args: + messages: List of message objects or dictionaries. + + Returns: + Dictionary mapping tool_call_id to the tool's response content. + """ + tool_responses = {} + + for message in messages: + # Handle both object and dictionary formats + if isinstance(message, dict): + if ( + message.get("name") + and message.get("content") + and message.get("tool_call_id") + ): + tool_responses[message["tool_call_id"]] = message["content"] + else: + if hasattr(message, "name") and hasattr(message, "content"): + if hasattr(message, "tool_call_id"): + tool_responses[message.tool_call_id] = message.content + + return tool_responses + + +def _extract_tool_calls_from_message( + message: Any, tool_responses: Dict[str, str] +) -> List[ToolCall]: + """Extract tool calls from a single message. + + Args: + message: A message object or dict, possibly with tool_calls. 
+ tool_responses: Dict mapping tool_call_id to response content. + + Returns: + List of ToolCall objects for all tool calls in message. + """ + tool_calls = [] + + if isinstance(message, dict): + if message.get("tool_calls"): + for tool_call in message["tool_calls"]: + tool_name = tool_call.get("name") + tool_args = tool_call.get("args", {}) + tool_id = tool_call.get("id") + if tool_name and tool_id: + response = tool_responses.get(tool_id, "") + tool_call_obj = ToolCall( + name=tool_name, input_parameters=tool_args, output=response + ) + tool_calls.append(tool_call_obj) + else: + if hasattr(message, "tool_calls") and message.tool_calls: + for tool_call in message.tool_calls: + if isinstance(tool_call, dict): + tool_name = tool_call.get("name") + tool_args = tool_call.get("args", {}) + tool_id = tool_call.get("id") + else: + tool_name = getattr(tool_call, "name", None) + tool_args = getattr(tool_call, "args", {}) + tool_id = getattr(tool_call, "id", None) + if tool_name and tool_id: + response = tool_responses.get(tool_id, "") + tool_call_obj = ToolCall( + name=tool_name, input_parameters=tool_args, output=response + ) + tool_calls.append(tool_call_obj) + + return tool_calls + + +def extract_tool_calls_from_agent_output( + agent_output: Dict[str, Any] +) -> List[ToolCall]: + """Extract ToolCall objects from an agent's output. + + Args: + agent_output: The dictionary from the agent_output column, + expected to contain a "messages" key. + + Returns: + List of ToolCall objects with name, input_parameters, and output. + """ + tool_calls = [] + + if not isinstance(agent_output, dict) or "messages" not in agent_output: + return tool_calls + + messages = agent_output["messages"] + + # First pass: collect tool responses + tool_responses = _extract_tool_responses(messages) + + # Second pass: extract tool calls and match with responses + for message in messages: + message_tool_calls = _extract_tool_calls_from_message(message, tool_responses) + tool_calls.extend(message_tool_calls) + + return tool_calls + + +def _convert_to_tool_call_list(tools: Any) -> List[ToolCall]: + """Convert a list of tool dicts/ToolCall/str into a list of ToolCall objects. + + Args: + tools: List of tools in ToolCall, dict, or str format. + + Returns: + List of ToolCall objects. 
+ """ + if not isinstance(tools, list): + tools = [] -__all__ = ["AnswerRelevancy"] + tool_call_list = [] + for tool in tools: + if isinstance(tool, ToolCall): + tool_call_list.append(tool) + elif isinstance(tool, dict): + tool_call_list.append( + ToolCall( + name=tool.get("name", ""), + input_parameters=tool.get("input_parameters", {}), + output=tool.get("output", ""), + ) + ) + elif isinstance(tool, str): + tool_call_list.append(ToolCall(name=tool)) + return tool_call_list From f373cb0cad0968bad42a27edd53b32755a2ed642 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Wed, 17 Dec 2025 14:15:07 +0000 Subject: [PATCH 2/6] update notebook --- notebooks/code_sharing/deepeval_integration_demo.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/code_sharing/deepeval_integration_demo.ipynb b/notebooks/code_sharing/deepeval_integration_demo.ipynb index 8ce0a67ad..8b1329f5c 100644 --- a/notebooks/code_sharing/deepeval_integration_demo.ipynb +++ b/notebooks/code_sharing/deepeval_integration_demo.ipynb @@ -203,10 +203,10 @@ "import validmind as vm\n", "\n", "vm.init(\n", - " api_host=\"https://api.prod.validmind.ai/api/v1/tracking\",\n", - " api_key=\"60356f9120477ffca344e945be326ee7\",\n", - " api_secret=\"3d21c73e14c12266a4addf0d9673cec81bc4f0d23b329f6c0e3b599d7bf5052a\",\n", - " model=\"clul6y51o02ct1ojrog2d4dus\",\n", + " api_host=\"...\",\n", + " api_key=\"...\",\n", + " api_secret=\"...\",\n", + " model=\"...\",\n", ")\n" ] }, From b7499281bfa12b3e4c83aaee4271816b5d6888e9 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Mon, 5 Jan 2026 14:25:47 +0000 Subject: [PATCH 3/6] add agentic scorers in the demo notebook --- .../agents/banking_test_dataset.py | 10 +- .../langgraph_agent_simple_banking_demo.ipynb | 218 ++++++++++++++---- validmind/scorer/llm/deepeval/PlanQuality.py | 9 +- .../scorer/llm/deepeval/TaskCompletion.py | 1 + .../scorer/llm/deepeval/ToolCorrectness.py | 1 + validmind/scorer/llm/deepeval/__init__.py | 13 +- 6 files changed, 209 insertions(+), 43 deletions(-) diff --git a/notebooks/code_samples/agents/banking_test_dataset.py b/notebooks/code_samples/agents/banking_test_dataset.py index a058e6690..bd2793169 100644 --- a/notebooks/code_samples/agents/banking_test_dataset.py +++ b/notebooks/code_samples/agents/banking_test_dataset.py @@ -8,6 +8,7 @@ "input": "Analyze credit risk for a $50,000 personal loan application with $75,000 annual income, $1,200 monthly debt, and 720 credit score", "expected_tools": ["credit_risk_analyzer"], "possible_outputs": ["LOW RISK", "APPROVE", "risk score", "720", "probability of default", "2.5%"], + "expected_output": "LOW RISK", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "credit_risk" }, @@ -15,6 +16,7 @@ "input": "Evaluate credit risk for a business loan of $250,000 with monthly revenue of $85,000 and existing debt of $45,000 and credit score of 650", "expected_tools": ["credit_risk_analyzer"], "possible_outputs": ["MEDIUM RISK", "HIGH RISK", "business loan", "debt service coverage ratio", "1.8", "annual revenue", "$1,020,000", "risk score", "650"], + "expected_output": "MEDIUM RISK", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "credit_risk" }, @@ -24,6 +26,7 @@ # possible_outputs values relevant to account management # Matches what _handle_check_balance would return for customer 12345 ("John Smith"), whose checking_balance is 2547.89 in the mock DB. 
"possible_outputs": ["$2,547.89", "John Smith", "$2547.89"], + "expected_output": "$2,547.89", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "account_management" }, @@ -31,6 +34,7 @@ "input": "Analyze fraud risk for a $15,000 wire transfer from customer 67890 to Nigeria", "expected_tools": ["fraud_detection_system"], "possible_outputs": ["REQUIRE VERIFICATION", "fraud score", "65", "geographic risk", "block transaction", "MEDIUM RISK"], + "expected_output": "REQUIRE VERIFICATION", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "fraud_detection" }, @@ -38,6 +42,7 @@ "input": "Recommend banking products for customer 11111 with $150,000 in savings and 720 credit score", "expected_tools": ["customer_account_manager"], "possible_outputs": ["High-Yield Savings Account (2.5% APY)", "Personal Line of Credit up to $25,000"], + "expected_output": "High-Yield Savings Account (2.5% APY)", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "account_management" }, @@ -45,13 +50,15 @@ "input": "Investigate suspicious transactions totaling $75,000 across multiple accounts in the last week", "expected_tools": ["fraud_detection_system"], "possible_outputs": ["Require additional verification", "Implement 24-hour delay for verification"], + "expected_output": "Require additional verification", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "fraud_detection" }, { "input": "Assess credit risk for a $1,000,000 commercial real estate loan with $500,000 annual business income", "expected_tools": ["credit_risk_analyzer"], - "possible_outputs": ["HIGH RISK", "VERY HIGH RISK","loan-to-value", "66.7%", "debt service coverage", "2.0"], + "possible_outputs": ["HIGH RISK", "VERY HIGH RISK", "loan-to-value", "66.7%", "debt service coverage", "2.0"], + "expected_output": "HIGH RISK", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "credit_risk" }, @@ -59,6 +66,7 @@ "input": "Update customer contact information and address for account holder 22334", "expected_tools": ["customer_account_manager"], "possible_outputs": ["not found in system", "Customer ID 22334 not found in system.", "not found"], + "expected_output": "Customer ID 22334 not found in system.", # Example, adjust as needed "session_id": str(uuid.uuid4()), "category": "account_management" } diff --git a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb index 533b30c45..426f61184 100644 --- a/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb +++ b/notebooks/code_samples/agents/langgraph_agent_simple_banking_demo.ipynb @@ -44,8 +44,12 @@ " - [Dataframe Display Settings](#toc7_2_1__) \n", "- [Banking Accuracy Test](#toc8__) \n", "- [Banking Tool Call Accuracy Test](#toc9__) \n", - "- [Scorers in ValidMind](#toc10__) \n", - "- [Task Completion scorer](#toc11__) \n", + "- [Scorers in ValidMind](#toc10__)\n", + " - [Plan Quality Metric scorer](#toc10_1) \n", + " - [Plan Adherence Metric scorer](#toc10_2) \n", + " - [Tool Correctness Metric scorer](#toc10_3) \n", + " - [Argument Correctness Metric scorer](#toc10_4) \n", + " - [Task Completion scorer](#toc10_5) \n", "- [RAGAS Tests for an Agent Evaluation](#toc12__) \n", " - [Faithfulness](#toc12_1__) \n", " - [Response Relevancy](#toc12_2__) \n", @@ -154,7 +158,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q validmind langgraph" + "%pip install -q validmind " ] 
}, { @@ -231,7 +235,7 @@ " api_key=\"...\",\n", " api_secret=\"...\",\n", " model=\"...\",\n", - ")\n" + ")" ] }, { @@ -455,7 +459,13 @@ " Choose and use tools wisely to provide the most helpful banking assistance.\n", " \"\"\"\n", "# Initialize the main LLM for banking responses\n", - "main_llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0.3)\n", + "main_llm = ChatOpenAI(\n", + " model=\"gpt-5-mini\",\n", + " reasoning={\n", + " \"effort\": \"low\",\n", + " \"summary\": \"auto\"\n", + " }\n", + ")\n", "# Bind all banking tools to the main LLM\n", "llm_with_tools = main_llm.bind_tools(AVAILABLE_TOOLS)\n", "\n", @@ -539,7 +549,7 @@ "outputs": [], "source": [ "from validmind.models import Prompt\n", - "\n", + "from validmind.scorer.llm.deepeval import extract_tool_calls_from_agent_output, _convert_to_tool_call_list\n", "def banking_agent_fn(input):\n", " \"\"\"\n", " Invoke the banking agent with the given input.\n", @@ -578,7 +588,13 @@ " tool_calls_found.append(tool_call.name)\n", "\n", "\n", - " return {\"prediction\": result['messages'][-1].content, \"output\": result, \"tool_messages\": [tool_message], \"tool_calls\": tool_calls_found}\n", + " return {\n", + " \"prediction\": result['messages'][-1].content[0]['text'],\n", + " \"output\": result,\n", + " \"tool_messages\": [tool_message],\n", + " # \"tool_calls\": tool_calls_found,\n", + " \"tool_called\": _convert_to_tool_call_list(extract_tool_calls_from_agent_output(result))\n", + " }\n", " except Exception as e:\n", " # Return a fallback response if the agent fails\n", " error_message = f\"\"\"I apologize, but I encountered an error while processing your banking request: {str(e)}.\n", @@ -720,7 +736,7 @@ "\n", "vm_test_dataset = vm.init_dataset(\n", " input_id=\"banking_test_dataset\",\n", - " dataset=banking_test_dataset,\n", + " dataset=banking_test_dataset.sample(2),\n", " text_column=\"input\",\n", " target_column=\"possible_outputs\",\n", ")\n", @@ -755,28 +771,6 @@ "vm_test_dataset._df.head()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "#### Dataframe Display Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# pd.set_option('display.max_colwidth', 40)\n", - "# pd.set_option('display.width', 120)\n", - "# pd.set_option('display.max_colwidth', None)\n", - "# print(\"Banking Test Dataset with Predictions:\")\n", - "# vm_test_dataset._df.head()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -977,10 +971,155 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", + "\n", + "### AI Agent Evaluation Metrics\n", + "\n", + "AI agent evaluation metrics are specialized measurements designed to assess how well autonomous LLM-based agents reason, plan, select and execute tools, and ultimately complete user tasks by analyzing the **full execution trace**—including reasoning steps, tool calls, intermediate decisions, and outcomes—rather than just single input–output pairs.\n", + "\n", + "These metrics are essential because agent failures often occur in ways traditional LLM metrics miss (e.g., choosing the right tool with wrong arguments, creating a good plan but not following it, or completing a task inefficiently).\n", + "\n", + "**DeepEval’s AI agent evaluation framework** breaks evaluation into three layers with corresponding metric categories:\n", + "\n", + "1. 
**Reasoning Layer** – Evaluates planning and strategy generation:\n",
+    "\n",
+    "   * *PlanQualityMetric* – how logical, complete, and efficient the agent’s plan is\n",
+    "   * *PlanAdherenceMetric* – whether the agent follows its own plan during execution \n",
+    "\n",
+    "2. **Action Layer** – Assesses tool usage and argument generation:\n",
+    "\n",
+    "   * *ToolCorrectnessMetric* – whether the agent selects and calls the right tools\n",
+    "   * *ArgumentCorrectnessMetric* – whether the agent generates correct tool arguments\n",
+    "\n",
+    "3. **Execution Layer** – Measures end-to-end performance:\n",
+    "\n",
+    "   * *TaskCompletionMetric* – whether the agent successfully completes the intended task\n",
+    "   * *StepEfficiencyMetric* – whether the agent avoids unnecessary or redundant steps\n",
+    "\n",
+    "Together, these metrics enable granular diagnosis of agent behavior, help pinpoint where failures occur (reasoning, action, or execution), and support both development benchmarking and production monitoring."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "\n",
     "\n",
-    "## Task Completion scorer\n",
+    "#### **Reasoning Layer**\n",
+    "#### PlanQualityMetric\n",
+    "This metric measures how well the agent generates a plan before acting. A high score means the plan is logical, complete, and efficient."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "vm_test_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.PlanQuality\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"banking_agent_model_prediction\",\n",
+    "    tools_called_column = \"banking_agent_model_tool_called\",\n",
+    "    agent_output_column = \"banking_agent_model_output\",\n",
+    ")\n",
+    "vm_test_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "#### PlanAdherenceMetric\n",
+    "This metric checks whether the agent follows the plan it created. Deviations lower this score and indicate gaps between reasoning and execution."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vm_test_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.PlanAdherence\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"banking_agent_model_prediction\",\n",
+    "    expected_output_column = \"expected_output\",\n",
+    "    tools_called_column = \"banking_agent_model_tool_called\",\n",
+    "    agent_output_column = \"banking_agent_model_output\",\n",
+    "\n",
+    ")\n",
+    "vm_test_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "#### **Action Layer**\n",
+    "#### ToolCorrectnessMetric\n",
+    "This metric evaluates whether the agent selects the appropriate tool for the task. Choosing the wrong tool reduces performance even if reasoning was correct."
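# A minimal sketch of how the per-row results from the scorers above could be inspected,
# assuming each deepeval scorer returns one {"score", "reason"} dict per dataset row (as the
# scorer implementations in this patch do). The bare column names below are assumptions based
# on the assign_scores naming convention exercised in tests/test_dataset.py (scorer name,
# prefixed with the model input_id when a model is passed); adjust them to whatever actually
# appears in vm_test_dataset._df.columns after the cells above have run.
df = vm_test_dataset._df
for col in ["PlanQuality", "PlanAdherence", "ToolCorrectness", "ArgumentCorrectness"]:
    if col not in df.columns:
        continue
    # each value is expected to hold a 0-1 "score" plus an LLM-generated "reason"
    scores = df[col].apply(lambda v: v["score"] if isinstance(v, dict) else v)
    print(f"{col}: mean={scores.mean():.2f}, min={scores.min():.2f}")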
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vm_test_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.ToolCorrectness\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"banking_agent_model_prediction\",\n",
+    "    tools_called_column = \"banking_agent_model_tool_called\",\n",
+    "    expected_tools_column = \"expected_tools\",\n",
+    "    agent_output_column = \"banking_agent_model_output\",\n",
+    "\n",
+    ")\n",
+    "vm_test_dataset._df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "#### ArgumentCorrectnessMetric\n",
+    "This metric assesses whether the agent provides correct inputs or arguments to the selected tool. Incorrect arguments can lead to failed or unexpected results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vm_test_dataset.assign_scores(\n",
+    "    metrics = \"validmind.scorer.llm.deepeval.ArgumentCorrectness\",\n",
+    "    input_column = \"input\",\n",
+    "    actual_output_column = \"banking_agent_model_prediction\",\n",
+    "    tools_called_column = \"banking_agent_model_tool_called\",\n",
+    "    agent_output_column = \"banking_agent_model_output\",\n",
+    "\n",
+    ")\n",
+    "vm_test_dataset._df.head()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "#### **Execution Layer**\n",
+    "#### TaskCompletionMetric\n",
     "The TaskCompletion test evaluates whether our banking agent successfully completes the requested tasks by analyzing its outputs and tool usage. This metric assesses the agent's ability to understand user requests, execute appropriate actions, and provide complete responses that address the original query. The test provides a score between 0-1 along with detailed feedback on task completion quality."
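# A minimal sketch of what these agent scorers consume, assuming a LangGraph-style trace:
# the extract_tool_calls_from_agent_output helper added in validmind/scorer/llm/deepeval/__init__.py
# (renamed to validmind/scorers/... later in this patch series, and imported by this notebook)
# pairs each tool call with its response via tool_call_id and returns deepeval ToolCall objects.
# The literal trace contents below are invented purely for illustration.
from validmind.scorer.llm.deepeval import extract_tool_calls_from_agent_output

agent_output = {
    "messages": [
        # assistant message requesting a tool call
        {"tool_calls": [{"name": "credit_risk_analyzer",
                         "args": {"loan_amount": 50000, "credit_score": 720},
                         "id": "call_1"}]},
        # tool message carrying the response, matched back by tool_call_id
        {"name": "credit_risk_analyzer", "content": "LOW RISK", "tool_call_id": "call_1"},
    ]
}
tool_calls = extract_tool_calls_from_agent_output(agent_output)
# -> [ToolCall(name="credit_risk_analyzer",
#              input_parameters={"loan_amount": 50000, "credit_score": 720},
#              output="LOW RISK")]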
] }, @@ -992,12 +1131,13 @@ "source": [ "vm_test_dataset.assign_scores(\n", " metrics = \"validmind.scorer.llm.deepeval.TaskCompletion\",\n", - " input_column=\"input\",\n", - " tools_called_column=\"tools_called\",\n", - " actual_output_column=\"banking_agent_model_prediction\",\n", - " agent_output_column=\"banking_agent_model_output\"\n", - " )\n", - "vm_test_dataset._df.head(2)" + " input_column = \"input\",\n", + " actual_output_column = \"banking_agent_model_prediction\",\n", + " agent_output_column = \"banking_agent_model_output\",\n", + " tools_called_column = \"banking_agent_model_tool_called\",\n", + "\n", + ")\n", + "vm_test_dataset._df.head()" ] }, { @@ -1323,7 +1463,7 @@ ], "metadata": { "kernelspec": { - "display_name": "ValidMind Library", + "display_name": "ValidMind (Poetry)", "language": "python", "name": "validmind" }, diff --git a/validmind/scorer/llm/deepeval/PlanQuality.py b/validmind/scorer/llm/deepeval/PlanQuality.py index 5bbd1e83b..48025b856 100644 --- a/validmind/scorer/llm/deepeval/PlanQuality.py +++ b/validmind/scorer/llm/deepeval/PlanQuality.py @@ -13,7 +13,7 @@ try: from deepeval import evaluate from deepeval.metrics import PlanQualityMetric - from deepeval.test_case import LLMTestCase + from deepeval.test_case import LLMTestCase, ToolCall except ImportError as e: if "deepeval" in str(e): raise MissingDependencyError( @@ -90,10 +90,17 @@ def PlanQuality( input_value = row[input_column] actual_output_value = row.get(actual_output_column, "") tools_called_value = row.get(tools_called_column, []) + if not isinstance(tools_called_value, list) or not all( + isinstance(tool, ToolCall) for tool in tools_called_value + ): + from validmind.scorer.llm.deepeval import _convert_to_tool_call_list + + tools_called_value = _convert_to_tool_call_list(tools_called_value) test_case = LLMTestCase( input=input_value, actual_output=actual_output_value, tools_called=tools_called_value, + _trace_dict=row.get(agent_output_column, {}), ) result = evaluate(test_cases=[test_case], metrics=[metric]) diff --git a/validmind/scorer/llm/deepeval/TaskCompletion.py b/validmind/scorer/llm/deepeval/TaskCompletion.py index 9599b49d0..ca9f3cec0 100644 --- a/validmind/scorer/llm/deepeval/TaskCompletion.py +++ b/validmind/scorer/llm/deepeval/TaskCompletion.py @@ -197,6 +197,7 @@ def TaskCompletion( input=input_value, actual_output=actual_output_value, tools_called=all_tool_calls, + _trace_dict=row.get(agent_output_column, {}), ) result = evaluate(test_cases=[test_case], metrics=[metric]) diff --git a/validmind/scorer/llm/deepeval/ToolCorrectness.py b/validmind/scorer/llm/deepeval/ToolCorrectness.py index 9b89f905b..e73c094fc 100644 --- a/validmind/scorer/llm/deepeval/ToolCorrectness.py +++ b/validmind/scorer/llm/deepeval/ToolCorrectness.py @@ -119,6 +119,7 @@ def ToolCorrectness( expected_tools=expected_tools_list, tools_called=tools_called_list, actual_output=actual_output_value, + _trace_dict=row.get(agent_output_column, {}), ) result = evaluate(test_cases=[test_case], metrics=[metric]) diff --git a/validmind/scorer/llm/deepeval/__init__.py b/validmind/scorer/llm/deepeval/__init__.py index e3fd32fd4..416b57e0c 100644 --- a/validmind/scorer/llm/deepeval/__init__.py +++ b/validmind/scorer/llm/deepeval/__init__.py @@ -13,7 +13,6 @@ from .ArgumentCorrectness import ArgumentCorrectness from .PlanAdherence import PlanAdherence from .PlanQuality import PlanQuality -from .StepEfficiency import StepEfficiency from .ToolCorrectness import ToolCorrectness __all__ = [ @@ -21,7 +20,6 @@ "ArgumentCorrectness", 
"PlanAdherence", "PlanQuality", - "StepEfficiency", "ToolCorrectness", "_extract_tool_responses", "_extract_tool_calls_from_message", @@ -140,13 +138,24 @@ def _convert_to_tool_call_list(tools: Any) -> List[ToolCall]: Args: tools: List of tools in ToolCall, dict, or str format. + If already a list of ToolCall objects, returns them as-is. Returns: List of ToolCall objects. """ + if ToolCall is None: + raise ImportError( + "deepeval.test_case.ToolCall is not available. " + "Please install deepeval: pip install deepeval" + ) + if not isinstance(tools, list): tools = [] + # If the input is already a list of ToolCall objects, return it directly + if tools and all(isinstance(tool, ToolCall) for tool in tools): + return tools + tool_call_list = [] for tool in tools: if isinstance(tool, ToolCall): From f5e51f52c9e8dc17712f28d1cd1b207aeca3689d Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Tue, 6 Jan 2026 12:15:15 +0000 Subject: [PATCH 4/6] rename scorer folder to scorers --- tests/test_dataset.py | 24 ++--- tests/test_scorer_decorator.py | 98 +++++++++---------- validmind/__init__.py | 2 +- validmind/{scorer => scorers}/__init__.py | 2 +- .../classification/AbsoluteError.py | 0 .../classification/BrierScore.py | 0 .../classification/CalibrationError.py | 0 .../classification/ClassBalance.py | 0 .../classification/Confidence.py | 0 .../classification/Correctness.py | 0 .../classification/LogLoss.py | 0 .../classification/OutlierScore.py | 0 .../classification/ProbabilityError.py | 0 .../classification/Uncertainty.py | 0 .../classification/__init__.py | 0 .../llm/deepeval/AnswerRelevancy.py | 0 .../llm/deepeval/ArgumentCorrectness.py | 2 +- .../{scorer => scorers}/llm/deepeval/Bias.py | 0 .../llm/deepeval/ContextualPrecision.py | 0 .../llm/deepeval/ContextualRecall.py | 0 .../llm/deepeval/ContextualRelevancy.py | 0 .../llm/deepeval/Faithfulness.py | 0 .../{scorer => scorers}/llm/deepeval/GEval.py | 0 .../llm/deepeval/Hallucination.py | 0 .../llm/deepeval/PlanAdherence.py | 0 .../llm/deepeval/PlanQuality.py | 2 +- .../llm/deepeval/Summarization.py | 0 .../llm/deepeval/TaskCompletion.py | 0 .../llm/deepeval/ToolCorrectness.py | 2 +- .../llm/deepeval/__init__.py | 0 validmind/tests/decorator.py | 24 ++--- validmind/tests/test_providers.py | 6 +- validmind/vm_models/dataset/dataset.py | 14 +-- 33 files changed, 88 insertions(+), 88 deletions(-) rename validmind/{scorer => scorers}/__init__.py (98%) rename validmind/{scorer => scorers}/classification/AbsoluteError.py (100%) rename validmind/{scorer => scorers}/classification/BrierScore.py (100%) rename validmind/{scorer => scorers}/classification/CalibrationError.py (100%) rename validmind/{scorer => scorers}/classification/ClassBalance.py (100%) rename validmind/{scorer => scorers}/classification/Confidence.py (100%) rename validmind/{scorer => scorers}/classification/Correctness.py (100%) rename validmind/{scorer => scorers}/classification/LogLoss.py (100%) rename validmind/{scorer => scorers}/classification/OutlierScore.py (100%) rename validmind/{scorer => scorers}/classification/ProbabilityError.py (100%) rename validmind/{scorer => scorers}/classification/Uncertainty.py (100%) rename validmind/{scorer => scorers}/classification/__init__.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/AnswerRelevancy.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/ArgumentCorrectness.py (98%) rename validmind/{scorer => scorers}/llm/deepeval/Bias.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/ContextualPrecision.py (100%) 
rename validmind/{scorer => scorers}/llm/deepeval/ContextualRecall.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/ContextualRelevancy.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/Faithfulness.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/GEval.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/Hallucination.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/PlanAdherence.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/PlanQuality.py (98%) rename validmind/{scorer => scorers}/llm/deepeval/Summarization.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/TaskCompletion.py (100%) rename validmind/{scorer => scorers}/llm/deepeval/ToolCorrectness.py (99%) rename validmind/{scorer => scorers}/llm/deepeval/__init__.py (100%) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index f5e6e590d..45423a88d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -534,7 +534,7 @@ def test_assign_scores_single_metric(self): vm_dataset.assign_predictions(model=vm_model) # Test assign_scores with single metric - vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorers.classification.LogLoss") # Check that the metric column was added expected_column = f"{vm_model.input_id}_LogLoss" @@ -566,7 +566,7 @@ def test_assign_scores_multiple_metrics(self): vm_dataset.assign_predictions(model=vm_model) # Test assign_scores with multiple metrics - metrics = ["validmind.scorer.classification.LogLoss", "validmind.scorer.classification.BrierScore", "validmind.scorer.classification.Confidence"] + metrics = ["validmind.scorers.classification.LogLoss", "validmind.scorers.classification.BrierScore", "validmind.scorers.classification.Confidence"] metrics_column_name = [metric.split(".")[-1] for metric in metrics] vm_dataset.assign_scores(model = vm_model, metrics = metrics) @@ -602,7 +602,7 @@ def test_assign_scores_with_parameters(self): vm_dataset.assign_predictions(model=vm_model) # Test assign_scores with parameters - vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorers.classification.LogLoss") # Check that the metric column was added expected_column = f"{vm_model.input_id}_LogLoss" @@ -630,7 +630,7 @@ def test_assign_scores_full_metric_id(self): vm_dataset.assign_predictions(model=vm_model) # Test assign_scores with full metric ID - full_metric_id = "validmind.scorer.classification.LogLoss" + full_metric_id = "validmind.scorers.classification.LogLoss" vm_dataset.assign_scores(model = vm_model, metrics = full_metric_id) # Check that the metric column was added with correct name @@ -659,7 +659,7 @@ def test_assign_scores_regression_model(self): vm_dataset.assign_predictions(model=vm_model) # Test assign_scores with available row metrics (using classification metrics for testing) - vm_dataset.assign_scores(model=vm_model, metrics=["validmind.scorer.classification.LogLoss", "validmind.scorer.classification.BrierScore"]) + vm_dataset.assign_scores(model=vm_model, metrics=["validmind.scorers.classification.LogLoss", "validmind.scorers.classification.BrierScore"]) # Check that both metric columns were added expected_columns = ["reg_model_LogLoss", "reg_model_BrierScore"] @@ -695,7 +695,7 @@ def test_assign_scores_no_model_input_id(self): vm_dataset.assign_predictions(model=vm_model) # Should work and 
create column without prefix - vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorers.classification.LogLoss") # Check that the metric column was added without prefix expected_column = "LogLoss" # No model prefix @@ -745,7 +745,7 @@ def test_assign_scores_no_predictions(self): # Don't assign predictions - test that assign_scores raises error # (row metrics require predictions to be available) with self.assertRaises(ValueError) as context: - vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorers.classification.LogLoss") self.assertIn("No prediction column found", str(context.exception)) @@ -767,7 +767,7 @@ def test_assign_scores_column_naming_convention(self): vm_dataset.assign_predictions(model=vm_model) # Test multiple metrics to verify naming convention - metrics = ["validmind.scorer.classification.LogLoss", "validmind.scorer.classification.BrierScore", "validmind.scorer.classification.Confidence"] + metrics = ["validmind.scorers.classification.LogLoss", "validmind.scorers.classification.BrierScore", "validmind.scorers.classification.Confidence"] metrics_column_name = [metric.split(".")[-1] for metric in metrics] vm_dataset.assign_scores(model = vm_model, metrics = metrics) @@ -800,8 +800,8 @@ def test_assign_scores_multiple_models(self): vm_dataset.assign_predictions(model=vm_rf_model) # Assign scores for both models - vm_dataset.assign_scores(model = vm_lr_model, metrics = "validmind.scorer.classification.LogLoss") - vm_dataset.assign_scores(model = vm_rf_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_lr_model, metrics = "validmind.scorers.classification.LogLoss") + vm_dataset.assign_scores(model = vm_rf_model, metrics = "validmind.scorers.classification.LogLoss") # Check that both metric columns exist with correct names lr_column = "lr_model_LogLoss" @@ -903,7 +903,7 @@ def test_assign_scores_mixed_model_scenarios(self): vm_dataset.assign_predictions(model=vm_model) # Scenario 1: Model with input_id (should have prefix) - vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorer.classification.LogLoss") + vm_dataset.assign_scores(model = vm_model, metrics = "validmind.scorers.classification.LogLoss") self.assertTrue("test_model_LogLoss" in vm_dataset.df.columns) # Scenario 2: Model without input_id (should not have prefix) @@ -911,7 +911,7 @@ def test_assign_scores_mixed_model_scenarios(self): vm_model_no_id.input_id = None # Assign predictions for this model too vm_dataset.assign_predictions(model=vm_model_no_id) - vm_dataset.assign_scores(model = vm_model_no_id, metrics = "validmind.scorer.classification.BrierScore") + vm_dataset.assign_scores(model = vm_model_no_id, metrics = "validmind.scorers.classification.BrierScore") self.assertTrue("BrierScore" in vm_dataset.df.columns) # Scenario 3: No model (should not have prefix) diff --git a/tests/test_scorer_decorator.py b/tests/test_scorer_decorator.py index 50b8a05e8..d582e4b29 100644 --- a/tests/test_scorer_decorator.py +++ b/tests/test_scorer_decorator.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Unit tests for the @scorer decorator functionality (merged). +Unit tests for the @scorer decorator functionality. 
This module includes two kinds of tests: 1) Integration tests that exercise the real ValidMind imports (skipped if imports fail) @@ -40,16 +40,16 @@ def tearDown(self): def test_scorer_with_explicit_id(self): """Test @scorer decorator with explicit ID.""" - @scorer("validmind.scorer.test.ExplicitScorer") + @scorer("validmind.scorers.test.ExplicitScorer") def explicit_scorer(model, dataset): """A scorer with explicit ID.""" return [1.0, 2.0, 3.0] # Check that the scorer is registered - registered_scorer = scorer_store.get_scorer("validmind.scorer.test.ExplicitScorer") + registered_scorer = scorer_store.get_scorer("validmind.scorers.test.ExplicitScorer") self.assertIsNotNone(registered_scorer) self.assertEqual(registered_scorer, explicit_scorer) - self.assertEqual(explicit_scorer.scorer_id, "validmind.scorer.test.ExplicitScorer") + self.assertEqual(explicit_scorer.scorer_id, "validmind.scorers.test.ExplicitScorer") def test_scorer_with_empty_parentheses(self): """Test @scorer() decorator with empty parentheses.""" @@ -89,43 +89,43 @@ def no_parentheses_scorer(model, dataset): def test_scorer_separation_from_tests(self): """Test that scorers are stored separately from regular tests.""" - @scorer("validmind.scorer.test.SeparationTest") + @scorer("validmind.scorers.test.SeparationTest") def separation_scorer(model, dataset): """A scorer for separation testing.""" return list([1.0]) # Check that scorer is in scorer store - scorer_in_store = scorer_store.get_scorer("validmind.scorer.test.SeparationTest") + scorer_in_store = scorer_store.get_scorer("validmind.scorers.test.SeparationTest") self.assertIsNotNone(scorer_in_store) self.assertEqual(scorer_in_store, separation_scorer) # Check that scorer is NOT in regular test store - test_in_store = test_store.get_test("validmind.scorer.test.SeparationTest") + test_in_store = test_store.get_test("validmind.scorers.test.SeparationTest") self.assertIsNone(test_in_store) def test_scorer_with_tags_and_tasks(self): """Test that @scorer decorator works with @tags and @tasks decorators.""" - @scorer("validmind.scorer.test.TaggedScorer") - @tags("test", "scorer", "tagged") + @scorer("validmind.scorers.test.TaggedScorer") + @tags("test", "score", "tagged") @tasks("classification") def tagged_scorer(model, dataset): """A scorer with tags and tasks.""" return list([1.0]) # Check that the scorer is registered - registered_scorer = scorer_store.get_scorer("validmind.scorer.test.TaggedScorer") + registered_scorer = scorer_store.get_scorer("validmind.scorers.test.TaggedScorer") self.assertIsNotNone(registered_scorer) # Check that tags and tasks are preserved self.assertTrue(hasattr(tagged_scorer, '__tags__')) - self.assertEqual(tagged_scorer.__tags__, ["test", "scorer", "tagged"]) + self.assertEqual(tagged_scorer.__tags__, ["test", "score", "tagged"]) self.assertTrue(hasattr(tagged_scorer, '__tasks__')) self.assertEqual(tagged_scorer.__tasks__, ["classification"]) def test_scorer_save_functionality(self): """Test that the save functionality is available.""" - @scorer("validmind.scorer.test.SaveTest") + @scorer("validmind.scorers.test.SaveTest") def save_test_scorer(model, dataset): """A scorer for testing save functionality.""" return list([1.0]) @@ -136,44 +136,44 @@ def save_test_scorer(model, dataset): def test_multiple_scorers_registration(self): """Test that multiple scorers can be registered without conflicts.""" - @scorer("validmind.scorer.test.Multiple1") + @scorer("validmind.scorers.test.Multiple1") def scorer1(model, dataset): return list([1.0]) - 
@scorer("validmind.scorer.test.Multiple2") + @scorer("validmind.scorers.test.Multiple2") def scorer2(model, dataset): return list([2.0]) - @scorer("validmind.scorer.test.Multiple3") + @scorer("validmind.scorers.test.Multiple3") def scorer3(model, dataset): return list([3.0]) # Check that all scorers are registered - self.assertIsNotNone(scorer_store.get_scorer("validmind.scorer.test.Multiple1")) - self.assertIsNotNone(scorer_store.get_scorer("validmind.scorer.test.Multiple2")) - self.assertIsNotNone(scorer_store.get_scorer("validmind.scorer.test.Multiple3")) + self.assertIsNotNone(scorer_store.get_scorer("validmind.scorers.test.Multiple1")) + self.assertIsNotNone(scorer_store.get_scorer("validmind.scorers.test.Multiple2")) + self.assertIsNotNone(scorer_store.get_scorer("validmind.scorers.test.Multiple3")) # Check that they are different functions self.assertNotEqual( - scorer_store.get_scorer("validmind.scorer.test.Multiple1"), - scorer_store.get_scorer("validmind.scorer.test.Multiple2") + scorer_store.get_scorer("validmind.scorers.test.Multiple1"), + scorer_store.get_scorer("validmind.scorers.test.Multiple2") ) def test_scorer_with_parameters(self): """Test that scorers can have parameters.""" - @scorer("validmind.scorer.test.ParameterScorer") + @scorer("validmind.scorers.test.ParameterScorer") def parameter_scorer(model, dataset, threshold: float = 0.5, multiplier: int = 2): """A scorer with parameters.""" return list([threshold * multiplier]) # Check that the scorer is registered - registered_scorer = scorer_store.get_scorer("validmind.scorer.test.ParameterScorer") + registered_scorer = scorer_store.get_scorer("validmind.scorers.test.ParameterScorer") self.assertIsNotNone(registered_scorer) self.assertEqual(registered_scorer, parameter_scorer) def test_scorer_docstring_preservation(self): """Test that docstrings are preserved.""" - @scorer("validmind.scorer.test.DocstringTest") + @scorer("validmind.scorers.test.DocstringTest") def docstring_scorer(model, dataset): """This is a test docstring for the scorer.""" return list([1.0]) @@ -199,15 +199,15 @@ def tearDown(self): def test_generate_id_from_path_classification(self, mock_abspath, mock_relpath, mock_getfile): """Test ID generation for classification scorer.""" # Mock the file path - mock_getfile.return_value = "/path/to/validmind/scorer/classification/BrierScore.py" - mock_abspath.return_value = "/path/to/validmind/scorer" + mock_getfile.return_value = "/path/to/validmind/scorers/classification/BrierScore.py" + mock_abspath.return_value = "/path/to/validmind/scorers" mock_relpath.return_value = "classification/BrierScore.py" def mock_function(): pass scorer_id = _generate_scorer_id_from_path(mock_function) - expected_id = "validmind.scorer.classification.BrierScore" + expected_id = "validmind.scorers.classification.BrierScore" self.assertEqual(scorer_id, expected_id) @patch('validmind.tests.decorator.inspect.getfile') @@ -216,15 +216,15 @@ def mock_function(): def test_generate_id_from_path_llm(self, mock_abspath, mock_relpath, mock_getfile): """Test ID generation for LLM scorer.""" # Mock the file path - mock_getfile.return_value = "/path/to/validmind/scorer/llm/deepeval/AnswerRelevancy.py" - mock_abspath.return_value = "/path/to/validmind/scorer" + mock_getfile.return_value = "/path/to/validmind/scorers/llm/deepeval/AnswerRelevancy.py" + mock_abspath.return_value = "/path/to/validmind/scorers" mock_relpath.return_value = "llm/deepeval/AnswerRelevancy.py" def mock_function(): pass scorer_id = 
_generate_scorer_id_from_path(mock_function) - expected_id = "validmind.scorer.llm.deepeval.AnswerRelevancy" + expected_id = "validmind.scorers.llm.deepeval.AnswerRelevancy" self.assertEqual(scorer_id, expected_id) @patch('validmind.tests.decorator.inspect.getfile') @@ -233,15 +233,15 @@ def mock_function(): def test_generate_id_from_path_root_scorer(self, mock_abspath, mock_relpath, mock_getfile): """Test ID generation for scorer in root scorer directory.""" # Mock the file path - mock_getfile.return_value = "/path/to/validmind/scorer/MyScorer.py" - mock_abspath.return_value = "/path/to/validmind/scorer" + mock_getfile.return_value = "/path/to/validmind/scorers/MyScorer.py" + mock_abspath.return_value = "/path/to/validmind/scorers" mock_relpath.return_value = "MyScorer.py" def mock_function(): pass scorer_id = _generate_scorer_id_from_path(mock_function) - expected_id = "validmind.scorer.MyScorer" + expected_id = "validmind.scorers.MyScorer" self.assertEqual(scorer_id, expected_id) @patch('validmind.tests.decorator.inspect.getfile') @@ -254,7 +254,7 @@ def mock_function(): pass scorer_id = _generate_scorer_id_from_path(mock_function) - expected_id = "validmind.scorer.mock_function" + expected_id = "validmind.scorers.mock_function" self.assertEqual(scorer_id, expected_id) @patch('validmind.tests.decorator.inspect.getfile') @@ -264,14 +264,14 @@ def test_generate_id_fallback_on_value_error(self, mock_abspath, mock_relpath, m """Test ID generation fallback when relative path calculation fails.""" # Mock getfile to return a path outside the scorer directory mock_getfile.return_value = "/path/to/some/other/directory/MyScorer.py" - mock_abspath.return_value = "/path/to/validmind/scorer" + mock_abspath.return_value = "/path/to/validmind/scorers" mock_relpath.side_effect = ValueError("Path not under scorer directory") def mock_function(): pass scorer_id = _generate_scorer_id_from_path(mock_function) - expected_id = "validmind.scorer.mock_function" + expected_id = "validmind.scorers.mock_function" self.assertEqual(scorer_id, expected_id) @@ -299,16 +299,16 @@ def test_scorer_store_singleton(self): def test_scorer_registration_and_retrieval(self): """Test complete registration and retrieval cycle.""" - @scorer("validmind.scorer.test.IntegrationTest") + @scorer("validmind.scorers.test.IntegrationTest") def integration_scorer(model, dataset): """Integration test scorer.""" return list([1.0, 2.0, 3.0]) # Test registration - self.assertIsNotNone(scorer_store.get_scorer("validmind.scorer.test.IntegrationTest")) + self.assertIsNotNone(scorer_store.get_scorer("validmind.scorers.test.IntegrationTest")) # Test retrieval - retrieved_scorer = scorer_store.get_scorer("validmind.scorer.test.IntegrationTest") + retrieved_scorer = scorer_store.get_scorer("validmind.scorers.test.IntegrationTest") self.assertEqual(retrieved_scorer, integration_scorer) # Test that it's callable @@ -316,7 +316,7 @@ def integration_scorer(model, dataset): def test_scorer_with_mock_model_and_dataset(self): """Test scorer execution with mock model and dataset.""" - @scorer("validmind.scorer.test.MockExecution") + @scorer("validmind.scorers.test.MockExecution") def mock_execution_scorer(model, dataset): """Scorer for mock execution testing.""" return list([1.0, 2.0, 3.0]) @@ -378,11 +378,11 @@ def _mock_scorer(func_or_id: Union[Callable[..., Any], str, None] = None) -> Cal def _decorator(func: Callable[..., Any]) -> Callable[..., Any]: if func_or_id is None or func_or_id == "": - scorer_id = f"validmind.scorer.{func.__name__}" + 
scorer_id = f"validmind.scorers.{func.__name__}" elif isinstance(func_or_id, str): scorer_id = func_or_id else: - scorer_id = f"validmind.scorer.{func.__name__}" + scorer_id = f"validmind.scorers.{func.__name__}" _mock_scorer_store.register_scorer(scorer_id, func) func.scorer_id = scorer_id @@ -406,18 +406,18 @@ def test_scorer_with_empty_string_id(self): @_mock_scorer("") def empty_string_scorer(model, dataset): return _MockList([1.0]) - self.assertEqual(empty_string_scorer.scorer_id, "validmind.scorer.empty_string_scorer") - self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorer.empty_string_scorer")) + self.assertEqual(empty_string_scorer.scorer_id, "validmind.scorers.empty_string_scorer") + self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorers.empty_string_scorer")) def test_scorer_with_none_id(self): @_mock_scorer(None) def none_id_scorer(model, dataset): return _MockList([1.0]) - self.assertEqual(none_id_scorer.scorer_id, "validmind.scorer.none_id_scorer") - self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorer.none_id_scorer")) + self.assertEqual(none_id_scorer.scorer_id, "validmind.scorers.none_id_scorer") + self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorers.none_id_scorer")) def test_scorer_with_complex_parameters(self): - @_mock_scorer("validmind.scorer.test.ComplexParams") + @_mock_scorer("validmind.scorers.test.ComplexParams") def complex_params_scorer( model, dataset, @@ -432,13 +432,13 @@ def complex_params_scorer( config = {"key": "value"} return _MockList([threshold, float(enabled), len(categories)]) - self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorer.test.ComplexParams")) + self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorers.test.ComplexParams")) def test_scorer_with_no_parameters(self): - @_mock_scorer("validmind.scorer.test.NoParams") + @_mock_scorer("validmind.scorers.test.NoParams") def no_params_scorer(model, dataset): return _MockList([1.0]) - self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorer.test.NoParams")) + self.assertIsNotNone(_mock_scorer_store.get_scorer("validmind.scorers.test.NoParams")) if __name__ == '__main__': diff --git a/validmind/__init__.py b/validmind/__init__.py index 5b96ea574..356e3985b 100644 --- a/validmind/__init__.py +++ b/validmind/__init__.py @@ -48,7 +48,7 @@ except ImportError: ... -from . import scorer +from . import scorers as scorer from .__version__ import __version__ # noqa: E402 from .api_client import init, log_metric, log_text, reload from .client import ( # noqa: E402 diff --git a/validmind/scorer/__init__.py b/validmind/scorers/__init__.py similarity index 98% rename from validmind/scorer/__init__.py rename to validmind/scorers/__init__.py index 51032d109..f04cd4ab7 100644 --- a/validmind/scorer/__init__.py +++ b/validmind/scorers/__init__.py @@ -13,7 +13,7 @@ def list_scorers(**kwargs): vm_provider = test_provider_store.get_test_provider("validmind") vm_scorers_provider = vm_provider.scorers_provider - prefix = "validmind.scorer." + prefix = "validmind.scorers." 
return [ f"{prefix}{test_id}" for test_id in vm_scorers_provider.list_tests(**kwargs) diff --git a/validmind/scorer/classification/AbsoluteError.py b/validmind/scorers/classification/AbsoluteError.py similarity index 100% rename from validmind/scorer/classification/AbsoluteError.py rename to validmind/scorers/classification/AbsoluteError.py diff --git a/validmind/scorer/classification/BrierScore.py b/validmind/scorers/classification/BrierScore.py similarity index 100% rename from validmind/scorer/classification/BrierScore.py rename to validmind/scorers/classification/BrierScore.py diff --git a/validmind/scorer/classification/CalibrationError.py b/validmind/scorers/classification/CalibrationError.py similarity index 100% rename from validmind/scorer/classification/CalibrationError.py rename to validmind/scorers/classification/CalibrationError.py diff --git a/validmind/scorer/classification/ClassBalance.py b/validmind/scorers/classification/ClassBalance.py similarity index 100% rename from validmind/scorer/classification/ClassBalance.py rename to validmind/scorers/classification/ClassBalance.py diff --git a/validmind/scorer/classification/Confidence.py b/validmind/scorers/classification/Confidence.py similarity index 100% rename from validmind/scorer/classification/Confidence.py rename to validmind/scorers/classification/Confidence.py diff --git a/validmind/scorer/classification/Correctness.py b/validmind/scorers/classification/Correctness.py similarity index 100% rename from validmind/scorer/classification/Correctness.py rename to validmind/scorers/classification/Correctness.py diff --git a/validmind/scorer/classification/LogLoss.py b/validmind/scorers/classification/LogLoss.py similarity index 100% rename from validmind/scorer/classification/LogLoss.py rename to validmind/scorers/classification/LogLoss.py diff --git a/validmind/scorer/classification/OutlierScore.py b/validmind/scorers/classification/OutlierScore.py similarity index 100% rename from validmind/scorer/classification/OutlierScore.py rename to validmind/scorers/classification/OutlierScore.py diff --git a/validmind/scorer/classification/ProbabilityError.py b/validmind/scorers/classification/ProbabilityError.py similarity index 100% rename from validmind/scorer/classification/ProbabilityError.py rename to validmind/scorers/classification/ProbabilityError.py diff --git a/validmind/scorer/classification/Uncertainty.py b/validmind/scorers/classification/Uncertainty.py similarity index 100% rename from validmind/scorer/classification/Uncertainty.py rename to validmind/scorers/classification/Uncertainty.py diff --git a/validmind/scorer/classification/__init__.py b/validmind/scorers/classification/__init__.py similarity index 100% rename from validmind/scorer/classification/__init__.py rename to validmind/scorers/classification/__init__.py diff --git a/validmind/scorer/llm/deepeval/AnswerRelevancy.py b/validmind/scorers/llm/deepeval/AnswerRelevancy.py similarity index 100% rename from validmind/scorer/llm/deepeval/AnswerRelevancy.py rename to validmind/scorers/llm/deepeval/AnswerRelevancy.py diff --git a/validmind/scorer/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py similarity index 98% rename from validmind/scorer/llm/deepeval/ArgumentCorrectness.py rename to validmind/scorers/llm/deepeval/ArgumentCorrectness.py index c55105604..003354630 100644 --- a/validmind/scorer/llm/deepeval/ArgumentCorrectness.py +++ b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py @@ -84,7 +84,7 @@ def 
ArgumentCorrectness( ) # Import helper functions to avoid circular import - from validmind.scorer.llm.deepeval import ( + from validmind.scorers.llm.deepeval import ( _convert_to_tool_call_list, extract_tool_calls_from_agent_output, ) diff --git a/validmind/scorer/llm/deepeval/Bias.py b/validmind/scorers/llm/deepeval/Bias.py similarity index 100% rename from validmind/scorer/llm/deepeval/Bias.py rename to validmind/scorers/llm/deepeval/Bias.py diff --git a/validmind/scorer/llm/deepeval/ContextualPrecision.py b/validmind/scorers/llm/deepeval/ContextualPrecision.py similarity index 100% rename from validmind/scorer/llm/deepeval/ContextualPrecision.py rename to validmind/scorers/llm/deepeval/ContextualPrecision.py diff --git a/validmind/scorer/llm/deepeval/ContextualRecall.py b/validmind/scorers/llm/deepeval/ContextualRecall.py similarity index 100% rename from validmind/scorer/llm/deepeval/ContextualRecall.py rename to validmind/scorers/llm/deepeval/ContextualRecall.py diff --git a/validmind/scorer/llm/deepeval/ContextualRelevancy.py b/validmind/scorers/llm/deepeval/ContextualRelevancy.py similarity index 100% rename from validmind/scorer/llm/deepeval/ContextualRelevancy.py rename to validmind/scorers/llm/deepeval/ContextualRelevancy.py diff --git a/validmind/scorer/llm/deepeval/Faithfulness.py b/validmind/scorers/llm/deepeval/Faithfulness.py similarity index 100% rename from validmind/scorer/llm/deepeval/Faithfulness.py rename to validmind/scorers/llm/deepeval/Faithfulness.py diff --git a/validmind/scorer/llm/deepeval/GEval.py b/validmind/scorers/llm/deepeval/GEval.py similarity index 100% rename from validmind/scorer/llm/deepeval/GEval.py rename to validmind/scorers/llm/deepeval/GEval.py diff --git a/validmind/scorer/llm/deepeval/Hallucination.py b/validmind/scorers/llm/deepeval/Hallucination.py similarity index 100% rename from validmind/scorer/llm/deepeval/Hallucination.py rename to validmind/scorers/llm/deepeval/Hallucination.py diff --git a/validmind/scorer/llm/deepeval/PlanAdherence.py b/validmind/scorers/llm/deepeval/PlanAdherence.py similarity index 100% rename from validmind/scorer/llm/deepeval/PlanAdherence.py rename to validmind/scorers/llm/deepeval/PlanAdherence.py diff --git a/validmind/scorer/llm/deepeval/PlanQuality.py b/validmind/scorers/llm/deepeval/PlanQuality.py similarity index 98% rename from validmind/scorer/llm/deepeval/PlanQuality.py rename to validmind/scorers/llm/deepeval/PlanQuality.py index 48025b856..bfa4ef664 100644 --- a/validmind/scorer/llm/deepeval/PlanQuality.py +++ b/validmind/scorers/llm/deepeval/PlanQuality.py @@ -93,7 +93,7 @@ def PlanQuality( if not isinstance(tools_called_value, list) or not all( isinstance(tool, ToolCall) for tool in tools_called_value ): - from validmind.scorer.llm.deepeval import _convert_to_tool_call_list + from validmind.scorers.llm.deepeval import _convert_to_tool_call_list tools_called_value = _convert_to_tool_call_list(tools_called_value) test_case = LLMTestCase( diff --git a/validmind/scorer/llm/deepeval/Summarization.py b/validmind/scorers/llm/deepeval/Summarization.py similarity index 100% rename from validmind/scorer/llm/deepeval/Summarization.py rename to validmind/scorers/llm/deepeval/Summarization.py diff --git a/validmind/scorer/llm/deepeval/TaskCompletion.py b/validmind/scorers/llm/deepeval/TaskCompletion.py similarity index 100% rename from validmind/scorer/llm/deepeval/TaskCompletion.py rename to validmind/scorers/llm/deepeval/TaskCompletion.py diff --git a/validmind/scorer/llm/deepeval/ToolCorrectness.py 
b/validmind/scorers/llm/deepeval/ToolCorrectness.py similarity index 99% rename from validmind/scorer/llm/deepeval/ToolCorrectness.py rename to validmind/scorers/llm/deepeval/ToolCorrectness.py index e73c094fc..0bc3f0853 100644 --- a/validmind/scorer/llm/deepeval/ToolCorrectness.py +++ b/validmind/scorers/llm/deepeval/ToolCorrectness.py @@ -85,7 +85,7 @@ def ToolCorrectness( ) # Import helper functions to avoid circular import - from validmind.scorer.llm.deepeval import ( + from validmind.scorers.llm.deepeval import ( _convert_to_tool_call_list, extract_tool_calls_from_agent_output, ) diff --git a/validmind/scorer/llm/deepeval/__init__.py b/validmind/scorers/llm/deepeval/__init__.py similarity index 100% rename from validmind/scorer/llm/deepeval/__init__.py rename to validmind/scorers/llm/deepeval/__init__.py diff --git a/validmind/tests/decorator.py b/validmind/tests/decorator.py index a5b48a2c1..45ef65077 100644 --- a/validmind/tests/decorator.py +++ b/validmind/tests/decorator.py @@ -172,12 +172,12 @@ def scorer(func_or_id: Union[Callable[..., Any], str, None] = None) -> Callable[ This decorator registers the function it wraps as a scorer function within ValidMind under the provided ID. Once decorated, the function can be run using the - `validmind.scorer.run_scorer` function. + `validmind.scorers.run_scorer` function. The scorer ID can be provided in three ways: - 1. Explicit ID: `@scorer("validmind.scorer.classification.BrierScore")` + 1. Explicit ID: `@scorer("validmind.scorers.classification.BrierScore")` 2. Auto-generated from path: `@scorer()` - automatically generates ID from file path - 3. Function name only: `@scorer` - uses function name with validmind.scorer prefix + 3. Function name only: `@scorer` - uses function name with validmind.scorers prefix The function can take two different types of arguments: @@ -263,7 +263,7 @@ def _generate_scorer_id_from_path(func: Callable[..., Any]) -> str: func: The function to generate an ID for Returns: - str: The generated scorer ID in the format validmind.scorer.path.to.function + str: The generated scorer ID in the format validmind.scorers.path.to.function """ import inspect @@ -271,16 +271,16 @@ def _generate_scorer_id_from_path(func: Callable[..., Any]) -> str: # Get the file path of the function file_path = inspect.getfile(func) - # Find the scorer directory in the path - scorer_dir = os.path.join(os.path.dirname(__file__), "..", "scorer") - scorer_dir = os.path.abspath(scorer_dir) + # Find the scorers directory in the path + scorers_dir = os.path.join(os.path.dirname(__file__), "..", "scorers") + scorers_dir = os.path.abspath(scorers_dir) # Get relative path from scorer directory try: - rel_path = os.path.relpath(file_path, scorer_dir) + rel_path = os.path.relpath(file_path, scorers_dir) except ValueError: # If file is not under scorer directory, fall back to function name - return f"validmind.scorer.{func.__name__}" + return f"validmind.scorers.{func.__name__}" # Convert path to scorer ID # Remove .py extension and replace path separators with dots @@ -288,11 +288,11 @@ def _generate_scorer_id_from_path(func: Callable[..., Any]) -> str: # If the path is just the filename (no subdirectories), use it as is if scorer_path == func.__name__: - return f"validmind.scorer.{func.__name__}" + return f"validmind.scorers.{func.__name__}" # Otherwise, use the full path - return f"validmind.scorer.{scorer_path}" + return f"validmind.scorers.{scorer_path}" except (OSError, TypeError): # Fallback to function name if we can't determine the path - 
return f"validmind.scorer.{func.__name__}" + return f"validmind.scorers.{func.__name__}" diff --git a/validmind/tests/test_providers.py b/validmind/tests/test_providers.py index 06c67c139..d6ee1e119 100644 --- a/validmind/tests/test_providers.py +++ b/validmind/tests/test_providers.py @@ -163,7 +163,7 @@ def __init__(self) -> None: os.path.join(os.path.dirname(__file__), "..", "unit_metrics") ) self.scorers_provider = LocalTestProvider( - os.path.join(os.path.dirname(__file__), "..", "scorer") + os.path.join(os.path.dirname(__file__), "..", "scorers") ) self.test_provider = LocalTestProvider(os.path.dirname(__file__)) @@ -183,7 +183,7 @@ def load_test(self, test_id: str) -> Callable[..., Any]: return self.unit_metrics_provider.load_test( test_id.replace("unit_metrics.", "") ) - elif test_id.startswith("scorer."): - return self.scorers_provider.load_test(test_id.replace("scorer.", "")) + elif test_id.startswith("scorers."): + return self.scorers_provider.load_test(test_id.replace("scorers.", "")) else: return self.test_provider.load_test(test_id) diff --git a/validmind/vm_models/dataset/dataset.py b/validmind/vm_models/dataset/dataset.py index 168094ffe..e277cbae5 100644 --- a/validmind/vm_models/dataset/dataset.py +++ b/validmind/vm_models/dataset/dataset.py @@ -477,7 +477,7 @@ def assign_scores( metrics (Union[str, List[str]]): Single metric ID or list of metric IDs. Can be either: - Short name (e.g., "BrierScore", "LogLoss") - - Full metric ID (e.g., "validmind.scorer.classification.BrierScore") + - Full metric ID (e.g., "validmind.scorers.classification.BrierScore") **kwargs: Additional parameters passed to the row metrics. Examples: @@ -520,11 +520,11 @@ def _assign_single_score( """Compute and add a single metric's scores as dataset columns.""" # Import scorer module try: - from validmind.scorer import run_scorer + from validmind.scorers import run_scorer except ImportError as e: raise ImportError( f"Failed to import scorer module: {e}. " - "Make sure validmind.scorer is available." + "Make sure validmind.scorers is available." ) from e # Normalize metric ID and name @@ -768,12 +768,12 @@ def _normalize_metric_id(self, metric: str) -> str: str: Full metric ID """ # If already a full ID, return as-is - if metric.startswith("validmind.scorer."): + if metric.startswith("validmind.scorers."): return metric # Try to find the metric by short name try: - from validmind.scorer import list_scorers + from validmind.scorers import list_scorers from validmind.tests._store import scorer_store # Get built-in scorers @@ -797,11 +797,11 @@ def _normalize_metric_id(self, metric: str) -> str: suggestions = [m for m in available_metrics if metric.lower() in m.lower()] if suggestions: raise ValueError( - f"Metric '{metric}' not found in scorer. Did you mean one of: {suggestions[:5]}" + f"Metric '{metric}' not found in scorers. Did you mean one of: {suggestions[:5]}" ) else: raise ValueError( - f"Metric '{metric}' not found in scorer. Available metrics: {available_metrics[:10]}..." + f"Metric '{metric}' not found in scorers. Available metrics: {available_metrics[:10]}..." 
) except ImportError as e: From 561ff2922cb54428f5ab2e675ac2fea9c1797539 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Thu, 8 Jan 2026 11:28:42 +0000 Subject: [PATCH 5/6] add relevant tags in the deepeval scorers --- .../scorers/llm/deepeval/AnswerRelevancy.py | 2 +- .../llm/deepeval/ArgumentCorrectness.py | 9 ++++- validmind/scorers/llm/deepeval/Bias.py | 2 +- .../llm/deepeval/ContextualPrecision.py | 2 +- .../llm/deepeval/ContextualRelevancy.py | 2 +- .../scorers/llm/deepeval/Faithfulness.py | 2 +- .../scorers/llm/deepeval/PlanAdherence.py | 38 +------------------ validmind/scorers/llm/deepeval/PlanQuality.py | 2 +- .../scorers/llm/deepeval/TaskCompletion.py | 2 +- .../scorers/llm/deepeval/ToolCorrectness.py | 4 +- 10 files changed, 19 insertions(+), 46 deletions(-) diff --git a/validmind/scorers/llm/deepeval/AnswerRelevancy.py b/validmind/scorers/llm/deepeval/AnswerRelevancy.py index 784203f76..fd827f2bc 100644 --- a/validmind/scorers/llm/deepeval/AnswerRelevancy.py +++ b/validmind/scorers/llm/deepeval/AnswerRelevancy.py @@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "AnswerRelevancy", "deepeval") +@tags("llm", "AnswerRelevancy", "deepeval", "rag") @tasks("llm") def AnswerRelevancy( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py index 003354630..00bc78fd0 100644 --- a/validmind/scorers/llm/deepeval/ArgumentCorrectness.py +++ b/validmind/scorers/llm/deepeval/ArgumentCorrectness.py @@ -27,7 +27,14 @@ @scorer() -@tags("llm", "ArgumentCorrectness", "deepeval", "agent_evaluation", "action_layer") +@tags( + "llm", + "ArgumentCorrectness", + "deepeval", + "agent_evaluation", + "action_layer", + "agentic", +) @tasks("llm") def ArgumentCorrectness( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/Bias.py b/validmind/scorers/llm/deepeval/Bias.py index 12b2fe671..0f2b99a17 100644 --- a/validmind/scorers/llm/deepeval/Bias.py +++ b/validmind/scorers/llm/deepeval/Bias.py @@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "Bias", "deepeval") +@tags("llm", "Bias", "deepeval", "safety") @tasks("llm") def Bias( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/ContextualPrecision.py b/validmind/scorers/llm/deepeval/ContextualPrecision.py index 45959ee37..8c86d9528 100644 --- a/validmind/scorers/llm/deepeval/ContextualPrecision.py +++ b/validmind/scorers/llm/deepeval/ContextualPrecision.py @@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "ContextualPrecision", "deepeval") +@tags("llm", "ContextualPrecision", "deepeval", "rag") @tasks("llm") def ContextualPrecision( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/ContextualRelevancy.py b/validmind/scorers/llm/deepeval/ContextualRelevancy.py index 1e0c7708c..e56185308 100644 --- a/validmind/scorers/llm/deepeval/ContextualRelevancy.py +++ b/validmind/scorers/llm/deepeval/ContextualRelevancy.py @@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "ContextualRelevancy", "deepeval") +@tags("llm", "ContextualRelevancy", "deepeval", "rag") @tasks("llm") def ContextualRelevancy( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/Faithfulness.py b/validmind/scorers/llm/deepeval/Faithfulness.py index b37d32cc3..eb1c051c0 100644 --- a/validmind/scorers/llm/deepeval/Faithfulness.py +++ b/validmind/scorers/llm/deepeval/Faithfulness.py 
@@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "Faithfulness", "deepeval") +@tags("llm", "Faithfulness", "deepeval", "rag") @tasks("llm") def Faithfulness( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/PlanAdherence.py b/validmind/scorers/llm/deepeval/PlanAdherence.py index 26b3c327a..0335a6fdd 100644 --- a/validmind/scorers/llm/deepeval/PlanAdherence.py +++ b/validmind/scorers/llm/deepeval/PlanAdherence.py @@ -26,44 +26,8 @@ raise e -# def _extract_plan_value( -# row: Any, -# has_plan_column: bool, -# has_agent_output: bool, -# plan_column: str, -# agent_output_column: str, -# ) -> str: -# """Extract plan value from row.""" -# plan_value: Optional[str] = None -# if has_plan_column: -# plan_value = row.get(plan_column) -# elif has_agent_output: -# agent_output = row.get(agent_output_column, {}) -# if isinstance(agent_output, dict): -# plan_value = agent_output.get("plan") or agent_output.get("reasoning") -# return plan_value or "" - - -# def _extract_execution_steps_value( -# row: Any, -# has_execution_steps_column: bool, -# has_agent_output: bool, -# execution_steps_column: str, -# agent_output_column: str, -# ) -> str: -# """Extract execution steps value from row.""" -# execution_steps_value: Optional[str] = None -# if has_execution_steps_column: -# execution_steps_value = row.get(execution_steps_column) -# elif has_agent_output: -# agent_output = row.get(agent_output_column, {}) -# if isinstance(agent_output, dict): -# execution_steps_value = agent_output.get("execution_steps") or agent_output.get("steps") -# return execution_steps_value or "" - - @scorer() -@tags("llm", "PlanAdherence", "deepeval", "agent_evaluation", "reasoning_layer") +@tags("llm", "PlanAdherence", "deepeval", "agent_evaluation", "reasoning_layer", "agentic") @tasks("llm") def PlanAdherence( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/PlanQuality.py b/validmind/scorers/llm/deepeval/PlanQuality.py index bfa4ef664..f9b674640 100644 --- a/validmind/scorers/llm/deepeval/PlanQuality.py +++ b/validmind/scorers/llm/deepeval/PlanQuality.py @@ -27,7 +27,7 @@ @scorer() -@tags("llm", "deepeval", "agent_evaluation", "reasoning_layer") +@tags("llm", "deepeval", "agent_evaluation", "reasoning_layer", "agentic") @tasks("llm") def PlanQuality( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/TaskCompletion.py b/validmind/scorers/llm/deepeval/TaskCompletion.py index ca9f3cec0..70a2c490e 100644 --- a/validmind/scorers/llm/deepeval/TaskCompletion.py +++ b/validmind/scorers/llm/deepeval/TaskCompletion.py @@ -133,7 +133,7 @@ def extract_tool_calls_from_agent_output( # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "TaskCompletion", "deepeval") +@tags("llm", "TaskCompletion", "deepeval", "agentic") @tasks("llm") def TaskCompletion( dataset: VMDataset, diff --git a/validmind/scorers/llm/deepeval/ToolCorrectness.py b/validmind/scorers/llm/deepeval/ToolCorrectness.py index 0bc3f0853..dac8610f4 100644 --- a/validmind/scorers/llm/deepeval/ToolCorrectness.py +++ b/validmind/scorers/llm/deepeval/ToolCorrectness.py @@ -27,7 +27,9 @@ @scorer() -@tags("llm", "ToolCorrectness", "deepeval", "agent_evaluation", "action_layer") +@tags( + "llm", "ToolCorrectness", "deepeval", "agent_evaluation", "action_layer", "agentic" +) @tasks("llm") def ToolCorrectness( dataset: VMDataset, From 7aab5136e06918dc17e9c9b97d85bb903fcd5231 Mon Sep 17 00:00:00 2001 From: Anil Sorathiya Date: Thu, 8 Jan 2026 11:29:06 +0000 Subject: [PATCH 6/6] 
add relevant tags in the deepeval scorers --- validmind/scorers/llm/deepeval/ContextualRecall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validmind/scorers/llm/deepeval/ContextualRecall.py b/validmind/scorers/llm/deepeval/ContextualRecall.py index ee6df890f..14dde0282 100644 --- a/validmind/scorers/llm/deepeval/ContextualRecall.py +++ b/validmind/scorers/llm/deepeval/ContextualRecall.py @@ -28,7 +28,7 @@ # Create custom ValidMind tests for DeepEval metrics @scorer() -@tags("llm", "ContextualRecall", "deepeval") +@tags("llm", "ContextualRecall", "deepeval", "rag") @tasks("llm") def ContextualRecall( dataset: VMDataset,
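For reference, a minimal sketch of the three scorer-ID forms described in the updated `scorer` decorator docstring, using the renamed `validmind.scorers` prefix. The import path, scorer names, and function bodies below are illustrative assumptions and are not part of this patch.

```python
from validmind.tests.decorator import scorer


# 1. Explicit ID: registered verbatim under the renamed prefix.
@scorer("validmind.scorers.classification.MyCustomScore")
def my_custom_score(model, dataset):
    # Hypothetical per-row score; a real scorer would return one value per row.
    return [0.0]


# 2. Empty call: the ID is auto-generated from the file's path relative to
#    validmind/scorers/, falling back to the function name outside that tree.
@scorer()
def path_derived_score(model, dataset):
    return [0.0]


# 3. Bare decorator: the ID becomes "validmind.scorers.name_only_score".
@scorer
def name_only_score(model, dataset):
    return [0.0]
```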
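The renamed prefix also flows through `list_scorers` and `Dataset.assign_scores`. A hedged usage sketch follows: `vm_dataset` and `vm_model` stand in for objects you have already initialized, and the exact `assign_scores` signature may carry arguments not shown here.

```python
from validmind.scorers import list_scorers

# Built-in scorer IDs now carry the renamed prefix.
for scorer_id in list_scorers():
    assert scorer_id.startswith("validmind.scorers.")

# assign_scores resolves both forms through _normalize_metric_id.
vm_dataset.assign_scores(vm_model, metrics="BrierScore")  # short name
vm_dataset.assign_scores(
    vm_model,
    metrics=["validmind.scorers.classification.BrierScore"],  # full ID
)
```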
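The tag taxonomy introduced in patches 5 and 6 ("rag" for retrieval metrics, "agentic" for agent-evaluation metrics) can be reused when defining custom scorers. A sketch follows; the import locations of `tags`, `tasks`, and `VMDataset`, as well as the return shape, are assumptions rather than part of this change.

```python
from validmind import tags, tasks
from validmind.tests.decorator import scorer
from validmind.vm_models import VMDataset


@scorer()
@tags("llm", "deepeval", "agentic")
@tasks("llm")
def MyAgentScorer(dataset: VMDataset):
    # One score per row, mirroring the per-row pattern used by the DeepEval scorers.
    return [1.0] * len(dataset.df)
```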