From cd3626e626bf4980d42d04ee904a9cf725393f0c Mon Sep 17 00:00:00 2001 From: John Walz Date: Wed, 6 Aug 2025 13:56:40 -0400 Subject: [PATCH 01/11] Update dependencies and add PII detection utilities - Bump Poetry version to 2.1.3 in `poetry.lock`. - Introduce optional PII detection capabilities using Microsoft Presidio in `README.md`. - Add `pii_filter.py` for detecting and masking PII in test results. - Modify `check_for_sensitive_data` to utilize the new PII detection functionality. - Update `pyproject.toml` to include `presidio-analyzer` as an optional dependency. - Adjust `TestResult` class to ensure PII checks are performed correctly. --- README.md | 16 + poetry.lock | 913 ++++++++++++++++++++++- pyproject.toml | 23 +- validmind/__version__.py | 2 +- validmind/tests/__types__.py | 8 + validmind/vm_models/result/pii_filter.py | 209 ++++++ validmind/vm_models/result/result.py | 2 +- validmind/vm_models/result/utils.py | 35 +- 8 files changed, 1155 insertions(+), 53 deletions(-) create mode 100644 validmind/vm_models/result/pii_filter.py diff --git a/README.md b/README.md index 767c9b947..2484d50a2 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,22 @@ pip install validmind pip install rpy2 ``` +## PII Detection + +The ValidMind Library includes optional PII detection capabilities using Microsoft Presidio to automatically detect sensitive data in test results and prevent accidental logging. + +**Installation:** + +```bash +pip install validmind[pii-detection] +``` + +**Enable PII detection:** + +```bash +export VALIDMIND_PII_FILTERING_ENABLED=true +``` + ## How to contribute ### Install dependencies diff --git a/poetry.lock b/poetry.lock index 23c7b54ca..9e4bdbe2c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "aiodns" @@ -591,6 +591,48 @@ webencodings = "*" [package.extras] css = ["tinycss2 (>=1.1.0,<1.3)"] +[[package]] +name = "blis" +version = "1.2.1" +description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." +optional = true +python-versions = "<3.13,>=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "blis-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:112443b90698158ada38f71e74c079c3561e802554a51e9850d487c39db25de0"}, + {file = "blis-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b9f8c4fbc303f47778d1fd47916cae785b6f3beaa2031502112a8c0aa5eb29f6"}, + {file = "blis-1.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0260ecbbaa890f11d8c88e9ce37d4fc9a91839adc34ba1763ba89424362e54c9"}, + {file = "blis-1.2.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b70e0693564444b608d765727ab31618de3b92c5f203b9dc6b6a108170a8cea"}, + {file = "blis-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67ae48f73828cf38f65f24b6c6d8ec16f22c99820e0d13e7d97370682fdb023d"}, + {file = "blis-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9eff1af9b142fd156a7b83f513061f2e464c4409afb37080fde436e969951703"}, + {file = "blis-1.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d05f07fd37b407edb294322d3b2991b0950a61123076cc380d3e9c3deba77c83"}, + {file = "blis-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d5abc324180918a4d7ef81f31c37907d13e85f2831317cba3edacd4ef9b7d39"}, + {file = "blis-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:8de9a1e536202064b57c60d09ff0886275b50c5878df6d58fb49c731eaf535a7"}, + {file = "blis-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:778c4f72b71f97187e3304acfbd30eab98c9ba1a5b03b65128bc3875400ae604"}, + {file = "blis-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:5c5f2ffb0ae9c1f5aaa95b9681bcdd9a777d007c501fa220796329b939ca2790"}, + {file = "blis-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db4dc5d2d57106bb411633603a5c7d178a0845267c3efc7e5ea4fa7a44772976"}, + {file = "blis-1.2.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c621271c2843101927407e052b35a67f853da59d5c74e9e070e982c7f82e2e04"}, + {file = "blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43f65f882250b817566d7543abd1f6da297f1662e5dd9936e14c04b88285a497"}, + {file = "blis-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78a0613d559ccc426c101c67e8f84e1f93491e29d722c370872c538ee652bd07"}, + {file = "blis-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f5e32e5e5635fc7087b724b53120dbcd86201f56c0405882ce254bc0e493392"}, + {file = "blis-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d339c97cc83f53e39c1013d0dcd7d5278c853dc102d931132eeb05b226e28429"}, + {file = "blis-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:8d284323cc994e9b818c32046f1aa3e57bcc41c74e02daebdf0d3bc3e14355cb"}, + {file = "blis-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1cd35e94a1a97b37b31b11f097f998a3a0e75ac06d57e6edf7d9597200f55756"}, + {file = "blis-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7b6394d27f2259c580df8d13ebe9c0a188a6ace0a689e93d6e49cb15018d4d9c"}, + {file = "blis-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9c127159415dc772f345abc3575e1e2d02bb1ae7cb7f532267d67705be04c66"}, + {file = "blis-1.2.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5f9fa589aa72448009fd5001afb05e69f3bc953fe778b44580fd7d79ee8201a1"}, + {file = "blis-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1aa6150259caf4fa0b527bfc8c1e858542f9ca88a386aa90b93e1ca4c2add6df"}, + {file = "blis-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3ba67c09883cae52da3d9e9d3f4305464efedd336032c4d5c6c429b27b16f4c1"}, + {file = "blis-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7d9c5fca21b01c4b2f3cb95b71ce7ef95e58b3b62f0d79d1f699178c72c1e03e"}, + {file = "blis-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6952a4a1f15e0d1f73cc1206bd71368b32551f2e94852dae288b50c4ea0daf31"}, + {file = "blis-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:bd0360427b1669684cd35a8355be126d7a33992ccac6dcb1fbef5e100f4e3026"}, + {file = "blis-1.2.1.tar.gz", hash = "sha256:1066beedbedc2143c22bd28742658de05694afebacde8d8c2d14dd4b5a96765a"}, +] + +[package.dependencies] +numpy = {version = ">=1.19.0,<3.0.0", markers = "python_version >= \"3.9\""} + [[package]] name = "brotli" version = "1.1.0" @@ -726,6 +768,19 @@ files = [ [package.dependencies] cffi = ">=1.0.0" +[[package]] +name = "catalogue" +version = "2.0.10" +description = "Super lightweight function registries for your library" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f"}, + {file = "catalogue-2.0.10.tar.gz", hash = "sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15"}, +] + [[package]] name = "catboost" version = "1.2.7" @@ -995,6 +1050,28 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "cloudpathlib" +version = "0.21.1" +description = "pathlib-style classes for cloud storage services." 
+optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "cloudpathlib-0.21.1-py3-none-any.whl", hash = "sha256:bfe580ad72ec030472ec233cd7380701b2d3227da7b2898387bd170aa70c803c"}, + {file = "cloudpathlib-0.21.1.tar.gz", hash = "sha256:f26a855abf34d98f267aafd15efdb2db3c9665913dbabe5fad079df92837a431"}, +] + +[package.dependencies] +typing-extensions = {version = ">4", markers = "python_version < \"3.11\""} + +[package.extras] +all = ["cloudpathlib[azure]", "cloudpathlib[gs]", "cloudpathlib[s3]"] +azure = ["azure-storage-blob (>=12)", "azure-storage-file-datalake (>=12)"] +gs = ["google-cloud-storage"] +s3 = ["boto3 (>=1.34.0)"] + [[package]] name = "cloudpickle" version = "3.1.1" @@ -1038,6 +1115,23 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "confection" +version = "0.1.5" +description = "The sweetest config system for Python" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14"}, + {file = "confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +srsly = ">=2.4.0,<3.0.0" + [[package]] name = "contourpy" version = "1.1.1" @@ -1177,6 +1271,53 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "cymem" +version = "2.0.11" +description = "Manage calls to calloc/free through Cython" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "cymem-2.0.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:1b4dd8f8c2475c7c9948eefa89c790d83134600858d8d43b90276efd8df3882e"}, + {file = "cymem-2.0.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d46ba0d2e0f749195297d16f2286b55af7d7c084db2b853fdfccece2c000c5dc"}, + {file = "cymem-2.0.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739c4336b9d04ce9761851e9260ef77508d4a86ee3060e41302bfb6fa82c37de"}, + {file = "cymem-2.0.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a69c470c2fb118161f49761f9137384f46723c77078b659bba33858e19e46b49"}, + {file = "cymem-2.0.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:40159f6c92627438de970fd761916e745d70dfd84a7dcc28c1627eb49cee00d8"}, + {file = "cymem-2.0.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f503f98e6aa333fffbe657a6854f13a9c3de68860795ae21171284213b9c5c09"}, + {file = "cymem-2.0.11-cp310-cp310-win_amd64.whl", hash = "sha256:7f05ed5920cc92d6b958ec5da55bd820d326fe9332b90660e6fa67e3b476ceb1"}, + {file = "cymem-2.0.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3ee54039aad3ef65de82d66c40516bf54586287b46d32c91ea0530c34e8a2745"}, + {file = "cymem-2.0.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c05ef75b5db217be820604e43a47ccbbafea98ab6659d07cea92fa3c864ea58"}, + {file = "cymem-2.0.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d5381e5793ce531bac0dbc00829c8381f18605bb67e4b61d34f8850463da40"}, + {file = "cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b9d3f42d7249ac81802135cad51d707def058001a32f73fc7fbf3de7045ac7"}, + {file = "cymem-2.0.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:39b78f2195d20b75c2d465732f6b8e8721c5d4eb012777c2cb89bdb45a043185"}, + {file = "cymem-2.0.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2203bd6525a80d8fd0c94654a263af21c0387ae1d5062cceaebb652bf9bad7bc"}, + {file = "cymem-2.0.11-cp311-cp311-win_amd64.whl", hash = 
"sha256:aa54af7314de400634448da1f935b61323da80a49484074688d344fb2036681b"}, + {file = "cymem-2.0.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a0fbe19ce653cd688842d81e5819dc63f911a26e192ef30b0b89f0ab2b192ff2"}, + {file = "cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de72101dc0e6326f6a2f73e05a438d1f3c6110d41044236d0fbe62925091267d"}, + {file = "cymem-2.0.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee4395917f6588b8ac1699499128842768b391fe8896e8626950b4da5f9a406"}, + {file = "cymem-2.0.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02f2b17d760dc3fe5812737b1ce4f684641cdd751d67761d333a3b5ea97b83"}, + {file = "cymem-2.0.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:04ee6b4041ddec24512d6e969ed6445e57917f01e73b9dabbe17b7e6b27fef05"}, + {file = "cymem-2.0.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e1048dae7e627ee25f22c87bb670b13e06bc0aecc114b89b959a798d487d1bf4"}, + {file = "cymem-2.0.11-cp312-cp312-win_amd64.whl", hash = "sha256:0c269c7a867d74adeb9db65fa1d226342aacf44d64b7931282f0b0eb22eb6275"}, + {file = "cymem-2.0.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4a311c82f743275c84f708df89ac5bf60ddefe4713d532000c887931e22941f"}, + {file = "cymem-2.0.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:02ed92bead896cca36abad00502b14fa651bdf5d8319461126a2d5ac8c9674c5"}, + {file = "cymem-2.0.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44ddd3588379f8f376116384af99e3fb5f90091d90f520c341942618bf22f05e"}, + {file = "cymem-2.0.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87ec985623624bbd298762d8163fc194a096cb13282731a017e09ff8a60bb8b1"}, + {file = "cymem-2.0.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3385a47285435848e0ed66cfd29b35f3ed8703218e2b17bd7a0c053822f26bf"}, + {file = "cymem-2.0.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:5461e65340d6572eb64deadce79242a446a1d39cb7bf70fe7b7e007eb0d799b0"}, + {file = "cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7"}, + {file = "cymem-2.0.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1450498623d9f176d48578779c4e9d133c7f252f73c5a93b762f35d059a09398"}, + {file = "cymem-2.0.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a407fd8766e1f666c48cb232f760267cecf0acb04cc717d8ec4de6adc6ab8e0"}, + {file = "cymem-2.0.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6347aed08442679a57bcce5ad1e338f6b717e46654549c5d65c798552d910591"}, + {file = "cymem-2.0.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d8f11149b1a154de0e93f5eda0a13ad9948a739b58a2aace996ca41bbb6d0f5"}, + {file = "cymem-2.0.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7a2b4d1a9b1674d6ac0e4c5136b70b805535dc8d1060aa7c4ded3e52fb74e615"}, + {file = "cymem-2.0.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:dec13c1a84612815365939f59e128a0031cae5f6b5a86e4b8fd7c4efa3fad262"}, + {file = "cymem-2.0.11-cp39-cp39-win_amd64.whl", hash = "sha256:332ea5bc1c13c9a186532a06846881288eb846425898b70f047a0820714097bf"}, + {file = "cymem-2.0.11.tar.gz", hash = "sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd"}, +] + [[package]] name = "cython" version = "0.29.37" @@ -3151,6 +3292,26 @@ files = [ [package.dependencies] langchain-core = ">=0.3.51,<1.0.0" +[[package]] +name = "langcodes" +version = "3.5.0" +description = "Tools for labeling human languages with IETF language tags" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33"}, + {file = "langcodes-3.5.0.tar.gz", hash = 
"sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801"}, +] + +[package.dependencies] +language-data = ">=1.2" + +[package.extras] +build = ["build", "twine"] +test = ["pytest", "pytest-cov"] + [[package]] name = "langdetect" version = "1.0.9" @@ -3194,6 +3355,26 @@ openai-agents = ["openai-agents (>=0.0.3,<0.1)"] otel = ["opentelemetry-api (>=1.30.0,<2.0.0)", "opentelemetry-exporter-otlp-proto-http (>=1.30.0,<2.0.0)", "opentelemetry-sdk (>=1.30.0,<2.0.0)"] pytest = ["pytest (>=7.0.0)", "rich (>=13.9.4,<14.0.0)"] +[[package]] +name = "language-data" +version = "1.3.0" +description = "Supplementary data about languages used by the langcodes module" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf"}, + {file = "language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec"}, +] + +[package.dependencies] +marisa-trie = ">=1.1.0" + +[package.extras] +build = ["build", "twine"] +test = ["pytest", "pytest-cov"] + [[package]] name = "llvmlite" version = "0.41.1" @@ -3228,17 +3409,111 @@ files = [ {file = "llvmlite-0.41.1.tar.gz", hash = "sha256:f19f767a018e6ec89608e1f6b13348fa2fcde657151137cb64e56d48598a92db"}, ] +[[package]] +name = "marisa-trie" +version = "1.2.1" +description = "Static memory-efficient and fast Trie-like structures for Python." 
+optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "marisa_trie-1.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a2eb41d2f9114d8b7bd66772c237111e00d2bae2260824560eaa0a1e291ce9e8"}, + {file = "marisa_trie-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e956e6a46f604b17d570901e66f5214fb6f658c21e5e7665deace236793cef6"}, + {file = "marisa_trie-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd45142501300e7538b2e544905580918b67b1c82abed1275fe4c682c95635fa"}, + {file = "marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8443d116c612cfd1961fbf76769faf0561a46d8e317315dd13f9d9639ad500c"}, + {file = "marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:875a6248e60fbb48d947b574ffa4170f34981f9e579bde960d0f9a49ea393ecc"}, + {file = "marisa_trie-1.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:746a7c60a17fccd3cfcfd4326926f02ea4fcdfc25d513411a0c4fc8e4a1ca51f"}, + {file = "marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e70869737cc0e5bd903f620667da6c330d6737048d1f44db792a6af68a1d35be"}, + {file = "marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06b099dd743676dbcd8abd8465ceac8f6d97d8bfaabe2c83b965495523b4cef2"}, + {file = "marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d2a82eb21afdaf22b50d9b996472305c05ca67fc4ff5a026a220320c9c961db6"}, + {file = "marisa_trie-1.2.1-cp310-cp310-win32.whl", hash = "sha256:8951e7ce5d3167fbd085703b4cbb3f47948ed66826bef9a2173c379508776cf5"}, + {file = "marisa_trie-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:5685a14b3099b1422c4f59fa38b0bf4b5342ee6cc38ae57df9666a0b28eeaad3"}, + {file = "marisa_trie-1.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:ed3fb4ed7f2084597e862bcd56c56c5529e773729a426c083238682dba540e98"}, + {file = "marisa_trie-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fe69fb9ffb2767746181f7b3b29bbd3454d1d24717b5958e030494f3d3cddf3"}, + {file = "marisa_trie-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4728ed3ae372d1ea2cdbd5eaa27b8f20a10e415d1f9d153314831e67d963f281"}, + {file = "marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cf4f25cf895692b232f49aa5397af6aba78bb679fb917a05fce8d3cb1ee446d"}, + {file = "marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cca7f96236ffdbf49be4b2e42c132e3df05968ac424544034767650913524de"}, + {file = "marisa_trie-1.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d7eb20bf0e8b55a58d2a9b518aabc4c18278787bdba476c551dd1c1ed109e509"}, + {file = "marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b1ec93f0d1ee6d7ab680a6d8ea1a08bf264636358e92692072170032dda652ba"}, + {file = "marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e2699255d7ac610dee26d4ae7bda5951d05c7d9123a22e1f7c6a6f1964e0a4e4"}, + {file = "marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c484410911182457a8a1a0249d0c09c01e2071b78a0a8538cd5f7fa45589b13a"}, + {file = "marisa_trie-1.2.1-cp311-cp311-win32.whl", hash = "sha256:ad548117744b2bcf0e3d97374608be0a92d18c2af13d98b728d37cd06248e571"}, + {file = "marisa_trie-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:436f62d27714970b9cdd3b3c41bdad046f260e62ebb0daa38125ef70536fc73b"}, + {file = "marisa_trie-1.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:638506eacf20ca503fff72221a7e66a6eadbf28d6a4a6f949fcf5b1701bb05ec"}, + {file = "marisa_trie-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de1665eaafefa48a308e4753786519888021740501a15461c77bdfd57638e6b4"}, + {file = 
"marisa_trie-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f713af9b8aa66a34cd3a78c7d150a560a75734713abe818a69021fd269e927fa"}, + {file = "marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2a7d00f53f4945320b551bccb826b3fb26948bde1a10d50bb9802fabb611b10"}, + {file = "marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98042040d1d6085792e8d0f74004fc0f5f9ca6091c298f593dd81a22a4643854"}, + {file = "marisa_trie-1.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6532615111eec2c79e711965ece0bc95adac1ff547a7fff5ffca525463116deb"}, + {file = "marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:20948e40ab2038e62b7000ca6b4a913bc16c91a2c2e6da501bd1f917eeb28d51"}, + {file = "marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:66b23e5b35dd547f85bf98db7c749bc0ffc57916ade2534a6bbc32db9a4abc44"}, + {file = "marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6704adf0247d2dda42e876b793be40775dff46624309ad99bc7537098bee106d"}, + {file = "marisa_trie-1.2.1-cp312-cp312-win32.whl", hash = "sha256:3ad356442c2fea4c2a6f514738ddf213d23930f942299a2b2c05df464a00848a"}, + {file = "marisa_trie-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:f2806f75817392cedcacb24ac5d80b0350dde8d3861d67d045c1d9b109764114"}, + {file = "marisa_trie-1.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b5ea16e69bfda0ac028c921b58de1a4aaf83d43934892977368579cd3c0a2554"}, + {file = "marisa_trie-1.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9f627f4e41be710b6cb6ed54b0128b229ac9d50e2054d9cde3af0fef277c23cf"}, + {file = "marisa_trie-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5e649f3dc8ab5476732094f2828cc90cac3be7c79bc0c8318b6fda0c1d248db4"}, + {file = "marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:46e528ee71808c961baf8c3ce1c46a8337ec7a96cc55389d11baafe5b632f8e9"}, + {file = "marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36aa4401a1180615f74d575571a6550081d84fc6461e9aefc0bb7b2427af098e"}, + {file = "marisa_trie-1.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce59bcd2cda9bb52b0e90cc7f36413cd86c3d0ce7224143447424aafb9f4aa48"}, + {file = "marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f4cd800704a5fc57e53c39c3a6b0c9b1519ebdbcb644ede3ee67a06eb542697d"}, + {file = "marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2428b495003c189695fb91ceeb499f9fcced3a2dce853e17fa475519433c67ff"}, + {file = "marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:735c363d9aaac82eaf516a28f7c6b95084c2e176d8231c87328dc80e112a9afa"}, + {file = "marisa_trie-1.2.1-cp313-cp313-win32.whl", hash = "sha256:eba6ca45500ca1a042466a0684aacc9838e7f20fe2605521ee19f2853062798f"}, + {file = "marisa_trie-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:aa7cd17e1c690ce96c538b2f4aae003d9a498e65067dd433c52dd069009951d4"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5e43891a37b0d7f618819fea14bd951289a0a8e3dd0da50c596139ca83ebb9b1"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6946100a43f933fad6bc458c502a59926d80b321d5ac1ed2ff9c56605360496f"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4177dc0bd1374e82be9b2ba4d0c2733b0a85b9d154ceeea83a5bee8c1e62fbf"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f35c2603a6be168088ed1db6ad1704b078aa8f39974c60888fbbced95dcadad4"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = 
"sha256:d659fda873d8dcb2c14c2c331de1dee21f5a902d7f2de7978b62c6431a8850ef"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:b0ef26733d3c836be79e812071e1a431ce1f807955a27a981ebb7993d95f842b"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:536ea19ce6a2ce61c57fed4123ecd10d18d77a0db45cd2741afff2b8b68f15b3"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-win32.whl", hash = "sha256:0ee6cf6a16d9c3d1c94e21c8e63c93d8b34bede170ca4e937e16e1c0700d399f"}, + {file = "marisa_trie-1.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7e7b1786e852e014d03e5f32dbd991f9a9eb223dd3fa9a2564108b807e4b7e1c"}, + {file = "marisa_trie-1.2.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:952af3a5859c3b20b15a00748c36e9eb8316eb2c70bd353ae1646da216322908"}, + {file = "marisa_trie-1.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24a81aa7566e4ec96fc4d934581fe26d62eac47fc02b35fa443a0bb718b471e8"}, + {file = "marisa_trie-1.2.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9c9b32b14651a6dcf9e8857d2df5d29d322a1ea8c0be5c8ffb88f9841c4ec62b"}, + {file = "marisa_trie-1.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ac170d20b97beb75059ba65d1ccad6b434d777c8992ab41ffabdade3b06dd74"}, + {file = "marisa_trie-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da4e4facb79614cc4653cfd859f398e4db4ca9ab26270ff12610e50ed7f1f6c6"}, + {file = "marisa_trie-1.2.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25688f34cac3bec01b4f655ffdd6c599a01f0bd596b4a79cf56c6f01a7df3560"}, + {file = "marisa_trie-1.2.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:1db3213b451bf058d558f6e619bceff09d1d130214448a207c55e1526e2773a1"}, + {file = "marisa_trie-1.2.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:d5648c6dcc5dc9200297fb779b1663b8a4467bda034a3c69bd9c32d8afb33b1d"}, + {file = 
"marisa_trie-1.2.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:5bd39a4e1cc839a88acca2889d17ebc3f202a5039cd6059a13148ce75c8a6244"}, + {file = "marisa_trie-1.2.1-cp38-cp38-win32.whl", hash = "sha256:594f98491a96c7f1ffe13ce292cef1b4e63c028f0707effdea0f113364c1ae6c"}, + {file = "marisa_trie-1.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:5fe5a286f997848a410eebe1c28657506adaeb405220ee1e16cfcfd10deb37f2"}, + {file = "marisa_trie-1.2.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c0fe2ace0cb1806badbd1c551a8ec2f8d4cf97bf044313c082ef1acfe631ddca"}, + {file = "marisa_trie-1.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67f0c2ec82c20a02c16fc9ba81dee2586ef20270127c470cb1054767aa8ba310"}, + {file = "marisa_trie-1.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a3c98613180cf1730e221933ff74b454008161b1a82597e41054127719964188"}, + {file = "marisa_trie-1.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:429858a0452a7bedcf67bc7bb34383d00f666c980cb75a31bcd31285fbdd4403"}, + {file = "marisa_trie-1.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2eacb84446543082ec50f2fb563f1a94c96804d4057b7da8ed815958d0cdfbe"}, + {file = "marisa_trie-1.2.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:852d7bcf14b0c63404de26e7c4c8d5d65ecaeca935e93794331bc4e2f213660b"}, + {file = "marisa_trie-1.2.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e58788004adda24c401d1751331618ed20c507ffc23bfd28d7c0661a1cf0ad16"}, + {file = "marisa_trie-1.2.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aefe0973cc4698e0907289dc0517ab0c7cdb13d588201932ff567d08a50b0e2e"}, + {file = "marisa_trie-1.2.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c50c861faad0a5c091bd763e0729f958c316e678dfa065d3984fbb9e4eacbcd"}, + {file = "marisa_trie-1.2.1-cp39-cp39-win32.whl", hash = "sha256:b1ce340da608530500ab4f963f12d6bfc8d8680900919a60dbdc9b78c02060a4"}, + {file = 
"marisa_trie-1.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:ce37d8ca462bb64cc13f529b9ed92f7b21fe8d1f1679b62e29f9cb7d0e888b49"}, + {file = "marisa_trie-1.2.1.tar.gz", hash = "sha256:3a27c408e2aefc03e0f1d25b2ff2afb85aac3568f6fa2ae2a53b57a2e87ce29d"}, +] + +[package.dependencies] +setuptools = "*" + +[package.extras] +test = ["hypothesis", "pytest", "readme-renderer"] + [[package]] name = "markdown-it-py" version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, ] +markers = {main = "extra == \"all\" or extra == \"pii-detection\""} [package.dependencies] mdurl = ">=0.1,<1.0" @@ -3463,11 +3738,12 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +markers = {main = "extra == \"all\" or extra == \"pii-detection\""} [[package]] name = "mistune" @@ -3656,6 +3932,53 @@ files = [ {file = "multitasking-0.0.11.tar.gz", hash = "sha256:4d6bc3cc65f9b2dca72fb5a787850a88dae8f620c2b36ae9b55248e51bcd6026"}, ] +[[package]] +name = "murmurhash" +version = "1.0.13" +description = "Cython bindings for MurmurHash" +optional = true +python-versions = "<3.14,>=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "murmurhash-1.0.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:136c7017e7d59ef16f065c2285bf5d30557ad8260adf47714c3c2802725e3e07"}, + {file = "murmurhash-1.0.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d0292f6fcd99361157fafad5c86d508f367931b7699cce1e14747364596950cb"}, + {file = "murmurhash-1.0.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12265dc748257966c62041b677201b8fa74334a2548dc27f1c7a9e78dab7c2c1"}, + {file = "murmurhash-1.0.13-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e411d5be64d37f2ce10a5d4d74c50bb35bd06205745b9631c4d8b1cb193e540"}, + {file = "murmurhash-1.0.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:da3500ad3dbf75ac9c6bc8c5fbc677d56dfc34aec0a289269939d059f194f61d"}, + {file = "murmurhash-1.0.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b23278c5428fc14f3101f8794f38ec937da042198930073e8c86d00add0fa2f0"}, + {file = "murmurhash-1.0.13-cp310-cp310-win_amd64.whl", hash = "sha256:7bc27226c0e8d9927f8e59af0dfefc93f5009e4ec3dde8da4ba7751ba19edd47"}, + {file = "murmurhash-1.0.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b20d168370bc3ce82920121b78ab35ae244070a9b18798f4a2e8678fa03bd7e0"}, + {file = "murmurhash-1.0.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cef667d2e83bdceea3bc20c586c491fa442662ace1aea66ff5e3a18bb38268d8"}, + {file = "murmurhash-1.0.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:507148e50929ba1fce36898808573b9f81c763d5676f3fc6e4e832ff56b66992"}, + {file = "murmurhash-1.0.13-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d50f6173d266ad165beb8bca6101d824217fc9279f9e9981f4c0245c1e7ee6"}, + {file = "murmurhash-1.0.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0f272e15a84a8ae5f8b4bc0a68f9f47be38518ddffc72405791178058e9d019a"}, + {file = "murmurhash-1.0.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:f9423e0b0964ed1013a06c970199538c7ef9ca28c0be54798c0f1473a6591761"}, + {file = "murmurhash-1.0.13-cp311-cp311-win_amd64.whl", hash = "sha256:83b81e7084b696df3d853f2c78e0c9bda6b285d643f923f1a6fa9ab145d705c5"}, + {file = "murmurhash-1.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbe882e46cb3f86e092d8a1dd7a5a1c992da1ae3b39f7dd4507b6ce33dae7f92"}, + {file = "murmurhash-1.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:52a33a12ecedc432493692c207c784b06b6427ffaa897fc90b7a76e65846478d"}, + {file = "murmurhash-1.0.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:950403a7f0dc2d9c8d0710f07c296f2daab66299d9677d6c65d6b6fa2cb30aaa"}, + {file = "murmurhash-1.0.13-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fde9fb5d2c106d86ff3ef2e4a9a69c2a8d23ba46e28c6b30034dc58421bc107b"}, + {file = "murmurhash-1.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3aa55d62773745616e1ab19345dece122f6e6d09224f7be939cc5b4c513c8473"}, + {file = "murmurhash-1.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:060dfef1b405cf02c450f182fb629f76ebe7f79657cced2db5054bc29b34938b"}, + {file = "murmurhash-1.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:a8e79627d44a6e20a6487effc30bfe1c74754c13d179106e68cc6d07941b022c"}, + {file = "murmurhash-1.0.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8a7f8befd901379b6dc57a9e49c5188454113747ad6aa8cdd951a6048e10790"}, + {file = "murmurhash-1.0.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f741aab86007510199193eee4f87c5ece92bc5a6ca7d0fe0d27335c1203dface"}, + {file = "murmurhash-1.0.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82614f18fa6d9d83da6bb0918f3789a3e1555d0ce12c2548153e97f79b29cfc9"}, + {file = "murmurhash-1.0.13-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:91f22a48b9454712e0690aa0b76cf0156a5d5a083d23ec7e209cfaeef28f56ff"}, + {file = "murmurhash-1.0.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c4bc7938627b8fcb3d598fe6657cc96d1e31f4eba6a871b523c1512ab6dacb3e"}, + {file = "murmurhash-1.0.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58a61f1fc840f9ef704e638c39b8517bab1d21f1a9dbb6ba3ec53e41360e44ec"}, + {file = "murmurhash-1.0.13-cp313-cp313-win_amd64.whl", hash = "sha256:c451a22f14c2f40e7abaea521ee24fa0e46fbec480c4304c25c946cdb6e81883"}, + {file = "murmurhash-1.0.13-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:94371ea3df7bfbc9106a9b163e185190fa45b071028a6594c16f9e6722177683"}, + {file = "murmurhash-1.0.13-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1db35c354c6834aa0dcf693db34ccdf3b051c1cba59b8dc8992a4181c26ec463"}, + {file = "murmurhash-1.0.13-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:273939515100361dc27bfb3b0ccde462633b514e227dc22b29f99c34e742d794"}, + {file = "murmurhash-1.0.13-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b16a58afda1e285755a4c15cd3403d596c4c37d7770f45745f5ec76b80ba0fc5"}, + {file = "murmurhash-1.0.13-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1e858c40d051ae48ed23b288ecb49aa8f95955ad830d5803b4ce45e08106ec18"}, + {file = "murmurhash-1.0.13-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6e7250c095592ab9fc62a6d95728a15c33010f9347d9b3263dcffb33a89d3b7a"}, + {file = "murmurhash-1.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:3fff9b252b7abb737a7e9baf5a466a2abecb21be3a86a3d452a5696ee054bfcc"}, + {file = "murmurhash-1.0.13.tar.gz", hash = "sha256:737246d41ee00ff74b07b0bd1f0888be304d203ce668e642c86aa64ede30f8b7"}, +] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -4540,6 +4863,19 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "phonenumbers" +version = "9.0.10" +description = "Python version of Google's common 
library for parsing, formatting, storing and validating international phone numbers." +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "phonenumbers-9.0.10-py2.py3-none-any.whl", hash = "sha256:13b12d269be1f2b363c9bc2868656a7e2e8b50f1a1cef629c75005da6c374c6b"}, + {file = "phonenumbers-9.0.10.tar.gz", hash = "sha256:c2d15a6a9d0534b14a7764f51246ada99563e263f65b80b0251d1a760ac4a1ba"}, +] + [[package]] name = "pickleshare" version = "0.7.5" @@ -4779,6 +5115,83 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" +[[package]] +name = "preshed" +version = "3.0.10" +description = "Cython hash table that trusts the keys are pre-hashed" +optional = true +python-versions = "<3.14,>=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "preshed-3.0.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:14593c32e6705fda0fd54684293ca079530418bb1fb036dcbaa6c0ef0f144b7d"}, + {file = "preshed-3.0.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ba1960a3996678aded882260133853e19e3a251d9f35a19c9d7d830c4238c4eb"}, + {file = "preshed-3.0.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0830c0a262015be743a01455a1da5963750afed1bde2395590b01af3b7da2741"}, + {file = "preshed-3.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:165dda5862c28e77ee1f3feabad98d4ebb65345f458b5626596b92fd20a65275"}, + {file = "preshed-3.0.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e88e4c7fbbfa7c23a90d7d0cbe27e4c5fa2fd742ef1be09c153f9ccd2c600098"}, + {file = "preshed-3.0.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:87780ae00def0c97130c9d1652295ec8362c2e4ca553673b64fe0dc7b321a382"}, + {file = "preshed-3.0.10-cp310-cp310-win_amd64.whl", hash = "sha256:32496f216255a6cbdd60965dde29ff42ed8fc2d77968c28ae875e3856c6fa01a"}, + {file = 
"preshed-3.0.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d96c4fe2b41c1cdcc8c4fc1fdb10f922a6095c0430a3ebe361fe62c78902d068"}, + {file = "preshed-3.0.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb01ea930b96f3301526a2ab26f41347d07555e4378c4144c6b7645074f2ebb0"}, + {file = "preshed-3.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dd1f0a7b7d150e229d073fd4fe94f72610cae992e907cee74687c4695873a98"}, + {file = "preshed-3.0.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fd7b350c280137f324cd447afbf6ba9a849af0e8898850046ac6f34010e08bd"}, + {file = "preshed-3.0.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cf6a5fdc89ad06079aa6ee63621e417d4f4cf2a3d8b63c72728baad35a9ff641"}, + {file = "preshed-3.0.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b4c29a7bd66985808ad181c9ad05205a6aa7400cd0f98426acd7bc86588b93f8"}, + {file = "preshed-3.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:1367c1fd6f44296305315d4e1c3fe3171787d4d01c1008a76bc9466bd79c3249"}, + {file = "preshed-3.0.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6e9c46933d55c8898c8f7a6019a8062cd87ef257b075ada2dd5d1e57810189ea"}, + {file = "preshed-3.0.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c4ebc4f8ef0114d55f2ffdce4965378129c7453d0203664aeeb03055572d9e4"}, + {file = "preshed-3.0.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ab5ab4c6dfd3746fb4328e7fbeb2a0544416b872db02903bfac18e6f5cd412f"}, + {file = "preshed-3.0.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40586fd96ae3974c552a7cd78781b6844ecb1559ee7556586f487058cf13dd96"}, + {file = "preshed-3.0.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a606c24cda931306b98e0edfafed3309bffcf8d6ecfe07804db26024c4f03cd6"}, + {file = "preshed-3.0.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:394015566f9354738be903447039e8dbc6d93ba5adf091af694eb03c4e726b1e"}, + {file = "preshed-3.0.10-cp312-cp312-win_amd64.whl", hash = "sha256:fd7e38225937e580420c84d1996dde9b4f726aacd9405093455c3a2fa60fede5"}, + {file = "preshed-3.0.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:23e6e0581a517597f3f76bc24a4cdb0ba5509933d4f61c34fca49649dd71edf9"}, + {file = "preshed-3.0.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:574e6d6056981540310ff181b47a2912f4bddc91bcace3c7a9c6726eafda24ca"}, + {file = "preshed-3.0.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd658dd73e853d1bb5597976a407feafa681b9d6155bc9bc7b4c2acc2a6ee96"}, + {file = "preshed-3.0.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b95396046328ffb461a68859ce2141aca4815b8624167832d28ced70d541626"}, + {file = "preshed-3.0.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3e6728b2028bbe79565eb6cf676b5bae5ce1f9cc56e4bf99bb28ce576f88054d"}, + {file = "preshed-3.0.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c4ef96cb28bf5f08de9c070143113e168efccbb68fd4961e7d445f734c051a97"}, + {file = "preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed"}, + {file = "preshed-3.0.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:52f07d53a46510fe4d583272aa18ddb76904eb2fe58b534624e742a05be5f43e"}, + {file = "preshed-3.0.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e5e41cdb12f43a27fa5f8f5d788aa8b3b6eb699434bb1e95d0da3d18727a5f8d"}, + {file = "preshed-3.0.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60e93f8692d70597d19c59ef9b44e7e9def85a3060d3ff0f3629909bd996d9fa"}, + {file = "preshed-3.0.10-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23fd32c1f3519d1811d02a13a98cd9e7601d4a65b23c61e5bbc80460f11d748e"}, + {file = 
"preshed-3.0.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:25b2a0f3737fbb05f488eef0e62f82ac6573122bffb5119833af463f00455342"}, + {file = "preshed-3.0.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7ab8316d9aceb84d9e88e7cef48de92d0ad93f31cca8c91fbf98bc635a212707"}, + {file = "preshed-3.0.10-cp39-cp39-win_amd64.whl", hash = "sha256:a046e3070c8bdae7b7c888eca2d5a320f84406755ec6f20654b049f52b31eb51"}, + {file = "preshed-3.0.10.tar.gz", hash = "sha256:5a5c8e685e941f4ffec97f1fbf32694b8107858891a4bc34107fac981d8296ff"}, +] + +[package.dependencies] +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=0.28.0,<1.1.0" + +[[package]] +name = "presidio-analyzer" +version = "2.2.359" +description = "Presidio Analyzer package" +optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "presidio_analyzer-2.2.359-py3-none-any.whl", hash = "sha256:5f9a71ce5e484b1d9fd10a3f40ba37cb311deeb7cc25c3a87c0ba36b468ee26d"}, +] + +[package.dependencies] +phonenumbers = ">=8.12,<10.0.0" +pyyaml = "*" +regex = "*" +spacy = ">=3.4.4,<3.7.0 || >3.7.0,<4.0.0" +tldextract = "*" + +[package.extras] +azure-ai-language = ["azure-ai-textanalytics", "azure-core"] +gliner = ["gliner (>=0.2.13,<1.0.0) ; python_version >= \"3.10\"", "huggingface_hub", "onnxruntime (>=1.19) ; python_version >= \"3.10\"", "transformers"] +server = ["flask (>=1.1)", "gunicorn"] +stanza = ["stanza (>=1.10.1,<2.0.0)"] +transformers = ["accelerate", "huggingface_hub", "spacy_huggingface_pipelines", "transformers"] + [[package]] name = "prometheus-client" version = "0.21.1" @@ -5052,19 +5465,6 @@ files = [ {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, {file = 
"pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = 
"sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, ] [package.dependencies] @@ -5927,6 +6327,22 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-file" +version = "2.1.0" +description = "File transport adapter for Requests" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, + {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, +] + +[package.dependencies] +requests = ">=1.0.0" + [[package]] name = "requests-toolbelt" version = "1.0.0" @@ -5991,11 +6407,12 @@ version = "13.9.4" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.8.0" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, ] +markers = {main = "extra == \"all\" or extra == \"pii-detection\""} [package.dependencies] markdown-it-py = ">=2.2.0" @@ -6456,7 +6873,7 @@ files = [ {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"}, {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"}, ] -markers = {main = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +markers = {main = "extra == \"all\" or extra == \"pii-detection\" or platform_system == \"Linux\" and platform_machine == \"x86_64\""} [package.extras] check = ["pytest-checkdocs 
(>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] @@ -6520,6 +6937,19 @@ test = ["catboost", "gpboost", "lightgbm", "ngboost ; python_version < \"3.11\"" test-core = ["pytest", "pytest-cov", "pytest-mpl"] test-notebooks = ["datasets", "jupyter", "keras", "nbconvert", "nbformat", "nlp", "transformers"] +[[package]] +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, +] + [[package]] name = "six" version = "1.17.0" @@ -6544,6 +6974,33 @@ files = [ {file = "slicer-0.0.7.tar.gz", hash = "sha256:f5d5f7b45f98d155b9c0ba6554fa9770c6b26d5793a3e77a1030fb56910ebeec"}, ] +[[package]] +name = "smart-open" +version = "7.3.0.post1" +description = "Utils for streaming large files (S3, HDFS, GCS, SFTP, Azure Blob Storage, gzip, bz2, zst...)" +optional = true +python-versions = "<4.0,>=3.8" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4"}, + {file = "smart_open-7.3.0.post1.tar.gz", hash = "sha256:ce6a3d9bc1afbf6234ad13c010b77f8cd36d24636811e3c52c3b5160f5214d1e"}, +] + +[package.dependencies] +wrapt = "*" + +[package.extras] +all = ["smart_open[azure,gcs,http,s3,ssh,webhdfs,zst]"] +azure = ["azure-common", "azure-core", "azure-storage-blob"] +gcs = ["google-cloud-storage (>=2.6.0)"] +http = ["requests"] +s3 = ["boto3"] +ssh = ["paramiko"] +test = ["awscli", "moto[server]", "numpy", "pyopenssl", "pytest", "pytest-rerunfailures", 
"pytest_benchmark", "responses", "smart_open[all]"] +webhdfs = ["requests"] +zst = ["zstandard"] + [[package]] name = "sniffio" version = "1.3.1" @@ -6580,6 +7037,127 @@ files = [ {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, ] +[[package]] +name = "spacy" +version = "3.8.7" +description = "Industrial-strength Natural Language Processing (NLP) in Python" +optional = true +python-versions = "<3.14,>=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "spacy-3.8.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6ec0368ce96cd775fb14906f04b771c912ea8393ba30f8b35f9c4dc47a420b8e"}, + {file = "spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5672f8a0fe7a3847e925544890be60015fbf48a60a838803425f82e849dd4f18"}, + {file = "spacy-3.8.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60cde9fe8b15be04eb1e634c353d9c160187115d825b368cc1975452dd54f264"}, + {file = "spacy-3.8.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cac8e58fb92fb1c5e06328039595fa6589a9d1403681266f8f5e454d15319c"}, + {file = "spacy-3.8.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1456245a4ed04bc882db2d89a27ca1b6dc0b947b643bedaeaa5da11d9f7e22ec"}, + {file = "spacy-3.8.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bb98f85d467963d17c7c660884069ba948bde71c07280c91ee3235e554375308"}, + {file = "spacy-3.8.7-cp310-cp310-win_amd64.whl", hash = "sha256:b0df50d69e6691e97eae228733b321971607dbbb799e59d8470f2e70b8b27a8e"}, + {file = "spacy-3.8.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdff8b9b556468a6dd527af17f0ddf9fb0b0bee92ee7703339ddf542361cff98"}, + {file = "spacy-3.8.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9194b7cf015ed9b4450ffb162da49c8a9305e76b468de036b0948abdfc748a37"}, + {file = "spacy-3.8.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7dc38b78d48b9c2a80a3eea95f776304993f63fc307f07cdd104441442f92f1e"}, + {file = "spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e43bd70772751b8fc7a14f338d087a3d297195d43d171832923ef66204b23ab"}, + {file = "spacy-3.8.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c402bf5dcf345fd96d202378c54bc345219681e3531f911d99567d569328c45f"}, + {file = "spacy-3.8.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4234189861e486d86f1269e50542d87e8a6391a1ee190652479cf1a793db115f"}, + {file = "spacy-3.8.7-cp311-cp311-win_amd64.whl", hash = "sha256:e9d12e2eb7f36bc11dd9edae011032fe49ea100d63e83177290d3cbd80eaa650"}, + {file = "spacy-3.8.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:88b397e37793cea51df298e6c651a763e49877a25bead5ba349761531a456687"}, + {file = "spacy-3.8.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f70b676955fa6959347ca86ed6edd8ff0d6eb2ba20561fdfec76924bd3e540f9"}, + {file = "spacy-3.8.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c4b5a624797ade30c25b5b69daa35a93ee24bcc56bd79b0884b2565f76f35d6"}, + {file = "spacy-3.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9d83e006df66decccefa3872fa958b3756228fb216d83783595444cf42ca10c"}, + {file = "spacy-3.8.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dca25deba54f3eb5dcfbf63bf16e613e6c601da56f91c4a902d38533c098941"}, + {file = "spacy-3.8.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5eef3f805a1c118d9b709a23e2d378f5f20da5a0d6258c9cfdc87c4cb234b4fc"}, + {file = "spacy-3.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:25d7a68e445200c9e9dc0044f8b7278ec0ef01ccc7cb5a95d1de2bd8e3ed6be2"}, + {file = "spacy-3.8.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dda7d57f42ec57c19fbef348095a9c82504e4777bca7b8db4b0d8318ba280fc7"}, + {file = "spacy-3.8.7-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:de0e0bddb810ed05bce44bcb91460eabe52bc56323da398d2ca74288a906da35"}, + {file = "spacy-3.8.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a2e58f92b684465777a7c1a65d5578b1dc36fe55c48d9964fb6d46cc9449768"}, + {file = "spacy-3.8.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46330da2eb357d6979f40ea8fc16ee5776ee75cd0c70aac2a4ea10c80364b8f3"}, + {file = "spacy-3.8.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:86b6a6ad23ca5440ef9d29c2b1e3125e28722c927db612ae99e564d49202861c"}, + {file = "spacy-3.8.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccfe468cbb370888153df145ce3693af8e54dae551940df49057258081b2112f"}, + {file = "spacy-3.8.7-cp313-cp313-win_amd64.whl", hash = "sha256:ca81e416ff35209769e8b5dd5d13acc52e4f57dd9d028364bccbbe157c2ae86b"}, + {file = "spacy-3.8.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:be17d50eeade1cfdd743f532d594d2bb21da5788abfde61a7ed47b347d6e5b02"}, + {file = "spacy-3.8.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fdff9526d3f79914c6eae8eb40af440f0085be122264df2ada0f2ba294be2b42"}, + {file = "spacy-3.8.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdb15e6d22655479fdd55bf35b39459a753d68ba3fa5c339c8293925a9cd9012"}, + {file = "spacy-3.8.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1406fde475900c8340c917c71b2e3e8077a027ce9b4d373315cee9dc37322eb"}, + {file = "spacy-3.8.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f90d3a2b64323f89ef2cdfe3e4045dc63595ab7487d2ca3ea033aa69e25abf08"}, + {file = "spacy-3.8.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6cc95942a233d70238b201f7429f7cd8fdd7802e29ccb629da20fe82699959b5"}, + {file = "spacy-3.8.7-cp39-cp39-win_amd64.whl", hash = "sha256:8bfa987aee76cd710197a02ec7a94663b83387c8707f542c11b3f721278cb4e1"}, + {file = "spacy-3.8.7.tar.gz", hash = "sha256:700fd174c6c552276be142c48e70bb53cae24c4dd86003c4432af9cb93e4c908"}, +] + 
+[package.dependencies] +catalogue = ">=2.0.6,<2.1.0" +cymem = ">=2.0.2,<2.1.0" +jinja2 = "*" +langcodes = ">=3.2.0,<4.0.0" +murmurhash = ">=0.28.0,<1.1.0" +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +setuptools = "*" +spacy-legacy = ">=3.0.11,<3.1.0" +spacy-loggers = ">=1.0.0,<2.0.0" +srsly = ">=2.4.3,<3.0.0" +thinc = ">=8.3.4,<8.4.0" +tqdm = ">=4.38.0,<5.0.0" +typer = ">=0.3.0,<1.0.0" +wasabi = ">=0.9.1,<1.2.0" +weasel = ">=0.1.0,<0.5.0" + +[package.extras] +apple = ["thinc-apple-ops (>=1.0.0,<2.0.0)"] +cuda = ["cupy (>=5.0.0b4,<13.0.0)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] +cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] +ja = ["sudachidict_core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] +ko = ["natto-py (>=0.9.0)"] +lookups = ["spacy_lookups_data (>=1.0.3,<1.1.0)"] +th = ["pythainlp (>=2.0)"] +transformers = ["spacy_transformers (>=1.1.2,<1.4.0)"] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +description = "Legacy registered functions for spaCy backwards compatibility" +optional = true +python-versions = ">=3.6" +groups = 
["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, + {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, +] + +[[package]] +name = "spacy-loggers" +version = "1.0.5" +description = "Logging utilities for SpaCy" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, + {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, +] + [[package]] name = "sphinx" version = "6.2.1" @@ -6862,6 +7440,56 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] +[[package]] +name = "srsly" +version = "2.5.1" +description = "Modern high-performance serialization utilities for Python" +optional = true +python-versions = "<3.14,>=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "srsly-2.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d0cda6f65cc0dd1daf47e856b0d6c5d51db8a9343c5007723ca06903dcfe367d"}, + {file = "srsly-2.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf643e6f45c266cfacea54997a1f9cfe0113fadac1ac21a1ec5b200cfe477ba0"}, + {file = "srsly-2.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:467ed25ddab09ca9404fda92519a317c803b5ea0849f846e74ba8b7843557df5"}, + {file = "srsly-2.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f8113d202664b7d31025bdbe40b9d3536e8d7154d09520b6a1955818fa6d622"}, + {file = "srsly-2.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:794d39fccd2b333d24f1b445acc78daf90f3f37d3c0f6f0167f25c56961804e7"}, + {file = "srsly-2.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:df7fd77457c4d6c630f700b1019a8ad173e411e7cf7cfdea70e5ed86b608083b"}, + {file = "srsly-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:1a4dddb2edb8f7974c9aa5ec46dc687a75215b3bbdc815ce3fc9ea68fe1e94b5"}, + {file = "srsly-2.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58f0736794ce00a71d62a39cbba1d62ea8d5be4751df956e802d147da20ecad7"}, + {file = "srsly-2.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a8269c40859806d71920396d185f4f38dc985cdb6a28d3a326a701e29a5f629"}, + {file = "srsly-2.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889905900401fefc1032e22b73aecbed8b4251aa363f632b2d1f86fc16f1ad8e"}, + {file = "srsly-2.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf454755f22589df49c25dc799d8af7b47dce3d861dded35baf0f0b6ceab4422"}, + {file = "srsly-2.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc0607c8a59013a51dde5c1b4e465558728e9e0a35dcfa73c7cbefa91a0aad50"}, + {file = "srsly-2.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d5421ba3ab3c790e8b41939c51a1d0f44326bfc052d7a0508860fb79a47aee7f"}, + {file = "srsly-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:b96ea5a9a0d0379a79c46d255464a372fb14c30f59a8bc113e4316d131a530ab"}, + {file = "srsly-2.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:683b54ed63d7dfee03bc2abc4b4a5f2152f81ec217bbadbac01ef1aaf2a75790"}, + {file = "srsly-2.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:459d987130e57e83ce9e160899afbeb871d975f811e6958158763dd9a8a20f23"}, + {file = "srsly-2.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:184e3c98389aab68ff04aab9095bd5f1a8e5a72cc5edcba9d733bac928f5cf9f"}, + {file = "srsly-2.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:00c2a3e4856e63b7efd47591d049aaee8e5a250e098917f50d93ea68853fab78"}, + {file = "srsly-2.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:366b4708933cd8d6025c13c2cea3331f079c7bb5c25ec76fca392b6fc09818a0"}, + {file = "srsly-2.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8a0b03c64eb6e150d772c5149befbadd981cc734ab13184b0561c17c8cef9b1"}, + {file = "srsly-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:7952538f6bba91b9d8bf31a642ac9e8b9ccc0ccbb309feb88518bfb84bb0dc0d"}, + {file = "srsly-2.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84b372f7ef1604b4a5b3cee1571993931f845a5b58652ac01bcb32c52586d2a8"}, + {file = "srsly-2.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6ac3944c112acb3347a39bfdc2ebfc9e2d4bace20fe1c0b764374ac5b83519f2"}, + {file = "srsly-2.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6118f9c4b221cde0a990d06a42c8a4845218d55b425d8550746fe790acf267e9"}, + {file = "srsly-2.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7481460110d9986781d9e4ac0f5f991f1d6839284a80ad268625f9a23f686950"}, + {file = "srsly-2.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e57b8138082f09e35db60f99757e16652489e9e3692471d8e0c39aa95180688"}, + {file = "srsly-2.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bab90b85a63a1fe0bbc74d373c8bb9bb0499ddfa89075e0ebe8d670f12d04691"}, + {file = "srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee"}, + {file = "srsly-2.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d3b846ece78ec02aee637c1028cbbc6f0756faf8b01af190e9bbc8705321fc0"}, + {file = "srsly-2.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1529f5beb25a736ba1177f55532a942c786a8b4fe544bf9e9fbbebc5c63f4224"}, + {file = "srsly-2.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3c689a9f8dfa25c56533a3f145693b20ddc56415e25035e526ff7a7251a8c11"}, + 
{file = "srsly-2.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5982d01c7ddd62dbdb778a8bd176513d4d093cc56ef925fa2b0e13f71ed1809a"}, + {file = "srsly-2.5.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:196d3a2cc74758b2284e45f192e0df55d032b70be8481e207affc03216ddb464"}, + {file = "srsly-2.5.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:de756942e08ac3d8e8f5ae4595855932d7e4357f63adac6925b516c168f24711"}, + {file = "srsly-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:08b4045506cd4b63d2bb0da523156ab3ee67719aac3ca8cb591d6ed7ee55080e"}, + {file = "srsly-2.5.1.tar.gz", hash = "sha256:ab1b4bf6cf3e29da23dae0493dd1517fb787075206512351421b89b4fc27c77e"}, +] + +[package.dependencies] +catalogue = ">=2.0.3,<2.1.0" + [[package]] name = "stack-data" version = "0.6.3" @@ -7027,6 +7655,78 @@ dev = ["pre-commit (>=3.5,<4.0)", "textblob[tests]", "tox"] docs = ["PyYAML (==6.0.1)", "sphinx (==7.2.6)", "sphinx-issues (==4.0.0)"] tests = ["numpy", "pytest"] +[[package]] +name = "thinc" +version = "8.3.4" +description = "A refreshing functional take on deep learning, compatible with your favorite libraries" +optional = true +python-versions = "<3.13,>=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "thinc-8.3.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:916ea79a7c7462664be9435679b7769b4fc1ecea3886db6da6118e4eb5cc8c8b"}, + {file = "thinc-8.3.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6c985ce9cf82a611f4f348c721372d073537ca0e8b7bbb8bd865c1598ddd79d1"}, + {file = "thinc-8.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fff4b30f8513832d13a31486e9074a7020de3d48f8a3d1527e369c242d6ebe9"}, + {file = "thinc-8.3.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a9ee46d19b9f4cac13a5539f97978c857338a31e4bf8d9b3a7741dcbc792220f"}, + {file = "thinc-8.3.4-cp310-cp310-win_amd64.whl", hash = 
"sha256:d08529d53f8652e15e4f3c0f6953e73f85cc71d3b6e4750d2d9ace23616dbe8f"}, + {file = "thinc-8.3.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8bb4b47358a1855803b375f4432cefdf373f46ef249b554418d2e77c7323040"}, + {file = "thinc-8.3.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:00ed92f9a34b9794f51fcd48467c863f4eb7c5b41559aef6ef3c980c21378fec"}, + {file = "thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85691fca84a6a1506f7ddbd2c1706a5524d56f65582e76b2e260a06d9e83e86d"}, + {file = "thinc-8.3.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:eae1573fc19e514defc1bfd4f93f0b4bfc1dcefdb6d70bad1863825747f24800"}, + {file = "thinc-8.3.4-cp311-cp311-win_amd64.whl", hash = "sha256:81e8638f9bdc38e366674acc4b63cf7c6267266a15477963a5db21b3d9f1aa36"}, + {file = "thinc-8.3.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c9da6375b106df5186bd2bfd1273bc923c01ab7d482f8942e4ee528a28965c3a"}, + {file = "thinc-8.3.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:07091c6b5faace50857c4cf0982204969d77388d0a6f156dd2442297dceeb838"}, + {file = "thinc-8.3.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd40ad71bcd8b1b9daa0462e1255b1c1e86e901c2fd773966601f44a95878032"}, + {file = "thinc-8.3.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb10823b3a3f1c6440998b11bf9a3571dd859feaed0fdb510a1c1097d9dc6a86"}, + {file = "thinc-8.3.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5e5e7bf5dae142fd50ed9785971292c4aab4d9ed18e4947653b6a0584d5227c"}, + {file = "thinc-8.3.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:960366f41f0d5c4cecdf8610d03bdf80b14a959a7fe94008b788a5336d388781"}, + {file = "thinc-8.3.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d85babfae9b31e2e20f4884787b1391ca126f84e9b9f7f498990c07f7019f848"}, + {file = "thinc-8.3.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8791c87857c474499455bfdd3f58432e2dc1e2cdadf46eb2f3c2293851a8a837"}, + {file = 
"thinc-8.3.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c95456cbc1344ab9041c2e16c9fa065ac2b56520929a5a594b3c80ddda136b1e"}, + {file = "thinc-8.3.4-cp39-cp39-win_amd64.whl", hash = "sha256:11e6e14c1bfdb7c456f3da19dcf94def8304a7b279329f328e55062a292bc79f"}, + {file = "thinc-8.3.4.tar.gz", hash = "sha256:b5925482498bbb6dca0771e375b35c915818f735891e93d93a662dab15f6ffd8"}, +] + +[package.dependencies] +blis = ">=1.2.0,<1.3.0" +catalogue = ">=2.0.4,<2.1.0" +confection = ">=0.0.1,<1.0.0" +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=1.0.2,<1.1.0" +numpy = {version = ">=1.19.0,<3.0.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +setuptools = "*" +srsly = ">=2.4.0,<3.0.0" +wasabi = ">=0.8.1,<1.2.0" + +[package.extras] +apple = ["thinc-apple-ops (>=1.0.0,<2.0.0)"] +cuda = ["cupy (>=5.0.0b4)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4)"] +cuda11x = ["cupy-cuda11x (>=11.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] +datasets = ["ml_datasets (>=0.2.0,<0.3.0)"] +mxnet = ["mxnet (>=1.5.1,<1.6.0)"] +tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] +torch = ["torch (>=1.6.0)"] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -7111,6 +7811,29 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] +[[package]] +name = "tldextract" +version = 
"5.3.0" +description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "tldextract-5.3.0-py3-none-any.whl", hash = "sha256:f70f31d10b55c83993f55e91ecb7c5d84532a8972f22ec578ecfbe5ea2292db2"}, + {file = "tldextract-5.3.0.tar.gz", hash = "sha256:b3d2b70a1594a0ecfa6967d57251527d58e00bb5a91a74387baa0d87a0678609"}, +] + +[package.dependencies] +filelock = ">=3.0.8" +idna = "*" +requests = ">=2.1.0" +requests-file = ">=1.4" + +[package.extras] +release = ["build", "twine"] +testing = ["mypy", "pytest", "pytest-gitignore", "pytest-mock", "responses", "ruff", "syrupy", "tox", "tox-uv", "types-filelock", "types-requests"] + [[package]] name = "tokenizers" version = "0.20.3" @@ -7523,6 +8246,25 @@ rfc3986 = ">=1.4.0" rich = ">=12.0.0" urllib3 = ">=1.26.0" +[[package]] +name = "typer" +version = "0.16.0" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
+optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "typer-0.16.0-py3-none-any.whl", hash = "sha256:1f79bed11d4d02d4310e3c1b7ba594183bcedb0ac73b27a9e5f28f6fb5b98855"}, + {file = "typer-0.16.0.tar.gz", hash = "sha256:af377ffaee1dbe37ae9440cb4e8f11686ea5ce4e9bae01b84ae7c63b87f1dd3b"}, +] + +[package.dependencies] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" + [[package]] name = "types-python-dateutil" version = "2.9.0.20241206" @@ -7671,6 +8413,22 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +[[package]] +name = "wasabi" +version = "1.1.3" +description = "A lightweight console printing and formatting toolkit" +optional = true +python-versions = ">=3.6" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c"}, + {file = "wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878"}, +] + +[package.dependencies] +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python_version >= \"3.7\""} + [[package]] name = "wcwidth" version = "0.2.13" @@ 
-7683,6 +8441,30 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "weasel" +version = "0.4.1" +description = "Weasel: A small and easy workflow system" +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c"}, + {file = "weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9"}, +] + +[package.dependencies] +cloudpathlib = ">=0.7.0,<1.0.0" +confection = ">=0.0.4,<0.2.0" +packaging = ">=20.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +smart-open = ">=5.2.1,<8.0.0" +srsly = ">=2.4.3,<3.0.0" +typer = ">=0.3.0,<1.0.0" +wasabi = ">=0.9.1,<1.2.0" + [[package]] name = "webcolors" version = "24.8.0" @@ -7740,6 +8522,96 @@ files = [ {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"}, ] +[[package]] +name = "wrapt" +version = "1.17.2" +description = "Module for decorators, wrappers and monkey patching." 
+optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62"}, + {file = "wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563"}, + {file = "wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58"}, + 
{file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72"}, + {file = "wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317"}, + {file = "wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9"}, + {file = "wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9"}, + {file = "wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8"}, + {file = 
"wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504"}, + {file = "wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a"}, + {file = "wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb"}, + {file = 
"wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f"}, + {file = "wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555"}, + {file = "wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f"}, + {file = "wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7"}, + {file = "wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9"}, + {file = "wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb"}, + {file = "wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb"}, + {file = "wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8"}, + {file = "wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3"}, +] + [[package]] name = "xgboost" version = "2.1.4" @@ -8196,12 +9068,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["langchain-openai", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] +all = ["langchain-openai", "presidio-analyzer", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] huggingface = ["sentencepiece", "transformers"] llm = ["langchain-openai", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] +pii-detection = ["presidio-analyzer"] pytorch = ["torch"] [metadata] lock-version = "2.1" python-versions = ">=3.9.0,<3.12" -content-hash = "d44d66b661fc8ddca8f5c66fca73056d9b186e53a5aad0730e5de8209868f8bc" +content-hash = "b1fac5d0289db1b858f45c4d8f508a2db719b1402aae9dfa2a5292aaead2091d" diff --git a/pyproject.toml b/pyproject.toml index b84287a90..5c5415e41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,10 +10,10 @@ description = "ValidMind Library" license = "Commercial License" name = "validmind" readme = "README.pypi.md" -version = "2.8.29" +version = "2.9.0" [tool.poetry.dependencies] -aiohttp = {extras = ["speedups"], version = "*"} +aiohttp = { extras = ["speedups"], version = "*" } anywidget = "^0.9.13" arch = "*" bert-score = ">=0.3.13" @@ -23,41 +23,42 @@ evaluate = "*" h11 = ">=0.16.0" ipywidgets = "*" kaleido = ">=0.2.1,!=0.2.1.post1" 
-langchain-openai = {version = ">=0.1.8", optional = true} +langchain-openai = { version = ">=0.1.8", optional = true } langdetect = "*" -llvmlite = {version = "*", python = ">=3.8,<=3.11"} +llvmlite = { version = "*", python = ">=3.8,<=3.11" } matplotlib = "*" mistune = "^3.0.2" nest-asyncio = "^1.6.0" nltk = "^3.8.1" -numba = "<0.59.0" # TODO: https://github.com/validmind/validmind-library/pull/28 +numba = "<0.59.0" # TODO: https://github.com/validmind/validmind-library/pull/28 numpy = "*" openai = ">=1" pandas = ">=1.1,<=2.0.3" plotly = "<6.0.0" plotly-express = "*" polars = "*" -pycocoevalcap = {version = "^1.2", optional = true} +pycocoevalcap = { version = "^1.2", optional = true } python = ">=3.9.0,<3.12" python-dotenv = "*" -ragas = {version = ">=0.2.3,<=0.2.7", optional = true} +ragas = { version = ">=0.2.3,<=0.2.7", optional = true } rouge = ">=1" scikit-learn = "*,<1.6.0" scipy = "*" scorecardpy = "^0.1.9.6" seaborn = "*" -sentencepiece = {version = "^0.2.0", optional = true} +sentencepiece = { version = "^0.2.0", optional = true } sentry-sdk = "^1.24.0" shap = "0.44.1" statsmodels = "*" tabulate = "^0.8.9" textblob = "^0.18.0.post0" tiktoken = "*" -torch = {version = "2.7.0", optional = true} +torch = { version = "2.7.0", optional = true } tqdm = "*" -transformers = {version = "^4.32.0", optional = true} +transformers = { version = "^4.32.0", optional = true } xgboost = ">=1.5.2,<3" yfinance = "^0.2.48" +presidio-analyzer = { version = "^2.2.0", optional = true } [tool.poetry.group.dev.dependencies] black = "^22.1.0" @@ -86,6 +87,7 @@ all = [ "ragas", "sentencepiece", "langchain-openai", + "presidio-analyzer", ] huggingface = ["transformers", "sentencepiece"] llm = [ @@ -97,6 +99,7 @@ llm = [ "langchain-openai", ] pytorch = ["torch"] +pii-detection = ["presidio-analyzer"] [build-system] build-backend = "poetry.core.masonry.api" diff --git a/validmind/__version__.py b/validmind/__version__.py index e7d98bbba..43ce13db0 100644 --- 
a/validmind/__version__.py +++ b/validmind/__version__.py @@ -1 +1 @@ -__version__ = "2.8.28" +__version__ = "2.9.0" diff --git a/validmind/tests/__types__.py b/validmind/tests/__types__.py index 43346c41d..3979aa58c 100644 --- a/validmind/tests/__types__.py +++ b/validmind/tests/__types__.py @@ -187,6 +187,10 @@ "validmind.ongoing_monitoring.ScoreBandsDrift", "validmind.ongoing_monitoring.ScorecardHistogramDrift", "validmind.ongoing_monitoring.TargetPredictionDistributionPlot", + "validmind.plots.BoxPlot", + "validmind.plots.CorrelationHeatmap", + "validmind.plots.HistogramPlot", + "validmind.plots.ViolinPlot", "validmind.prompt_validation.Bias", "validmind.prompt_validation.Clarity", "validmind.prompt_validation.Conciseness", @@ -194,6 +198,10 @@ "validmind.prompt_validation.NegativeInstruction", "validmind.prompt_validation.Robustness", "validmind.prompt_validation.Specificity", + "validmind.stats.CorrelationAnalysis", + "validmind.stats.DescriptiveStats", + "validmind.stats.NormalityTests", + "validmind.stats.OutlierDetection", "validmind.unit_metrics.classification.Accuracy", "validmind.unit_metrics.classification.F1", "validmind.unit_metrics.classification.Precision", diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py new file mode 100644 index 000000000..4b41b62ee --- /dev/null +++ b/validmind/vm_models/result/pii_filter.py @@ -0,0 +1,209 @@ +# Copyright © 2023-2024 ValidMind Inc. All rights reserved. +# See the LICENSE file in the root of this repository for details. +# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial + +""" +PII filtering utilities using Microsoft Presidio for detecting and masking +personally identifiable information in test result data. 
+""" + +import os +from typing import Dict, List, Optional, Union + +import pandas as pd + +from ...logging import get_logger + +logger = get_logger(__name__) + +# Check if PII filtering is enabled via environment variable +PII_FILTERING_ENABLED = ( + os.getenv("VALIDMIND_PII_FILTERING_ENABLED", "false").lower() == "true" +) + +# Lazy load presidio components to avoid import errors when not installed +_analyzer = None + + +def _get_presidio_analyzer(): + """Lazy load Presidio analyzer to avoid import errors when not installed.""" + global _analyzer + if _analyzer is None: + try: + from presidio_analyzer import AnalyzerEngine # type: ignore + + _analyzer = AnalyzerEngine() + logger.debug("Presidio analyzer initialized successfully") + except ImportError: + logger.warning( + "Presidio analyzer not available. Install with: pip install validmind[pii-detection]" + ) + _analyzer = False + return _analyzer if _analyzer is not False else None + + +def is_pii_filtering_enabled() -> bool: + """Check if PII filtering is enabled and available.""" + return PII_FILTERING_ENABLED and _get_presidio_analyzer() is not None + + +def detect_pii_in_text( + text: str, + entities: Optional[List[str]] = None, + language: str = "en", + threshold: float = 0.5, +) -> List[Dict]: + """ + Detect PII entities in text using Presidio analyzer. + + Args: + text: The text to analyze for PII + entities: List of entity types to detect. 
If None, detects all supported entities + language: Language code for analysis (default: "en") + threshold: Minimum confidence score for PII detection (default: 0.5) + + Returns: + List of detected PII entities with their positions and confidence scores + """ + analyzer = _get_presidio_analyzer() + if analyzer is None: + logger.debug("PII detection skipped - Presidio not available") + return [] + + try: + # Default entities to detect common PII types + if entities is None: + entities = [ + "PERSON", + "EMAIL_ADDRESS", + "PHONE_NUMBER", + "CREDIT_CARD", + "US_SSN", + "US_DRIVER_LICENSE", + "IP_ADDRESS", + "LOCATION", + "DATE_TIME", + "US_PASSPORT", + "US_BANK_NUMBER", + "IBAN_CODE", + ] + + results = analyzer.analyze(text=text, entities=entities, language=language) + + # Filter results by confidence threshold + filtered_results = [ + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "text": text[result.start : result.end], + } + for result in results + if result.score >= threshold + ] + + if filtered_results: + logger.debug(f"Detected {len(filtered_results)} PII entities in text") + + return filtered_results + + except Exception as e: + logger.warning(f"PII detection failed: {e}") + return [] + + +def scan_dataframe_for_pii( + df: pd.DataFrame, + columns: Optional[List[str]] = None, + threshold: float = 0.5, + sample_size: int = 100, +) -> Dict[str, List[Dict]]: + """ + Scan a pandas DataFrame for PII content in text columns. 
+ + Args: + df: The DataFrame to scan + columns: List of column names to scan (if None, scans all string columns) + threshold: Minimum confidence score for PII detection + sample_size: Maximum number of rows to sample for PII detection + + Returns: + Dictionary mapping column names to lists of detected PII entities + """ + if not is_pii_filtering_enabled(): + return {} + + pii_findings = {} + + # Determine which columns to scan + if columns is None: + # Scan all string/object columns + columns = [col for col in df.columns if df[col].dtype == "object"] + + # Limit the number of rows to scan for performance + sample_df = df.head(sample_size) if len(df) > sample_size else df + + for column in columns: + column_pii = [] + + # Scan non-null string values in the column + for idx, value in sample_df[column].dropna().items(): + if isinstance(value, str) and len(value.strip()) > 0: + pii_entities = detect_pii_in_text(text=str(value), threshold=threshold) + + if pii_entities: + column_pii.extend( + [ + {**entity, "row_index": idx, "column": column} + for entity in pii_entities + ] + ) + + if column_pii: + pii_findings[column] = column_pii + logger.info(f"Found {len(column_pii)} PII entities in column '{column}'") + + return pii_findings + + +def check_table_for_pii( + table_data: Union[pd.DataFrame, List[Dict]], + threshold: float = 0.5, + raise_on_detection: bool = False, +) -> None: + """ + Check a table (DataFrame or list of dicts) for PII content. 
+ + Args: + table_data: The table data to check + threshold: Minimum confidence score for PII detection + raise_on_detection: If True, raises ValueError when PII is detected + + Raises: + ValueError: If PII is detected and raise_on_detection is True + """ + if not is_pii_filtering_enabled(): + return + + # Convert to DataFrame if it's a list of dicts + if isinstance(table_data, list): + if not table_data: + return + df = pd.DataFrame(table_data) + else: + df = table_data + + # Scan for PII + pii_findings = scan_dataframe_for_pii(df, threshold=threshold) + has_pii = bool(pii_findings) + + if has_pii and raise_on_detection: + entity_types = set() + for findings in pii_findings.values(): + entity_types.update(entity["entity_type"] for entity in findings) + + raise ValueError( + f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " + f"Pass `unsafe=True` to bypass PII filtering." + ) diff --git a/validmind/vm_models/result/result.py b/validmind/vm_models/result/result.py index ecc763af4..ee39f9647 100644 --- a/validmind/vm_models/result/result.py +++ b/validmind/vm_models/result/result.py @@ -532,7 +532,7 @@ def log( if not unsafe: for table in self.tables or []: - check_for_sensitive_data(table.data, self._get_flat_inputs()) + check_for_sensitive_data(table.data) if section_id: self._validate_section_id_for_block(section_id, position) diff --git a/validmind/vm_models/result/utils.py b/validmind/vm_models/result/utils.py index a9563f90d..c58139a21 100644 --- a/validmind/vm_models/result/utils.py +++ b/validmind/vm_models/result/utils.py @@ -11,7 +11,6 @@ from ... 
import api_client from ...logging import get_logger -from ..dataset import VMDataset from ..figure import Figure from ..input import VMInput @@ -52,28 +51,22 @@ async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] = await api_client.alog_metadata(content_id, text, _json) -def check_for_sensitive_data(data: pd.DataFrame, inputs: List[VMInput]): - """Check if the data contains sensitive information from input datasets.""" - dataset_columns = { - col: len(input_obj.df) - for input_obj in inputs - if isinstance(input_obj, VMDataset) - for col in input_obj.columns - } +def check_for_sensitive_data(data: pd.DataFrame): + """Check if the data contains sensitive information (PII).""" + # Check for PII content + try: + from .pii_filter import check_table_for_pii - table_columns = {col: len(data) for col in data.columns} + check_table_for_pii(table_data=data, threshold=0.5, raise_on_detection=True) - offending_columns = [ - col - for col in table_columns - if col in dataset_columns and table_columns[col] == dataset_columns[col] - ] - - if offending_columns: - raise ValueError( - f"Raw input data found in table, pass `unsafe=True` " - f"or remove the offending columns: {offending_columns}" - ) + except ImportError: + logger.debug("PII filtering not installed - skipping PII check") + except ValueError as e: + # PII was detected and raise_on_detection is True + raise e + except Exception as e: + # Log other PII checking errors but don't fail the entire operation + logger.warning(f"PII checking failed: {e}") def tables_to_widgets(tables: List["ResultTable"]): From ff4135399cb6d7af9925c2898ae58401f9fbef3b Mon Sep 17 00:00:00 2001 From: John Walz Date: Wed, 6 Aug 2025 13:57:11 -0400 Subject: [PATCH 02/11] refactor: remove unused import from utils.py --- validmind/vm_models/result/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/validmind/vm_models/result/utils.py b/validmind/vm_models/result/utils.py index c58139a21..6161c852c 100644 --- 
a/validmind/vm_models/result/utils.py +++ b/validmind/vm_models/result/utils.py @@ -12,7 +12,6 @@ from ... import api_client from ...logging import get_logger from ..figure import Figure -from ..input import VMInput if TYPE_CHECKING: from .result import ResultTable From f7404ccdecf85d5bf827f949c792e168cec91764 Mon Sep 17 00:00:00 2001 From: John Walz Date: Thu, 7 Aug 2025 12:27:43 -0400 Subject: [PATCH 03/11] feat: use enum for PII detection configuration and functionality - Update README.md to reflect new PII filtering options, replacing "Enable PII detection" with "Configure PII filtering" and detailing available modes. - Modify run_e2e_notebooks.py to accept a new command-line option for PII filtering mode, allowing users to specify their desired filtering behavior during notebook execution. - Implement PII filtering in test descriptions by adding a new function to filter PII from summaries before sending to the LLM. - Introduce an Enum for PII filtering modes in pii_filter.py, improving clarity and maintainability of PII filtering logic. - Update existing functions to utilize the new PII filtering capabilities, ensuring that PII is appropriately handled in test results and descriptions. 
--- README.md | 14 ++- scripts/run_e2e_notebooks.py | 27 +++++- validmind/ai/test_descriptions.py | 22 ++++- validmind/vm_models/result/pii_filter.py | 104 +++++++++++++++++++++-- 4 files changed, 152 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2484d50a2..21d3bd273 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,20 @@ The ValidMind Library includes optional PII detection capabilities using Microso pip install validmind[pii-detection] ``` -**Enable PII detection:** +**Configure PII filtering:** ```bash -export VALIDMIND_PII_FILTERING_ENABLED=true +# Enable PII filtering for test results only (default behavior) +export VALIDMIND_PII_FILTERING=test_results + +# Enable PII filtering for test descriptions only +export VALIDMIND_PII_FILTERING=test_descriptions + +# Enable PII filtering for both test results and descriptions +export VALIDMIND_PII_FILTERING=all + +# Disable PII filtering (default) +export VALIDMIND_PII_FILTERING=disabled ``` ## How to contribute diff --git a/scripts/run_e2e_notebooks.py b/scripts/run_e2e_notebooks.py index b5976b1bb..7d9cdab10 100644 --- a/scripts/run_e2e_notebooks.py +++ b/scripts/run_e2e_notebooks.py @@ -89,6 +89,7 @@ INIT_CELL_CODE = """ import os os.environ["VALIDMIND_LLM_DESCRIPTIONS_ENABLED"] = "0" +os.environ["VALIDMIND_PII_FILTERING"] = "{pii_filtering_mode}" import validmind as vm vm.init( @@ -110,6 +111,9 @@ # Print the path to the site-packages directory logger.info("Site-packages path: " + str(site.getsitepackages())) + +# Print PII filtering configuration for debugging +logger.info("PII filtering mode: " + os.environ.get("VALIDMIND_PII_FILTERING", "disabled")) """ @@ -135,7 +139,19 @@ default=False, help="Run notebooks for purpose of updating model data templates in backend.", ) -def main(kernel, log_output=False, progress_bar=True, update_data_template=False): +@click.option( + "--pii-filtering-mode", + default="disabled", + type=click.Choice(["disabled", "test_results", "test_descriptions", 
"all"]), + help="PII filtering mode to use for testing.", +) +def main( + kernel, + log_output=False, + progress_bar=True, + update_data_template=False, + pii_filtering_mode="disabled", +): """Run notebooks from the specified directory for end-to-end testing.""" if update_data_template: notebooks = DATA_TEMPLATE_NOTEBOOKS @@ -152,8 +168,10 @@ def main(kernel, log_output=False, progress_bar=True, update_data_template=False backup_notebook(notebook_path) try: - update_vm_init_cell(notebook_path, model) - click.echo(f"\n -------- Executing {notebook_path} ---------- \n") + update_vm_init_cell(notebook_path, model, pii_filtering_mode) + click.echo( + f"\n -------- Executing {notebook_path} (PII filtering: {pii_filtering_mode}) ---------- \n" + ) run_notebook( notebook_path=notebook_path, kernel_name=kernel, @@ -192,7 +210,7 @@ def run_notebook(notebook_path, kernel_name, log_output=False, progress_bar=True os.remove(output_path) -def update_vm_init_cell(notebook_path, model): +def update_vm_init_cell(notebook_path, model, pii_filtering_mode="disabled"): api_host = os.getenv( "NOTEBOOK_RUNNER_API_HOST", "https://api.dev.vm.validmind.ai/api/v1/tracking" ) @@ -204,6 +222,7 @@ def update_vm_init_cell(notebook_path, model): api_key=api_key, api_secret=api_secret, model=model, + pii_filtering_mode=pii_filtering_mode, ) with open(notebook_path, "r") as f: diff --git a/validmind/ai/test_descriptions.py b/validmind/ai/test_descriptions.py index 3a9a05ebe..eb83f081b 100644 --- a/validmind/ai/test_descriptions.py +++ b/validmind/ai/test_descriptions.py @@ -37,6 +37,23 @@ def _get_llm_global_context(): return context if context_enabled and context else None +def _filter_pii_from_summary(summary: Union[str, None]) -> Union[str, None]: + """Filter PII from summary text before sending to LLM.""" + if summary is None: + return summary + + try: + from ..vm_models.result.pii_filter import filter_pii_from_text + + return filter_pii_from_text(summary) + except ImportError: + 
logger.debug("PII filtering not available - skipping PII filtering for summary") + return summary + except Exception as e: + logger.warning(f"PII filtering failed for summary: {e}") + return summary + + def _truncate_summary( summary: Union[str, None], test_id: str, max_tokens: int = 100_000 ): @@ -101,12 +118,15 @@ def generate_description( else: summary = None + # Filter PII from summary before sending to LLM + filtered_summary = _filter_pii_from_summary(summary) + return generate_test_result_description( { "test_name": test_name, "test_description": test_description, "title": title, - "summary": _truncate_summary(summary, test_id), + "summary": _truncate_summary(filtered_summary, test_id), "figures": [ figure._get_b64_url() for figure in ([] if tables else figures) ], diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py index 4b41b62ee..b84c214d7 100644 --- a/validmind/vm_models/result/pii_filter.py +++ b/validmind/vm_models/result/pii_filter.py @@ -8,6 +8,7 @@ """ import os +from enum import Enum from typing import Dict, List, Optional, Union import pandas as pd @@ -16,10 +17,30 @@ logger = get_logger(__name__) -# Check if PII filtering is enabled via environment variable -PII_FILTERING_ENABLED = ( - os.getenv("VALIDMIND_PII_FILTERING_ENABLED", "false").lower() == "true" -) + +class PIIFilteringMode(Enum): + """Enum for PII filtering modes.""" + + DISABLED = "disabled" + TEST_RESULTS = "test_results" + TEST_DESCRIPTIONS = "test_descriptions" + ALL = "all" + + +def _get_pii_filtering_mode() -> PIIFilteringMode: + """Get the current PII filtering mode from environment variable.""" + mode_str = os.getenv("VALIDMIND_PII_FILTERING", "disabled").lower() + + try: + return PIIFilteringMode(mode_str) + except ValueError: + logger.warning( + f"Invalid PII filtering mode '{mode_str}'. " + f"Valid options: {', '.join([mode.value for mode in PIIFilteringMode])}. " + f"Defaulting to 'disabled'." 
+ ) + return PIIFilteringMode.DISABLED + # Lazy load presidio components to avoid import errors when not installed _analyzer = None @@ -42,9 +63,28 @@ def _get_presidio_analyzer(): return _analyzer if _analyzer is not False else None +def is_pii_filtering_enabled_for_test_results() -> bool: + """Check if PII filtering is enabled for test results and available.""" + mode = _get_pii_filtering_mode() + return ( + mode in [PIIFilteringMode.TEST_RESULTS, PIIFilteringMode.ALL] + and _get_presidio_analyzer() is not None + ) + + +def is_pii_filtering_enabled_for_test_descriptions() -> bool: + """Check if PII filtering is enabled for test descriptions and available.""" + mode = _get_pii_filtering_mode() + return ( + mode in [PIIFilteringMode.TEST_DESCRIPTIONS, PIIFilteringMode.ALL] + and _get_presidio_analyzer() is not None + ) + + def is_pii_filtering_enabled() -> bool: - """Check if PII filtering is enabled and available.""" - return PII_FILTERING_ENABLED and _get_presidio_analyzer() is not None + """Check if PII filtering is enabled for any mode and available.""" + mode = _get_pii_filtering_mode() + return mode != PIIFilteringMode.DISABLED and _get_presidio_analyzer() is not None def detect_pii_in_text( @@ -131,7 +171,7 @@ def scan_dataframe_for_pii( Returns: Dictionary mapping column names to lists of detected PII entities """ - if not is_pii_filtering_enabled(): + if not is_pii_filtering_enabled_for_test_results(): return {} pii_findings = {} @@ -183,7 +223,7 @@ def check_table_for_pii( Raises: ValueError: If PII is detected and raise_on_detection is True """ - if not is_pii_filtering_enabled(): + if not is_pii_filtering_enabled_for_test_results(): return # Convert to DataFrame if it's a list of dicts @@ -207,3 +247,51 @@ def check_table_for_pii( f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " f"Pass `unsafe=True` to bypass PII filtering." 
) + + +def filter_pii_from_text( + text: str, + entities: Optional[List[str]] = None, + language: str = "en", + threshold: float = 0.5, + mask_char: str = "*", +) -> str: + """ + Filter PII from text by replacing detected entities with mask characters. + + Args: + text: The text to filter + entities: List of entity types to detect and filter + language: Language code for analysis + threshold: Minimum confidence score for PII detection + mask_char: Character to use for masking PII + + Returns: + Text with PII entities masked + """ + if not is_pii_filtering_enabled_for_test_descriptions(): + return text + + pii_entities = detect_pii_in_text( + text=text, entities=entities, language=language, threshold=threshold + ) + + if not pii_entities: + return text + + # Sort entities by start position in reverse order to avoid index shifting + pii_entities.sort(key=lambda x: x["start"], reverse=True) + + filtered_text = text + for entity in pii_entities: + # Replace the PII text with mask characters + mask_length = entity["end"] - entity["start"] + mask = mask_char * mask_length + filtered_text = ( + filtered_text[: entity["start"]] + mask + filtered_text[entity["end"] :] + ) + + if pii_entities: + logger.info(f"Masked {len(pii_entities)} PII entities from text") + + return filtered_text From bf25d56133460c79eacb752b940205055d1cdbef Mon Sep 17 00:00:00 2001 From: John Walz Date: Thu, 7 Aug 2025 12:43:05 -0400 Subject: [PATCH 04/11] refactor: rename PII filtering to PII detection and update related functionality - Update README.md to change "PII filtering" to "PII detection" for clarity and consistency. - Modify run_e2e_notebooks.py to reflect the new environment variable for PII detection. - Refactor test descriptions to check for PII content instead of filtering it, raising exceptions when PII is detected. - Rename PII filtering-related functions and enums in pii_filter.py to align with the new terminology. 
- Ensure all references to PII handling are updated to use the new detection logic. --- README.md | 18 ++--- scripts/run_e2e_notebooks.py | 20 ++--- validmind/ai/test_descriptions.py | 25 +++--- validmind/vm_models/result/pii_filter.py | 98 +++++++++++------------- 4 files changed, 77 insertions(+), 84 deletions(-) diff --git a/README.md b/README.md index 21d3bd273..c11f7c2aa 100644 --- a/README.md +++ b/README.md @@ -73,20 +73,20 @@ The ValidMind Library includes optional PII detection capabilities using Microso pip install validmind[pii-detection] ``` -**Configure PII filtering:** +**Configure PII detection:** ```bash -# Enable PII filtering for test results only (default behavior) -export VALIDMIND_PII_FILTERING=test_results +# Enable PII detection for test results only +export VALIDMIND_PII_DETECTION=test_results -# Enable PII filtering for test descriptions only -export VALIDMIND_PII_FILTERING=test_descriptions +# Enable PII detection for test descriptions only +export VALIDMIND_PII_DETECTION=test_descriptions -# Enable PII filtering for both test results and descriptions -export VALIDMIND_PII_FILTERING=all +# Enable PII detection for both test results and descriptions +export VALIDMIND_PII_DETECTION=all -# Disable PII filtering (default) -export VALIDMIND_PII_FILTERING=disabled +# Disable PII detection (default) +export VALIDMIND_PII_DETECTION=disabled ``` ## How to contribute diff --git a/scripts/run_e2e_notebooks.py b/scripts/run_e2e_notebooks.py index 7d9cdab10..86e99bd4e 100644 --- a/scripts/run_e2e_notebooks.py +++ b/scripts/run_e2e_notebooks.py @@ -89,7 +89,7 @@ INIT_CELL_CODE = """ import os os.environ["VALIDMIND_LLM_DESCRIPTIONS_ENABLED"] = "0" -os.environ["VALIDMIND_PII_FILTERING"] = "{pii_filtering_mode}" +os.environ["VALIDMIND_PII_DETECTION"] = "{pii_detection_mode}" import validmind as vm vm.init( @@ -112,8 +112,8 @@ # Print the path to the site-packages directory logger.info("Site-packages path: " + str(site.getsitepackages()))
-# Print PII filtering configuration for debugging -logger.info("PII filtering mode: " + os.environ.get("VALIDMIND_PII_FILTERING", "disabled")) +# Print PII detection configuration for debugging +logger.info("PII detection mode: " + os.environ.get("VALIDMIND_PII_DETECTION", "disabled")) """ @@ -140,17 +140,17 @@ help="Run notebooks for purpose of updating model data templates in backend.", ) @click.option( - "--pii-filtering-mode", + "--pii-detection-mode", default="disabled", type=click.Choice(["disabled", "test_results", "test_descriptions", "all"]), - help="PII filtering mode to use for testing.", + help="PII detection mode to use for testing.", ) def main( kernel, log_output=False, progress_bar=True, update_data_template=False, - pii_filtering_mode="disabled", + pii_detection_mode="disabled", ): """Run notebooks from the specified directory for end-to-end testing.""" if update_data_template: @@ -168,9 +168,9 @@ def main( backup_notebook(notebook_path) try: - update_vm_init_cell(notebook_path, model, pii_filtering_mode) + update_vm_init_cell(notebook_path, model, pii_detection_mode) click.echo( - f"\n -------- Executing {notebook_path} (PII filtering: {pii_filtering_mode}) ---------- \n" + f"\n -------- Executing {notebook_path} (PII detection: {pii_detection_mode}) ---------- \n" ) run_notebook( notebook_path=notebook_path, @@ -210,7 +210,7 @@ def run_notebook(notebook_path, kernel_name, log_output=False, progress_bar=True os.remove(output_path) -def update_vm_init_cell(notebook_path, model, pii_filtering_mode="disabled"): +def update_vm_init_cell(notebook_path, model, pii_detection_mode="disabled"): api_host = os.getenv( "NOTEBOOK_RUNNER_API_HOST", "https://api.dev.vm.validmind.ai/api/v1/tracking" ) @@ -222,7 +222,7 @@ def update_vm_init_cell(notebook_path, model, pii_filtering_mode="disabled"): api_key=api_key, api_secret=api_secret, model=model, - pii_filtering_mode=pii_filtering_mode, + pii_detection_mode=pii_detection_mode, ) with open(notebook_path, "r") 
as f: diff --git a/validmind/ai/test_descriptions.py b/validmind/ai/test_descriptions.py index eb83f081b..8fe617192 100644 --- a/validmind/ai/test_descriptions.py +++ b/validmind/ai/test_descriptions.py @@ -37,21 +37,22 @@ def _get_llm_global_context(): return context if context_enabled and context else None -def _filter_pii_from_summary(summary: Union[str, None]) -> Union[str, None]: - """Filter PII from summary text before sending to LLM.""" +def _check_summary_for_pii(summary: Union[str, None]) -> None: + """Check summary text for PII content before sending to LLM.""" if summary is None: - return summary + return try: - from ..vm_models.result.pii_filter import filter_pii_from_text + from ..vm_models.result.pii_filter import check_text_for_pii - return filter_pii_from_text(summary) + check_text_for_pii(summary, raise_on_detection=True) except ImportError: - logger.debug("PII filtering not available - skipping PII filtering for summary") - return summary + logger.debug("PII detection not available - skipping PII check for summary") + except ValueError: + # Re-raise PII detection errors + raise except Exception as e: - logger.warning(f"PII filtering failed for summary: {e}") - return summary + logger.warning(f"PII detection failed for summary: {e}") def _truncate_summary( @@ -118,15 +119,15 @@ def generate_description( else: summary = None - # Filter PII from summary before sending to LLM - filtered_summary = _filter_pii_from_summary(summary) + # Check summary for PII before sending to LLM (will raise exception if PII found) + _check_summary_for_pii(summary) return generate_test_result_description( { "test_name": test_name, "test_description": test_description, "title": title, - "summary": _truncate_summary(filtered_summary, test_id), + "summary": _truncate_summary(summary, test_id), "figures": [ figure._get_b64_url() for figure in ([] if tables else figures) ], diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py index 
b84c214d7..5ce355ff4 100644 --- a/validmind/vm_models/result/pii_filter.py +++ b/validmind/vm_models/result/pii_filter.py @@ -18,8 +18,8 @@ logger = get_logger(__name__) -class PIIFilteringMode(Enum): - """Enum for PII filtering modes.""" +class PIIDetectionMode(Enum): + """Enum for PII detection modes.""" DISABLED = "disabled" TEST_RESULTS = "test_results" @@ -27,19 +27,19 @@ class PIIFilteringMode(Enum): ALL = "all" -def _get_pii_filtering_mode() -> PIIFilteringMode: - """Get the current PII filtering mode from environment variable.""" - mode_str = os.getenv("VALIDMIND_PII_FILTERING", "disabled").lower() +def _get_pii_detection_mode() -> PIIDetectionMode: + """Get the current PII detection mode from environment variable.""" + mode_str = os.getenv("VALIDMIND_PII_DETECTION", "disabled").lower() try: - return PIIFilteringMode(mode_str) + return PIIDetectionMode(mode_str) except ValueError: logger.warning( - f"Invalid PII filtering mode '{mode_str}'. " - f"Valid options: {', '.join([mode.value for mode in PIIFilteringMode])}. " + f"Invalid PII detection mode '{mode_str}'. " + f"Valid options: {', '.join([mode.value for mode in PIIDetectionMode])}. " f"Defaulting to 'disabled'." 
) - return PIIFilteringMode.DISABLED + return PIIDetectionMode.DISABLED # Lazy load presidio components to avoid import errors when not installed @@ -63,28 +63,28 @@ def _get_presidio_analyzer(): return _analyzer if _analyzer is not False else None -def is_pii_filtering_enabled_for_test_results() -> bool: - """Check if PII filtering is enabled for test results and available.""" - mode = _get_pii_filtering_mode() +def is_pii_detection_enabled_for_test_results() -> bool: + """Check if PII detection is enabled for test results and available.""" + mode = _get_pii_detection_mode() return ( - mode in [PIIFilteringMode.TEST_RESULTS, PIIFilteringMode.ALL] + mode in [PIIDetectionMode.TEST_RESULTS, PIIDetectionMode.ALL] and _get_presidio_analyzer() is not None ) -def is_pii_filtering_enabled_for_test_descriptions() -> bool: - """Check if PII filtering is enabled for test descriptions and available.""" - mode = _get_pii_filtering_mode() +def is_pii_detection_enabled_for_test_descriptions() -> bool: + """Check if PII detection is enabled for test descriptions and available.""" + mode = _get_pii_detection_mode() return ( - mode in [PIIFilteringMode.TEST_DESCRIPTIONS, PIIFilteringMode.ALL] + mode in [PIIDetectionMode.TEST_DESCRIPTIONS, PIIDetectionMode.ALL] and _get_presidio_analyzer() is not None ) -def is_pii_filtering_enabled() -> bool: - """Check if PII filtering is enabled for any mode and available.""" - mode = _get_pii_filtering_mode() - return mode != PIIFilteringMode.DISABLED and _get_presidio_analyzer() is not None +def is_pii_detection_enabled() -> bool: + """Check if PII detection is enabled for any mode and available.""" + mode = _get_pii_detection_mode() + return mode != PIIDetectionMode.DISABLED and _get_presidio_analyzer() is not None def detect_pii_in_text( @@ -171,7 +171,7 @@ def scan_dataframe_for_pii( Returns: Dictionary mapping column names to lists of detected PII entities """ - if not is_pii_filtering_enabled_for_test_results(): + if not 
is_pii_detection_enabled_for_test_results(): return {} pii_findings = {} @@ -210,7 +210,7 @@ def scan_dataframe_for_pii( def check_table_for_pii( table_data: Union[pd.DataFrame, List[Dict]], threshold: float = 0.5, - raise_on_detection: bool = False, + raise_on_detection: bool = True, ) -> None: """ Check a table (DataFrame or list of dicts) for PII content. @@ -218,12 +218,12 @@ def check_table_for_pii( Args: table_data: The table data to check threshold: Minimum confidence score for PII detection - raise_on_detection: If True, raises ValueError when PII is detected + raise_on_detection: If True, raises ValueError when PII is detected (default: True) Raises: ValueError: If PII is detected and raise_on_detection is True """ - if not is_pii_filtering_enabled_for_test_results(): + if not is_pii_detection_enabled_for_test_results(): return # Convert to DataFrame if it's a list of dicts @@ -245,53 +245,45 @@ def check_table_for_pii( raise ValueError( f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " - f"Pass `unsafe=True` to bypass PII filtering." + f"Pass `unsafe=True` to bypass PII detection." ) -def filter_pii_from_text( +def check_text_for_pii( text: str, entities: Optional[List[str]] = None, language: str = "en", threshold: float = 0.5, - mask_char: str = "*", -) -> str: + raise_on_detection: bool = True, +) -> List[Dict]: """ - Filter PII from text by replacing detected entities with mask characters. + Check text for PII content and optionally raise an exception. 
Args: - text: The text to filter - entities: List of entity types to detect and filter + text: The text to check for PII + entities: List of entity types to detect language: Language code for analysis threshold: Minimum confidence score for PII detection - mask_char: Character to use for masking PII + raise_on_detection: If True, raises ValueError when PII is detected (default: True) Returns: - Text with PII entities masked + List of detected PII entities + + Raises: + ValueError: If PII is detected and raise_on_detection is True """ - if not is_pii_filtering_enabled_for_test_descriptions(): - return text + if not is_pii_detection_enabled_for_test_descriptions(): + return [] pii_entities = detect_pii_in_text( text=text, entities=entities, language=language, threshold=threshold ) - if not pii_entities: - return text - - # Sort entities by start position in reverse order to avoid index shifting - pii_entities.sort(key=lambda x: x["start"], reverse=True) - - filtered_text = text - for entity in pii_entities: - # Replace the PII text with mask characters - mask_length = entity["end"] - entity["start"] - mask = mask_char * mask_length - filtered_text = ( - filtered_text[: entity["start"]] + mask + filtered_text[entity["end"] :] + if pii_entities and raise_on_detection: + entity_types = set(entity["entity_type"] for entity in pii_entities) + raise ValueError( + f"PII detected in text content. Entity types found: {', '.join(entity_types)}. " + f"Pass `unsafe=True` to bypass PII detection." ) - if pii_entities: - logger.info(f"Masked {len(pii_entities)} PII entities from text") - - return filtered_text + return pii_entities From 3b1d1634700cabcf6383bccd02769df91ca0bb8d Mon Sep 17 00:00:00 2001 From: John Walz Date: Tue, 19 Aug 2025 10:56:43 -0400 Subject: [PATCH 05/11] feat: enhance PII detection capabilities with structured analysis support - Introduce Presidio Structured for improved PII detection in structured data. 
- Update `check_table_for_pii` and related functions to utilize structured analysis when available. - Implement lazy loading for Presidio Structured components to ensure compatibility. - Modify `generate_description` and `TestResult` classes to include PII checks for tables and descriptions. - Update dependencies in `pyproject.toml` and `poetry.lock` to include `presidio-structured`. - Enhance error handling and logging for PII detection failures. --- poetry.lock | 221 ++++++++++++++++++++++- pyproject.toml | 4 +- validmind/ai/test_descriptions.py | 28 ++- validmind/api_client.py | 13 ++ validmind/vm_models/result/pii_filter.py | 203 ++++++++++++++++++--- validmind/vm_models/result/result.py | 41 ++++- 6 files changed, 470 insertions(+), 40 deletions(-) diff --git a/poetry.lock b/poetry.lock index 236ecdda3..afa9a7e61 100644 --- a/poetry.lock +++ b/poetry.lock @@ -442,6 +442,28 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""] +[[package]] +name = "azure-core" +version = "1.35.0" +description = "Microsoft Azure Core Library for Python" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" +files = [ + {file = "azure_core-1.35.0-py3-none-any.whl", hash = "sha256:8db78c72868a58f3de8991eb4d22c4d368fae226dac1002998d6c50437e7dad1"}, + {file = "azure_core-1.35.0.tar.gz", hash = 
"sha256:c0be528489485e9ede59b6971eb63c1eaacf83ef53001bfe3904e475e972be5c"}, +] + +[package.dependencies] +requests = ">=2.21.0" +six = ">=1.11.0" +typing-extensions = ">=4.6.0" + +[package.extras] +aio = ["aiohttp (>=3.0)"] +tracing = ["opentelemetry-api (>=1.26,<2.0)"] + [[package]] name = "babel" version = "2.17.0" @@ -1390,8 +1412,7 @@ version = "43.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" -groups = ["dev"] -markers = "python_version < \"3.10\" and sys_platform == \"linux\"" +groups = ["main", "dev"] files = [ {file = "cryptography-43.0.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf7a1932ac4176486eab36a19ed4c0492da5d97123f1406cf15e41b05e787d2e"}, {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63efa177ff54aec6e1c0aefaa1a241232dcd37413835a9b674b6e3f0ae2bfd3e"}, @@ -1421,6 +1442,7 @@ files = [ {file = "cryptography-43.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ce6fae5bdad59577b44e4dfed356944fbf1d925269114c28be377692643b4ff"}, {file = "cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805"}, ] +markers = {main = "python_version < \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")", dev = "python_version < \"3.10\" and sys_platform == \"linux\""} [package.dependencies] cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} @@ -5858,6 +5880,61 @@ server = ["flask (>=1.1)", "gunicorn"] stanza = ["stanza (>=1.10.1,<2.0.0)"] transformers = ["accelerate", "huggingface_hub", "spacy_huggingface_pipelines", "transformers"] +[[package]] +name = "presidio-anonymizer" +version = "2.2.357" +description = "Presidio Anonymizer package - replaces analyzed text with desired values." 
+optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" +files = [ + {file = "presidio_anonymizer-2.2.357-py3-none-any.whl", hash = "sha256:0b3e5e0526f5950bb9b27941e5b1b01b6761295d178a8ba4cedd2771aa2aee52"}, +] + +[package.dependencies] +azure-core = "*" +pycryptodome = ">=3.10.1" + +[package.extras] +server = ["flask (>=1.1)", "gunicorn"] + +[[package]] +name = "presidio-anonymizer" +version = "2.2.359" +description = "Presidio Anonymizer package - replaces analyzed text with desired values." +optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "python_version < \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" +files = [ + {file = "presidio_anonymizer-2.2.359-py3-none-any.whl", hash = "sha256:bc15a8fa4b6aa8ed1e01a1e3d05afd0bea2ab57f4c2e446c680e2662416b7ada"}, +] + +[package.dependencies] +cryptography = "<44.1" + +[package.extras] +server = ["flask (>=1.1)", "gunicorn"] + +[[package]] +name = "presidio-structured" +version = "0.0.6" +description = "Presidio structured package - analyzes and anonymizes structured and semi-structured data." 
+optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "extra == \"all\" or extra == \"pii-detection\"" +files = [ + {file = "presidio_structured-0.0.6-py3-none-any.whl", hash = "sha256:f3454c86857a00db9828e684895da43411bcc7d750cac0a52e15d68f6c6455a1"}, +] + +[package.dependencies] +pandas = ">=1.5.2" +presidio-analyzer = ">=2.2" +presidio-anonymizer = ">=2.2" +spacy = {version = "<3.8.4", markers = "python_version < \"3.10\""} + [[package]] name = "prometheus-client" version = "0.22.1" @@ -6339,6 +6416,58 @@ files = [ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +[[package]] +name = "pycryptodome" +version = "3.23.0" +description = "Cryptographic library for Python" +optional = true +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main"] +markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" +files = [ + {file = "pycryptodome-3.23.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a176b79c49af27d7f6c12e4b178b0824626f40a7b9fed08f712291b6d54bf566"}, + {file = "pycryptodome-3.23.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:573a0b3017e06f2cffd27d92ef22e46aa3be87a2d317a5abf7cc0e84e321bd75"}, + {file = "pycryptodome-3.23.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:63dad881b99ca653302b2c7191998dd677226222a3f2ea79999aa51ce695f720"}, + {file = "pycryptodome-3.23.0-cp27-cp27m-win32.whl", hash = "sha256:b34e8e11d97889df57166eda1e1ddd7676da5fcd4d71a0062a760e75060514b4"}, + {file = "pycryptodome-3.23.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:7ac1080a8da569bde76c0a104589c4f414b8ba296c0b3738cf39a466a9fb1818"}, + {file = "pycryptodome-3.23.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:6fe8258e2039eceb74dfec66b3672552b6b7d2c235b2dfecc05d16b8921649a8"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = 
"sha256:0011f7f00cdb74879142011f95133274741778abba114ceca229adbf8e62c3e4"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:90460fc9e088ce095f9ee8356722d4f10f86e5be06e2354230a9880b9c549aae"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4764e64b269fc83b00f682c47443c2e6e85b18273712b98aa43bcb77f8570477"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb8f24adb74984aa0e5d07a2368ad95276cf38051fe2dc6605cbcf482e04f2a7"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d97618c9c6684a97ef7637ba43bdf6663a2e2e77efe0f863cce97a76af396446"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9a53a4fe5cb075075d515797d6ce2f56772ea7e6a1e5e4b96cf78a14bac3d265"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:763d1d74f56f031788e5d307029caef067febf890cd1f8bf61183ae142f1a77b"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:954af0e2bd7cea83ce72243b14e4fb518b18f0c1649b576d114973e2073b273d"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-win32.whl", hash = "sha256:257bb3572c63ad8ba40b89f6fc9d63a2a628e9f9708d31ee26560925ebe0210a"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6501790c5b62a29fcb227bd6b62012181d886a767ce9ed03b303d1f22eb5c625"}, + {file = "pycryptodome-3.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:9a77627a330ab23ca43b48b130e202582e91cc69619947840ea4d2d1be21eb39"}, + {file = "pycryptodome-3.23.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:187058ab80b3281b1de11c2e6842a357a1f71b42cb1e15bce373f3d238135c27"}, + {file = "pycryptodome-3.23.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cfb5cd445280c5b0a4e6187a7ce8de5a07b5f3f897f235caa11f1f435f182843"}, + {file = 
"pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67bd81fcbe34f43ad9422ee8fd4843c8e7198dd88dd3d40e6de42ee65fbe1490"}, + {file = "pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575"}, + {file = "pycryptodome-3.23.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa0698f65e5b570426fc31b8162ed4603b0c2841cbb9088e2b01641e3065915b"}, + {file = "pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:53ecbafc2b55353edcebd64bf5da94a2a2cdf5090a6915bcca6eca6cc452585a"}, + {file = "pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:156df9667ad9f2ad26255926524e1c136d6664b741547deb0a86a9acf5ea631f"}, + {file = "pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:dea827b4d55ee390dc89b2afe5927d4308a8b538ae91d9c6f7a5090f397af1aa"}, + {file = "pycryptodome-3.23.0-cp37-abi3-win32.whl", hash = "sha256:507dbead45474b62b2bbe318eb1c4c8ee641077532067fec9c1aa82c31f84886"}, + {file = "pycryptodome-3.23.0-cp37-abi3-win_amd64.whl", hash = "sha256:c75b52aacc6c0c260f204cbdd834f76edc9fb0d8e0da9fbf8352ef58202564e2"}, + {file = "pycryptodome-3.23.0-cp37-abi3-win_arm64.whl", hash = "sha256:11eeeb6917903876f134b56ba11abe95c0b0fd5e3330def218083c7d98bbcb3c"}, + {file = "pycryptodome-3.23.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:350ebc1eba1da729b35ab7627a833a1a355ee4e852d8ba0447fafe7b14504d56"}, + {file = "pycryptodome-3.23.0-pp27-pypy_73-win32.whl", hash = "sha256:93837e379a3e5fd2bb00302a47aee9fdf7940d83595be3915752c74033d17ca7"}, + {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ddb95b49df036ddd264a0ad246d1be5b672000f12d6961ea2c267083a5e19379"}, + {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d8e95564beb8782abfd9e431c974e14563a794a4944c29d6d3b7b5ea042110b4"}, + {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e15c081e912c4b0d75632acd8382dfce45b258667aa3c67caf7a4d4c13f630"}, + {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7fc76bf273353dc7e5207d172b83f569540fc9a28d63171061c42e361d22353"}, + {file = "pycryptodome-3.23.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:45c69ad715ca1a94f778215a11e66b7ff989d792a4d63b68dc586a1da1392ff5"}, + {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:865d83c906b0fc6a59b510deceee656b6bc1c4fa0d82176e2b77e97a420a996a"}, + {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89d4d56153efc4d81defe8b65fd0821ef8b2d5ddf8ed19df31ba2f00872b8002"}, + {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3f2d0aaf8080bda0587d58fc9fe4766e012441e2eed4269a77de6aea981c8be"}, + {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64093fc334c1eccfd3933c134c4457c34eaca235eeae49d69449dc4728079339"}, + {file = "pycryptodome-3.23.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ce64e84a962b63a47a592690bdc16a7eaf709d2c2697ababf24a0def566899a6"}, + {file = "pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef"}, +] + [[package]] name = "pydantic" version = "2.11.7" @@ -7969,6 +8098,86 @@ files = [ {file = "soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a"}, ] +[[package]] +name = "spacy" +version = "3.8.3" +description = "Industrial-strength Natural Language Processing (NLP) in Python" +optional = true +python-versions = 
"<3.13,>=3.9" +groups = ["main"] +markers = "python_version < \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" +files = [ + {file = "spacy-3.8.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b530a5cbb077601d03bdd71bf1ded4de4b7fb0362b5443c5183c628cfa81ffdc"}, + {file = "spacy-3.8.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b28a5f7b77400ebf7e23aa24a82a2d35f97071cd5ef1ad0f859aa9b323fff59a"}, + {file = "spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcfd24a00da30ca53570f5b1c3535c1fa95b633f2a12b3d08395c9552ffb53c"}, + {file = "spacy-3.8.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e3630ea33608a6db8045fad7e0ba22f864c61ea351445488a89af1734e434a37"}, + {file = "spacy-3.8.3-cp310-cp310-win_amd64.whl", hash = "sha256:20839fa04cc2156ab613e40db54c25031304fdc1dd369930bc01c366586d0079"}, + {file = "spacy-3.8.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b16b8f9c544cdccd1bd23fc6bf6e2f1d667a1ee285a9b31bdb4a89e2d61345b4"}, + {file = "spacy-3.8.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f62e45a2259acc51cd8eb185f978848928f2f698ba174b283253485fb7691b04"}, + {file = "spacy-3.8.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57a267ea25dd8b7ec3e55accd1592d2d0847f0c6277a55145af5bb08e318bab4"}, + {file = "spacy-3.8.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45bc5fc8d399089607e3e759aee98362ffb007e39386531f195f42dcddcc94dc"}, + {file = "spacy-3.8.3-cp311-cp311-win_amd64.whl", hash = "sha256:9e348359d54418a5752305975f1268013135255bd656a783aa3397b3bd4dd5e9"}, + {file = "spacy-3.8.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b01e50086515fa6d43275be11a762a3a3285d9aabbe27b4f3b98a08083f1d2a1"}, + {file = "spacy-3.8.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:187f9732362d0dc52b16c80e67decf58ff91605e34b251c50c7dc5212082fcb4"}, + {file = "spacy-3.8.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d7517bc969bca924cbdba4e14e0ce16e66d32967468ad27490e95c9b4d8d8aa8"}, + {file = "spacy-3.8.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:460948437c5571367105554b1e67549f957ba8dd6ee7e1594e719f9a88c398bb"}, + {file = "spacy-3.8.3-cp312-cp312-win_amd64.whl", hash = "sha256:1f14d4e2b1e6ab144ee546236f2c32b255f91f24939e62436c3a9c2ee200c6d1"}, + {file = "spacy-3.8.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f6020603633ec47374af71e936671d5992d68e592661dffac940f5596d77696"}, + {file = "spacy-3.8.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:72b492651534460bf4fe842f7efa462887f9e215de86146b862df6238b952650"}, + {file = "spacy-3.8.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a630119aaa7a6180635eb8f21b27509654882847480c8423a657582b4a9bdd3"}, + {file = "spacy-3.8.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8563ba9cbb71a629c7dc8c2db98f0348416dc0f0927de0e9ed8b448f707b5248"}, + {file = "spacy-3.8.3-cp39-cp39-win_amd64.whl", hash = "sha256:608beca075f7611083e93c91625d7e6c5885e2672cb5ec1b9f274cab6c82c816"}, + {file = "spacy-3.8.3.tar.gz", hash = "sha256:81a967dc3d6a5a0a9ab250559483fe2092306582a9192f98be7a63bdce2797f7"}, +] + +[package.dependencies] +catalogue = ">=2.0.6,<2.1.0" +cymem = ">=2.0.2,<2.1.0" +jinja2 = "*" +langcodes = ">=3.2.0,<4.0.0" +murmurhash = ">=0.28.0,<1.1.0" +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +setuptools = "*" +spacy-legacy = ">=3.0.11,<3.1.0" +spacy-loggers = ">=1.0.0,<2.0.0" +srsly = ">=2.4.3,<3.0.0" +thinc = ">=8.3.0,<8.4.0" +tqdm = ">=4.38.0,<5.0.0" +typer = ">=0.3.0,<1.0.0" +wasabi = ">=0.9.1,<1.2.0" +weasel = ">=0.1.0,<0.5.0" + +[package.extras] +apple = ["thinc-apple-ops (>=1.0.0,<2.0.0)"] +cuda = ["cupy (>=5.0.0b4,<13.0.0)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] +cuda100 = ["cupy-cuda100 
(>=5.0.0b4,<13.0.0)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] +cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] +ja = ["sudachidict_core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] +ko = ["natto-py (>=0.9.0)"] +lookups = ["spacy_lookups_data (>=1.0.3,<1.1.0)"] +th = ["pythainlp (>=2.0)"] +transformers = ["spacy_transformers (>=1.1.2,<1.4.0)"] + [[package]] name = "spacy" version = "3.8.7" @@ -7976,7 +8185,7 @@ description = "Industrial-strength Natural Language Processing (NLP) in Python" optional = true python-versions = "<3.14,>=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"pii-detection\"" +markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"pii-detection\")" files = [ {file = "spacy-3.8.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6ec0368ce96cd775fb14906f04b771c912ea8393ba30f8b35f9c4dc47a420b8e"}, {file = "spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5672f8a0fe7a3847e925544890be60015fbf48a60a838803425f82e849dd4f18"}, @@ -9988,13 +10197,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["langchain-openai", "presidio-analyzer", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] +all = ["langchain-openai", "presidio-analyzer", 
"presidio-structured", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] huggingface = ["sentencepiece", "transformers"] llm = ["langchain-openai", "pycocoevalcap", "ragas", "sentencepiece", "torch", "transformers"] -pii-detection = ["presidio-analyzer"] +pii-detection = ["presidio-analyzer", "presidio-structured"] pytorch = ["torch"] [metadata] lock-version = "2.1" python-versions = ">=3.9.0,<3.12" -content-hash = "e19fe7e2074a245488f0dd442bb8744cede74b8e784ff37abc0e58a78d6366f7" +content-hash = "c0172be27de8ceadd00c26ba81f59d37c34deee382ebaf5b4bfebec15f911e4b" diff --git a/pyproject.toml b/pyproject.toml index bdb62f069..604821176 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ transformers = { version = "^4.32.0", optional = true } xgboost = ">=1.5.2,<3" yfinance = "^0.2.48" presidio-analyzer = { version = "^2.2.0", optional = true } +presidio-structured = { version = "*", optional = true } [tool.poetry.group.dev.dependencies] black = "^22.1.0" @@ -88,6 +89,7 @@ all = [ "sentencepiece", "langchain-openai", "presidio-analyzer", + "presidio-structured", ] huggingface = ["transformers", "sentencepiece"] llm = [ @@ -99,7 +101,7 @@ llm = [ "langchain-openai", ] pytorch = ["torch"] -pii-detection = ["presidio-analyzer"] +pii-detection = ["presidio-analyzer", "presidio-structured"] [build-system] build-backend = "poetry.core.masonry.api" diff --git a/validmind/ai/test_descriptions.py b/validmind/ai/test_descriptions.py index 8fe617192..4fb8182bc 100644 --- a/validmind/ai/test_descriptions.py +++ b/validmind/ai/test_descriptions.py @@ -37,22 +37,30 @@ def _get_llm_global_context(): return context if context_enabled and context else None -def _check_summary_for_pii(summary: Union[str, None]) -> None: - """Check summary text for PII content before sending to LLM.""" - if summary is None: +def _check_tables_for_pii(tables: Union[List[ResultTable], None]) -> None: + """Check structured tables for PII before converting them to text. 
+ + Uses Presidio Structured through `check_table_for_pii` to scan the + underlying DataFrame of each `ResultTable`. + """ + if not tables: return try: - from ..vm_models.result.pii_filter import check_text_for_pii + from ..vm_models.result.pii_filter import check_table_for_pii - check_text_for_pii(summary, raise_on_detection=True) + for table in tables: + # Use the exact structure that goes into the summary (list of dicts) + serialized = table.serialize() + table_rows = serialized.get("data", []) + check_table_for_pii(table_data=table_rows, raise_on_detection=True) except ImportError: - logger.debug("PII detection not available - skipping PII check for summary") + logger.debug("PII detection not available - skipping PII check for tables") except ValueError: # Re-raise PII detection errors raise except Exception as e: - logger.warning(f"PII detection failed for summary: {e}") + logger.warning(f"PII detection failed for tables: {e}") def _truncate_summary( @@ -110,6 +118,9 @@ def generate_description( ) if tables: + # Check structured tables for PII before converting them to text + _check_tables_for_pii(tables) + summary = "\n---\n".join( [ json.dumps(table.serialize(), cls=NumpyEncoder, separators=(",", ":")) @@ -119,9 +130,6 @@ def generate_description( else: summary = None - # Check summary for PII before sending to LLM (will raise exception if PII found) - _check_summary_for_pii(summary) - return generate_test_result_description( { "test_name": test_name, diff --git a/validmind/api_client.py b/validmind/api_client.py index 7bee2290b..bb32969c3 100644 --- a/validmind/api_client.py +++ b/validmind/api_client.py @@ -430,6 +430,19 @@ def log_text( if not text or not isinstance(text, str): raise ValueError("`text` must be a non-empty string") + # PII detection for free-form text prior to logging + try: + from .vm_models.result.pii_filter import check_text_for_pii + + check_text_for_pii(text, raise_on_detection=True) + except ImportError: + logger.debug("PII detection 
not available - skipping PII check for text") + except ValueError: + # Re-raise PII detection errors + raise + except Exception as e: + logger.warning(f"PII detection failed for text: {e}") + if not is_html(text): text = md_to_html(text, mathml=True) diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py index 5ce355ff4..a54879df9 100644 --- a/validmind/vm_models/result/pii_filter.py +++ b/validmind/vm_models/result/pii_filter.py @@ -9,7 +9,7 @@ import os from enum import Enum -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import pandas as pd @@ -63,12 +63,28 @@ def _get_presidio_analyzer(): return _analyzer if _analyzer is not False else None +def _get_presidio_structured_builder(): + """Lazy load Presidio Structured PandasAnalysisBuilder. + + Returns None if not available. + """ + try: + from presidio_structured import PandasAnalysisBuilder # type: ignore + + return PandasAnalysisBuilder + except ImportError: + logger.warning( + "Presidio Structured not available. 
Install with: pip install validmind[pii-detection]" + ) + return None + + def is_pii_detection_enabled_for_test_results() -> bool: """Check if PII detection is enabled for test results and available.""" mode = _get_pii_detection_mode() - return ( - mode in [PIIDetectionMode.TEST_RESULTS, PIIDetectionMode.ALL] - and _get_presidio_analyzer() is not None + return mode in [PIIDetectionMode.TEST_RESULTS, PIIDetectionMode.ALL] and ( + _get_presidio_structured_builder() is not None + or _get_presidio_analyzer() is not None ) @@ -84,7 +100,14 @@ def is_pii_detection_enabled_for_test_descriptions() -> bool: def is_pii_detection_enabled() -> bool: """Check if PII detection is enabled for any mode and available.""" mode = _get_pii_detection_mode() - return mode != PIIDetectionMode.DISABLED and _get_presidio_analyzer() is not None + if mode == PIIDetectionMode.DISABLED: + return False + + # Either text analyzer (for descriptions) or structured (for tables) should be available + return ( + _get_presidio_analyzer() is not None + or _get_presidio_structured_builder() is not None + ) def detect_pii_in_text( @@ -162,19 +185,28 @@ def scan_dataframe_for_pii( """ Scan a pandas DataFrame for PII content in text columns. + This implementation uses Microsoft Presidio Structured to analyze tabular data + and determine which columns contain PII entities. It returns a mapping of + column names to detected entity metadata. Unlike token-level detection, the + structured analysis reports at the column level. + Args: df: The DataFrame to scan columns: List of column names to scan (if None, scans all string columns) - threshold: Minimum confidence score for PII detection + threshold: Minimum confidence score for PII detection (not used directly by structured analysis) sample_size: Maximum number of rows to sample for PII detection Returns: - Dictionary mapping column names to lists of detected PII entities + Dictionary mapping column names to lists of detected PII entities. 
Each list + contains a single dict with at least the 'entity_type' key. """ - if not is_pii_detection_enabled_for_test_results(): + if not ( + is_pii_detection_enabled_for_test_results() + or is_pii_detection_enabled_for_test_descriptions() + ): return {} - pii_findings = {} + builder_cls = _get_presidio_structured_builder() # Determine which columns to scan if columns is None: @@ -184,14 +216,52 @@ def scan_dataframe_for_pii( # Limit the number of rows to scan for performance sample_df = df.head(sample_size) if len(df) > sample_size else df - for column in columns: - column_pii = [] + # Prefer Presidio Structured if available + if builder_cls is not None: + try: + builder = builder_cls() + # Use mixed strategy and map our threshold parameter to the mixed threshold + tabular_analysis = builder.generate_analysis( + sample_df, + selection_strategy="mixed", + mixed_strategy_threshold=threshold, + ) - # Scan non-null string values in the column + # The analysis exposes an entity mapping of column -> entity type + entity_mapping: Dict[str, str] = getattr( + tabular_analysis, "entity_mapping", {} + ) + + pii_findings: Dict[str, List[Dict]] = {} + for column in columns: + if column in entity_mapping and entity_mapping[column]: + entity_type = entity_mapping[column] + pii_findings[column] = [ + { + "entity_type": entity_type, + "column": column, + } + ] + logger.info( + f"Detected PII entity '{entity_type}' in column '{column}'" + ) + + return pii_findings + except Exception as e: + logger.warning(f"PII structured analysis failed: {e}") + # fall back to token-level analyzer below + + # Fallback: use token-level Presidio Analyzer on sampled rows if available + analyzer_available = _get_presidio_analyzer() is not None + if not analyzer_available: + return {} + + pii_findings = {} + for column in columns: + column_pii: List[Dict] = [] for idx, value in sample_df[column].dropna().items(): if isinstance(value, str) and len(value.strip()) > 0: pii_entities = 
detect_pii_in_text(text=str(value), threshold=threshold) - if pii_entities: column_pii.extend( [ @@ -199,7 +269,6 @@ def scan_dataframe_for_pii( for entity in pii_entities ] ) - if column_pii: pii_findings[column] = column_pii logger.info(f"Found {len(column_pii)} PII entities in column '{column}'") @@ -207,8 +276,46 @@ def scan_dataframe_for_pii( return pii_findings +def _coerce_to_dataframe(table_like: Any) -> Optional[pd.DataFrame]: + """Best-effort conversion of supported inputs into a DataFrame. + + Supports: + - pandas.DataFrame + - list[dict] + - objects with a `.data` attribute containing a DataFrame or list[dict] + - objects with `.serialize()` returning {"data": list[dict]} + """ + if table_like is None: + return None + + if isinstance(table_like, pd.DataFrame): + return table_like + + if isinstance(table_like, list): + return pd.DataFrame(table_like) if table_like else pd.DataFrame() + + data_attr = getattr(table_like, "data", None) + if data_attr is not None: + if isinstance(data_attr, pd.DataFrame): + return data_attr + if isinstance(data_attr, list): + return pd.DataFrame(data_attr) + + serialize_fn = getattr(table_like, "serialize", None) + if callable(serialize_fn): + try: + serialized = serialize_fn() + records = serialized.get("data") if isinstance(serialized, dict) else None + if isinstance(records, list): + return pd.DataFrame(records) + except Exception: + pass + + return None + + def check_table_for_pii( - table_data: Union[pd.DataFrame, List[Dict]], + table_data: Union[pd.DataFrame, List[Dict], Any], threshold: float = 0.5, raise_on_detection: bool = True, ) -> None: @@ -223,7 +330,42 @@ def check_table_for_pii( Raises: ValueError: If PII is detected and raise_on_detection is True """ - if not is_pii_detection_enabled_for_test_results(): + if not ( + is_pii_detection_enabled_for_test_results() + or is_pii_detection_enabled_for_test_descriptions() + ): + return + + df = _coerce_to_dataframe(table_data) + if df is None or df.empty: + return + + 
# Scan for PII + pii_findings = scan_dataframe_for_pii(df, threshold=threshold) + has_pii = bool(pii_findings) + + if has_pii and raise_on_detection: + entity_types = set() + for findings in pii_findings.values(): + entity_types.update(entity["entity_type"] for entity in findings) + + raise ValueError( + f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " + f"Pass `unsafe=True` to bypass PII detection." + ) + + +def check_table_for_pii_in_descriptions( + table_data: Union[pd.DataFrame, List[Dict]], + threshold: float = 0.5, + raise_on_detection: bool = True, +) -> None: + """Check a table for PII when used in description generation. + + Enabled under the "test_descriptions" or "all" modes. Uses Presidio Structured + directly to analyze the DataFrame, independent of the test_results gating. + """ + if not is_pii_detection_enabled_for_test_descriptions(): return # Convert to DataFrame if it's a list of dicts @@ -234,18 +376,35 @@ def check_table_for_pii( else: df = table_data - # Scan for PII - pii_findings = scan_dataframe_for_pii(df, threshold=threshold) - has_pii = bool(pii_findings) + builder_cls = _get_presidio_structured_builder() + if builder_cls is None: + # If Structured is not available, try token-level analyzer as a fallback + pii_findings = scan_dataframe_for_pii(df, threshold=threshold) + else: + try: + builder = builder_cls() + tabular_analysis = builder.generate_analysis( + df, selection_strategy="mixed", mixed_strategy_threshold=threshold + ) + entity_mapping: Dict[str, str] = getattr( + tabular_analysis, "entity_mapping", {} + ) + pii_findings = { + col: [{"entity_type": ent}] + for col, ent in entity_mapping.items() + if ent + } + except Exception as e: + logger.warning(f"PII structured analysis (descriptions) failed: {e}") + pii_findings = {} - if has_pii and raise_on_detection: + if pii_findings and raise_on_detection: entity_types = set() for findings in pii_findings.values(): entity_types.update(entity["entity_type"] 
for entity in findings) raise ValueError( - f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " - f"Pass `unsafe=True` to bypass PII detection." + f"PII detected in table data for description. Entity types found: {', '.join(entity_types)}." ) diff --git a/validmind/vm_models/result/result.py b/validmind/vm_models/result/result.py index ee39f9647..20424cd04 100644 --- a/validmind/vm_models/result/result.py +++ b/validmind/vm_models/result/result.py @@ -532,7 +532,30 @@ def log( if not unsafe: for table in self.tables or []: - check_for_sensitive_data(table.data) + # Robust table PII check that accepts ResultTable directly + try: + from .pii_filter import check_table_for_pii + + check_table_for_pii(table) + except Exception: + # Fall back to prior behavior if new helper fails unexpectedly + check_for_sensitive_data(table.data) + + # Check description text for PII when available + if self.description: + try: + from .pii_filter import check_text_for_pii + + check_text_for_pii(self.description, raise_on_detection=True) + except ImportError: + logger.debug( + "PII detection not available - skipping PII check for description" + ) + except ValueError: + # Re-raise PII detection errors + raise + except Exception as e: + logger.warning(f"PII detection failed for description: {e}") if section_id: self._validate_section_id_for_block(section_id, position) @@ -679,6 +702,22 @@ def log( position (int): The position (index) within the section to insert the test result. 
""" + # Check description text for PII when available + if self.description: + try: + from .pii_filter import check_text_for_pii + + check_text_for_pii(self.description, raise_on_detection=True) + except ImportError: + logger.debug( + "PII detection not available - skipping PII check for description" + ) + except ValueError: + # Re-raise PII detection errors + raise + except Exception as e: + logger.warning(f"PII detection failed for description: {e}") + run_async( self.log_async, content_id=content_id, From a044945b146b79c7f185d6771f1d90c939386e14 Mon Sep 17 00:00:00 2001 From: John Walz Date: Tue, 19 Aug 2025 10:59:02 -0400 Subject: [PATCH 06/11] chore: add noqa comments for complexity warnings in PII detection functions --- validmind/vm_models/result/pii_filter.py | 4 ++-- validmind/vm_models/result/result.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py index a54879df9..016e946ce 100644 --- a/validmind/vm_models/result/pii_filter.py +++ b/validmind/vm_models/result/pii_filter.py @@ -176,7 +176,7 @@ def detect_pii_in_text( return [] -def scan_dataframe_for_pii( +def scan_dataframe_for_pii( # noqa: C901 df: pd.DataFrame, columns: Optional[List[str]] = None, threshold: float = 0.5, @@ -276,7 +276,7 @@ def scan_dataframe_for_pii( return pii_findings -def _coerce_to_dataframe(table_like: Any) -> Optional[pd.DataFrame]: +def _coerce_to_dataframe(table_like: Any) -> Optional[pd.DataFrame]: # noqa: C901 """Best-effort conversion of supported inputs into a DataFrame. 
Supports: diff --git a/validmind/vm_models/result/result.py b/validmind/vm_models/result/result.py index 20424cd04..92960d22e 100644 --- a/validmind/vm_models/result/result.py +++ b/validmind/vm_models/result/result.py @@ -499,7 +499,7 @@ async def log_async( return await asyncio.gather(*tasks) - def log( + def log( # noqa: C901 self, section_id: str = None, content_id: str = None, From 2a0ea746d2da2872c76cc6fe7eb4b4cbc835273e Mon Sep 17 00:00:00 2001 From: John Walz Date: Tue, 19 Aug 2025 11:26:37 -0400 Subject: [PATCH 07/11] feat: add notebook for documenting pii detection --- README.md | 2 +- .../how_to/quickstart_pii_detection.ipynb | 199 ++++++++++++++++++ 2 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 notebooks/how_to/quickstart_pii_detection.ipynb diff --git a/README.md b/README.md index c11f7c2aa..ebf0946af 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ pip install validmind[pii-detection] **Configure PII detection:** ```bash -# Enable PII detection for test results only (default behavior) +# Enable PII detection for test results only export VALIDMIND_PII_DETECTION=test_results # Enable PII detection for test descriptions only diff --git a/notebooks/how_to/quickstart_pii_detection.ipynb b/notebooks/how_to/quickstart_pii_detection.ipynb new file mode 100644 index 000000000..6cc08cc02 --- /dev/null +++ b/notebooks/how_to/quickstart_pii_detection.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PII Detection Modes with a Custom Test\n", + "\n", + "This notebook shows how to initialize ValidMind, implement a custom test that emits PII, and observe behavior differences under each `VALIDMIND_PII_DETECTION` mode when running the test with `validmind.tests.run_test`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "- `validmind` installed with PII extras:\n", + "\n", + "```bash\n", + "%pip install -q 
validmind[pii-detection]\n", + "```\n", + "\n", + "- A ValidMind model registered. We'll initialize the library using your model snippet.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q validmind[pii-detection]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize ValidMind\n", + "\n", + "Initialize using your model code snippet or a `.env` file, as shown in other quickstarts.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load your model identifier credentials from an `.env` file\n", + "%load_ext dotenv\n", + "%dotenv .env\n", + "\n", + "# Or initialize with your code snippet\n", + "import validmind as vm\n", + "\n", + "vm.init(\n", + " # api_host=\"...\",\n", + " # api_key=\"...\",\n", + " # api_secret=\"...\",\n", + " # model=\"...\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a custom test that emits PII\n", + "\n", + "We'll create a custom test that returns:\n", + "- A description string containing PII (name, email, phone)\n", + "- A small table containing PII in columns\n", + "\n", + "This mirrors the structure used in other custom test notebooks and will exercise both table and description PII detection paths.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from validmind import test\n", + "\n", + "@test(\"my_pii_demo.PIIEmittingTest\")\n", + "def pii_emitting_test():\n", + " \"\"\"A demo test that returns both a PII-bearing description and a PII-bearing table.\"\"\"\n", + " description = (\n", + " \"Primary contact: John Doe (john.doe@example.com), phone +1-415-555-1234.\"\n", + " )\n", + " table = pd.DataFrame(\n", + " {\n", + " \"name\": [\"Jane Smith\"],\n", + " \"email\": [\"jane.smith@bank.example\"],\n", 
+ " \"phone\": [\"(212) 555-9876\"],\n", + " }\n", + " )\n", + " # Return order: (description, table)\n", + " return description, table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run the test under different PII detection modes\n", + "\n", + "We'll switch `VALIDMIND_PII_DETECTION` across modes and run the same test with `validmind.tests.run_test`. We catch exceptions to observe blocking behavior.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from validmind.tests import run_test\n", + "\n", + "MODES = [\"disabled\", \"test_results\", \"test_descriptions\", \"all\"]\n", + "\n", + "for mode in MODES:\n", + " print(\"\\n=== Mode:\", mode, \"===\")\n", + " os.environ[\"VALIDMIND_PII_DETECTION\"] = mode\n", + " try:\n", + " result = run_test(\"my_pii_demo.PIIEmittingTest\")\n", + "\n", + " # check if the description was generated\n", + " if not result._was_description_generated:\n", + " print(\"Blocked: Test Description Generation was not run due to PII\")\n", + "\n", + " # Try logging (this triggers PII checks before upload)\n", + " result.log()\n", + " print(\"Run + log succeeded\")\n", + " except Exception as e:\n", + " print(\"Blocked:\", type(e).__name__, str(e)[:200])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Expected behavior by mode\n", + "\n", + "- disabled: No PII checks; test runs and logs.\n", + "- test_results: Table PII triggers blocking; description may still proceed.\n", + "- test_descriptions: Description PII triggers blocking; tables may still proceed.\n", + "- all: Both table and description checks are enforced.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notes\n", + "\n", + "- If you see warnings that Presidio is unavailable, ensure you installed extras: `validmind[pii-detection]`.\n", + "- You can override blocking by passing `unsafe=True` to 
`result.log(unsafe=True)`, but this is not recommended outside controlled workflows.\n", + "- To test only a subset (tables or descriptions), adjust the test to emit only that type and re-run the mode loop.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Troubleshooting\n", + "\n", + "- If you see warnings like \"Presidio analyzer not available\", install the extras: `pip install validmind[pii-detection]`.\n", + "- If structured detection is unavailable, the library falls back to token-level text scans when possible.\n", + "- Ensure your environment is restarted after installing new packages if imports fail.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "validmind-BbKYUwN1-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 77ec96e8610b3b71432740eb050378997972fa7f Mon Sep 17 00:00:00 2001 From: John Walz Date: Wed, 20 Aug 2025 11:11:47 -0400 Subject: [PATCH 08/11] feat: rename notebook --- ...ickstart_pii_detection.ipynb => configure_pii_detection.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename notebooks/how_to/{quickstart_pii_detection.ipynb => configure_pii_detection.ipynb} (100%) diff --git a/notebooks/how_to/quickstart_pii_detection.ipynb b/notebooks/how_to/configure_pii_detection.ipynb similarity index 100% rename from notebooks/how_to/quickstart_pii_detection.ipynb rename to notebooks/how_to/configure_pii_detection.ipynb From e85c7db4ee86a7e62a2d09df19efeddf34f40fcd Mon Sep 17 00:00:00 2001 From: John Walz Date: Wed, 20 Aug 2025 12:10:56 -0400 Subject: [PATCH 09/11] chore: fix broken integration tests --- .github/workflows/integration.yaml | 12 +++--------- 1 file changed, 3 
insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml index 303a10f92..e2f800d2d 100644 --- a/.github/workflows/integration.yaml +++ b/.github/workflows/integration.yaml @@ -50,12 +50,6 @@ jobs: - name: Remove Build Environment run: rm -rf .venv - - name: Setup Virtual Environment - run: python -m venv sdist-venv - - - name: Install Built Package - run: sdist-venv/bin/pip install --no-cache-dir "$(ls dist/validmind*.whl | head -n 1)[llm,huggingface]" - - name: 'Setup Virtual Environment for [all]' run: python -m venv all-venv @@ -64,13 +58,13 @@ jobs: run: all-venv/bin/pip install --no-cache-dir "$(ls dist/validmind*.whl | head -n 1)[all]" - name: Install Additional Dependencies - run: sdist-venv/bin/pip install nbformat papermill jupyter + run: all-venv/bin/pip install nbformat papermill jupyter - name: Create Jupyter Kernel - run: sdist-venv/bin/python -m ipykernel install --user --name sdist-venv + run: all-venv/bin/python -m ipykernel install --user --name all-venv - name: Integration Tests - run: sdist-venv/bin/python scripts/run_e2e_notebooks.py --kernel sdist-venv + run: all-venv/bin/python scripts/run_e2e_notebooks.py --kernel all-venv env: NOTEBOOK_RUNNER_DEFAULT_MODEL: ${{ secrets.NOTEBOOK_RUNNER_DEFAULT_PROJECT_ID }} NOTEBOOK_RUNNER_API_KEY: ${{ secrets.NOTEBOOK_RUNNER_API_KEY }} From 42e772dd1e7af7f54464206399e217b1cdddeff2 Mon Sep 17 00:00:00 2001 From: John Walz Date: Thu, 21 Aug 2025 11:29:55 -0400 Subject: [PATCH 10/11] feat: fixing pii detection --- .../how_to/configure_pii_detection.ipynb | 56 +- tests/test_results.py | 81 ++- validmind/ai/test_descriptions.py | 58 +- validmind/ai/utils.py | 4 +- validmind/api_client.py | 13 - validmind/vm_models/result/pii_filter.py | 494 +++++------------- validmind/vm_models/result/result.py | 37 +- validmind/vm_models/result/utils.py | 19 - 8 files changed, 220 insertions(+), 542 deletions(-) diff --git 
a/notebooks/how_to/configure_pii_detection.ipynb b/notebooks/how_to/configure_pii_detection.ipynb index 6cc08cc02..e9461279b 100644 --- a/notebooks/how_to/configure_pii_detection.ipynb +++ b/notebooks/how_to/configure_pii_detection.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -q validmind[pii-detection]\n" + "%pip install -q \"validmind[pii-detection]\"" ] }, { @@ -88,19 +88,18 @@ "\n", "@test(\"my_pii_demo.PIIEmittingTest\")\n", "def pii_emitting_test():\n", - " \"\"\"A demo test that returns both a PII-bearing description and a PII-bearing table.\"\"\"\n", - " description = (\n", - " \"Primary contact: John Doe (john.doe@example.com), phone +1-415-555-1234.\"\n", - " )\n", - " table = pd.DataFrame(\n", + " \"\"\"A demo test that returns PII\"\"\"\n", + " return pd.DataFrame(\n", " {\n", - " \"name\": [\"Jane Smith\"],\n", - " \"email\": [\"jane.smith@bank.example\"],\n", - " \"phone\": [\"(212) 555-9876\"],\n", + " \"name\": [\"Jane Smith\", \"John Doe\", \"Alice Johnson\"],\n", + " \"email\": [\n", + " \"jane.smith@bank.example\",\n", + " \"john.doe@company.example\",\n", + " \"alice.johnson@service.example\",\n", + " ],\n", + " \"phone\": [\"(212) 555-9876\", \"(415) 555-1234\", \"(646) 555-5678\"],\n", " }\n", - " )\n", - " # Return order: (description, table)\n", - " return description, table" + " )" ] }, { @@ -132,12 +131,15 @@ " # check if the description was generated\n", " if not result._was_description_generated:\n", " print(\"Blocked: Test Description Generation was not run due to PII\")\n", + " else:\n", + " print(\"Description was generated by LLM\")\n", "\n", " # Try logging (this triggers PII checks before upload)\n", " result.log()\n", - " print(\"Run + log succeeded\")\n", + " print(\"Logging to API succeeded\")\n", " except Exception as e:\n", - " print(\"Blocked:\", type(e).__name__, str(e)[:200])" + " print(\"Blocked: Test Result was not logged due to PII\")\n", + " # print(e)" ] }, { @@ -146,10 +148,10 @@ 
"source": [ "### Expected behavior by mode\n", "\n", - "- disabled: No PII checks; test runs and logs.\n", - "- test_results: Table PII triggers blocking; description may still proceed.\n", - "- test_descriptions: Description PII triggers blocking; tables may still proceed.\n", - "- all: Both table and description checks are enforced.\n" + "- disabled: No PII checks.\n", + "- test_results: Description is generated but result is not logged.\n", + "- test_descriptions: Description generation is blocked but result is logged.\n", + "- all: Description generation and logging are both blocked.\n" ] }, { @@ -159,25 +161,13 @@ "## Notes\n", "\n", "- If you see warnings that Presidio is unavailable, ensure you installed extras: `validmind[pii-detection]`.\n", - "- You can override blocking by passing `unsafe=True` to `result.log(unsafe=True)`, but this is not recommended outside controlled workflows.\n", - "- To test only a subset (tables or descriptions), adjust the test to emit only that type and re-run the mode loop.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Troubleshooting\n", - "\n", - "- If you see warnings like \"Presidio analyzer not available\", install the extras: `pip install validmind[pii-detection]`.\n", - "- If structured detection is unavailable, the library falls back to token-level text scans when possible.\n", - "- Ensure your environment is restarted after installing new packages if imports fail.\n" + "- You can override blocking by passing `unsafe=True` to `result.log(unsafe=True)`, but this is not recommended outside controlled workflows.\n" ] } ], "metadata": { "kernelspec": { - "display_name": "validmind-BbKYUwN1-py3.11", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -191,7 +181,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/tests/test_results.py b/tests/test_results.py 
index 9c7c289d2..afa1e7dea 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -17,10 +17,10 @@ ) from validmind.vm_models.figure import Figure from validmind.errors import InvalidParameterError -from validmind.ai.utils import DescriptionFuture loop = asyncio.new_event_loop() + class MockAsyncResponse: def __init__(self, status, text=None, json=None): self.status = status @@ -40,6 +40,7 @@ async def __aexit__(self, exc_type, exc, tb): async def __aenter__(self): return self + class TestResultClasses(unittest.TestCase): def tearDownClass(): loop.close() @@ -49,17 +50,17 @@ def run_async(self, func, *args, **kwargs): def test_raw_data_initialization(self): """Test RawData initialization and methods""" - raw_data = RawData(log=True, dataset_duplicates=pd.DataFrame({'col1': [1, 2]})) - + raw_data = RawData(log=True, dataset_duplicates=pd.DataFrame({"col1": [1, 2]})) + self.assertTrue(raw_data.log) self.assertIsInstance(raw_data.dataset_duplicates, pd.DataFrame) self.assertEqual(raw_data.__repr__(), "RawData(log, dataset_duplicates)") def test_result_table_initialization(self): """Test ResultTable initialization and methods""" - df = pd.DataFrame({'col1': [1, 2, 3]}) + df = pd.DataFrame({"col1": [1, 2, 3]}) table = ResultTable(data=df, title="Test Table") - + self.assertEqual(table.title, "Test Table") self.assertIsInstance(table.data, pd.DataFrame) self.assertEqual(table.__repr__(), 'ResultTable(title="Test Table")') @@ -68,15 +69,13 @@ def test_error_result(self): """Test ErrorResult initialization and methods""" error = ValueError("Test error") error_result = ErrorResult( - result_id="test_error", - error=error, - message="Test error message" + result_id="test_error", error=error, message="Test error message" ) - + self.assertEqual(error_result.name, "Failed Test") self.assertEqual(error_result.error, error) self.assertEqual(error_result.message, "Test error message") - + widget = error_result.to_widget() self.assertIsInstance(widget, HTML) @@ -87,9 
+86,9 @@ def test_test_result_initialization(self): name="Test 1", description="Test description", metric=0.95, - passed=True + passed=True, ) - + self.assertEqual(test_result.result_id, "test_1") self.assertEqual(test_result.name, "Test 1") self.assertEqual(test_result.description, "Test description") @@ -99,8 +98,8 @@ def test_test_result_initialization(self): def test_test_result_add_table(self): """Test adding tables to TestResult""" test_result = TestResult(result_id="test_1") - df = pd.DataFrame({'col1': [1, 2, 3]}) - + df = pd.DataFrame({"col1": [1, 2, 3]}) + test_result.add_table(df, title="Test Table") self.assertEqual(len(test_result.tables), 1) self.assertEqual(test_result.tables[0].title, "Test Table") @@ -110,7 +109,7 @@ def test_test_result_add_figure(self): test_result = TestResult(result_id="test_1") fig = plt.figure() plt.plot([1, 2, 3]) - + test_result.add_figure(fig) self.assertEqual(len(test_result.figures), 1) self.assertIsInstance(test_result.figures[0], Figure) @@ -118,8 +117,8 @@ def test_test_result_add_figure(self): def test_test_result_remove_table(self): """Test removing tables from TestResult""" test_result = TestResult(result_id="test_1") - df = pd.DataFrame({'col1': [1, 2, 3]}) - + df = pd.DataFrame({"col1": [1, 2, 3]}) + test_result.add_table(df) test_result.remove_table(0) self.assertEqual(len(test_result.tables), 0) @@ -129,7 +128,7 @@ def test_test_result_remove_figure(self): test_result = TestResult(result_id="test_1") fig = plt.figure() plt.plot([1, 2, 3]) - + test_result.add_figure(fig) test_result.remove_figure(0) self.assertEqual(len(test_result.figures), 0) @@ -142,9 +141,9 @@ def test_test_result_serialize(self): ref_id="ref_1", params={"param1": 1}, passed=True, - inputs={} # Initialize empty inputs dictionary + inputs={}, # Initialize empty inputs dictionary ) - + serialized = test_result.serialize() self.assertEqual(serialized["test_name"], "test_1") self.assertEqual(serialized["title"], "Test Title") @@ -156,57 +155,55 
@@ def test_test_result_serialize(self): @patch("validmind.api_client.alog_test_result") @patch("validmind.api_client.alog_figure") @patch("validmind.api_client.alog_metric") - async def test_test_result_log_async(self, mock_metric, mock_figure, mock_test_result): + async def test_test_result_log_async( + self, mock_metric, mock_figure, mock_test_result + ): """Test async logging of TestResult""" mock_test_result.return_value = MockAsyncResponse(200, json={"cuid": "123"}) mock_figure.return_value = MockAsyncResponse(200, json={"cuid": "456"}) mock_metric.return_value = MockAsyncResponse(200, json={"cuid": "789"}) test_result = TestResult( - result_id="test_1", - metric=0.95, - description="Test description" + result_id="test_1", metric=0.95, description="Test description" ) - + await test_result.log_async(section_id="section_1", position=0) - + mock_test_result.assert_called_once() mock_metric.assert_called_once() def test_text_generation_result(self): """Test TextGenerationResult initialization and methods""" text_result = TextGenerationResult( - result_id="text_1", - title="Text Test", - description="Generated text" + result_id="text_1", title="Text Test", description="Generated text" ) - + self.assertEqual(text_result.name, "Text Generation Result") self.assertEqual(text_result.title, "Text Test") self.assertEqual(text_result.description, "Generated text") - + widget = text_result.to_widget() self.assertIsInstance(widget, VBox) def test_validate_log_config(self): """Test validation of log configuration""" test_result = TestResult(result_id="test_1") - + # Test valid config valid_config = { "hideTitle": True, "hideText": False, "hideParams": True, "hideTables": False, - "hideFigures": True + "hideFigures": True, } test_result.validate_log_config(valid_config) # Should not raise exception - + # Test invalid keys invalid_config = {"invalidKey": True} with self.assertRaises(InvalidParameterError): test_result.validate_log_config(invalid_config) - + # Test 
non-boolean values invalid_type_config = {"hideTitle": "true"} with self.assertRaises(InvalidParameterError): @@ -219,20 +216,18 @@ async def test_metadata_update_content_id_handling(self, mock_update_metadata): test_result = TestResult( result_id="test_1", description="Test description", - _was_description_generated=False + _was_description_generated=False, ) await test_result.log_async(content_id="custom_content_id") mock_update_metadata.assert_called_with( - content_id="custom_content_id::default", - text="Test description" + content_id="custom_content_id::default", text="Test description" ) # Test case 2: Without content_id mock_update_metadata.reset_mock() await test_result.log_async() mock_update_metadata.assert_called_with( - content_id="test_description:test_1::default", - text="Test description" + content_id="test_description:test_1::default", text="Test description" ) # Test case 3: With AI generated description @@ -240,9 +235,9 @@ async def test_metadata_update_content_id_handling(self, mock_update_metadata): mock_update_metadata.reset_mock() await test_result.log_async() mock_update_metadata.assert_called_with( - content_id="test_description:test_1::ai", - text="Test description" + content_id="test_description:test_1::ai", text="Test description" ) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/validmind/ai/test_descriptions.py b/validmind/ai/test_descriptions.py index 4fb8182bc..dedcab394 100644 --- a/validmind/ai/test_descriptions.py +++ b/validmind/ai/test_descriptions.py @@ -14,6 +14,11 @@ from ..utils import NumpyEncoder, md_to_html, test_id_to_name from ..vm_models.figure import Figure from ..vm_models.result import ResultTable +from ..vm_models.result.pii_filter import ( + PIIDetectionMode, + get_pii_detection_mode, + scan_df, +) from .utils import DescriptionFuture __executor = ThreadPoolExecutor() @@ -37,32 +42,6 @@ def _get_llm_global_context(): return context if context_enabled and 
context else None -def _check_tables_for_pii(tables: Union[List[ResultTable], None]) -> None: - """Check structured tables for PII before converting them to text. - - Uses Presidio Structured through `check_table_for_pii` to scan the - underlying DataFrame of each `ResultTable`. - """ - if not tables: - return - - try: - from ..vm_models.result.pii_filter import check_table_for_pii - - for table in tables: - # Use the exact structure that goes into the summary (list of dicts) - serialized = table.serialize() - table_rows = serialized.get("data", []) - check_table_for_pii(table_data=table_rows, raise_on_detection=True) - except ImportError: - logger.debug("PII detection not available - skipping PII check for tables") - except ValueError: - # Re-raise PII detection errors - raise - except Exception as e: - logger.warning(f"PII detection failed for tables: {e}") - - def _truncate_summary( summary: Union[str, None], test_id: str, max_tokens: int = 100_000 ): @@ -118,8 +97,12 @@ def generate_description( ) if tables: - # Check structured tables for PII before converting them to text - _check_tables_for_pii(tables) + if get_pii_detection_mode() in [ + PIIDetectionMode.TEST_DESCRIPTIONS, + PIIDetectionMode.ALL, + ]: + for table in tables: + scan_df(table.data) summary = "\n---\n".join( [ @@ -154,13 +137,16 @@ def background_generate_description( ): def wrapped(): try: - return generate_description( - test_id=test_id, - test_description=test_description, - tables=tables, - figures=figures, - metric=metric, - title=title, + return ( + generate_description( + test_id=test_id, + test_description=test_description, + tables=tables, + figures=figures, + metric=metric, + title=title, + ), + True, ) except Exception as e: if "maximum context length" in str(e): @@ -175,7 +161,7 @@ def wrapped(): logger.warning(f"Failed to generate description for {test_id}: {e}") logger.warning(f"Using default description for {test_id}") - return test_description + return test_description, False 
return DescriptionFuture(__executor.submit(wrapped)) diff --git a/validmind/ai/utils.py b/validmind/ai/utils.py index 0e40d2ef5..8f3405baf 100644 --- a/validmind/ai/utils.py +++ b/validmind/ai/utils.py @@ -35,13 +35,13 @@ def __init__(self, future): self._future = future def get_description(self): - if isinstance(self._future, str): + if isinstance(self._future, tuple): description = self._future else: # This will block until the future is completed description = self._future.result() - return md_to_html(description, mathml=True) + return md_to_html(description[0], mathml=True), description[1] def get_client_and_model(): diff --git a/validmind/api_client.py b/validmind/api_client.py index e3924946d..1eb1cd5c4 100644 --- a/validmind/api_client.py +++ b/validmind/api_client.py @@ -434,19 +434,6 @@ def log_text( if not text or not isinstance(text, str): raise ValueError("`text` must be a non-empty string") - # PII detection for free-form text prior to logging - try: - from .vm_models.result.pii_filter import check_text_for_pii - - check_text_for_pii(text, raise_on_detection=True) - except ImportError: - logger.debug("PII detection not available - skipping PII check for text") - except ValueError: - # Re-raise PII detection errors - raise - except Exception as e: - logger.warning(f"PII detection failed for text: {e}") - if not is_html(text): text = md_to_html(text, mathml=True) diff --git a/validmind/vm_models/result/pii_filter.py b/validmind/vm_models/result/pii_filter.py index 016e946ce..23807a5e1 100644 --- a/validmind/vm_models/result/pii_filter.py +++ b/validmind/vm_models/result/pii_filter.py @@ -9,7 +9,7 @@ import os from enum import Enum -from typing import Any, Dict, List, Optional, Union +from typing import Dict import pandas as pd @@ -27,422 +27,176 @@ class PIIDetectionMode(Enum): ALL = "all" -def _get_pii_detection_mode() -> PIIDetectionMode: - """Get the current PII detection mode from environment variable.""" +# Default entities to detect common PII 
types +DEFAULT_ENTITIES = [ + "PERSON", + "EMAIL_ADDRESS", + "PHONE_NUMBER", + "CREDIT_CARD", + "US_SSN", + "US_DRIVER_LICENSE", + "IP_ADDRESS", + "LOCATION", + "DATE_TIME", + "US_PASSPORT", + "US_BANK_NUMBER", + "IBAN_CODE", +] + +# Default confidence threshold +DEFAULT_THRESHOLD = 0.5 + +# Default sample size for DataFrame PII scanning +SAMPLE_SIZE = 100 + + +def get_pii_detection_mode() -> PIIDetectionMode: + """ + Get the current PII detection mode. + + Returns: + PIIDetectionMode.DISABLED if: + - Environment variable is not set + - Environment variable is set to "disabled" + - Presidio packages are not installed + - Invalid mode value + + Otherwise returns the specified mode (test_results, test_descriptions, or all) + """ mode_str = os.getenv("VALIDMIND_PII_DETECTION", "disabled").lower() try: - return PIIDetectionMode(mode_str) + mode = PIIDetectionMode(mode_str) except ValueError: logger.warning( f"Invalid PII detection mode '{mode_str}'. " f"Valid options: {', '.join([mode.value for mode in PIIDetectionMode])}. " f"Defaulting to 'disabled'." ) - return PIIDetectionMode.DISABLED + mode = PIIDetectionMode.DISABLED - -# Lazy load presidio components to avoid import errors when not installed -_analyzer = None - - -def _get_presidio_analyzer(): - """Lazy load Presidio analyzer to avoid import errors when not installed.""" - global _analyzer - if _analyzer is None: - try: - from presidio_analyzer import AnalyzerEngine # type: ignore - - _analyzer = AnalyzerEngine() - logger.debug("Presidio analyzer initialized successfully") - except ImportError: + # If mode is not disabled, check if Presidio is actually available + if mode != PIIDetectionMode.DISABLED: + if not _is_presidio_available(): logger.warning( - "Presidio analyzer not available. Install with: pip install validmind[pii-detection]" + f"PII detection mode '{mode.value}' requested but Presidio not available. " + "Falling back to 'disabled' mode. 
Install with: pip install validmind[pii-detection]" ) - _analyzer = False - return _analyzer if _analyzer is not False else None - - -def _get_presidio_structured_builder(): - """Lazy load Presidio Structured PandasAnalysisBuilder. - - Returns None if not available. - """ - try: - from presidio_structured import PandasAnalysisBuilder # type: ignore + mode = PIIDetectionMode.DISABLED - return PandasAnalysisBuilder - except ImportError: - logger.warning( - "Presidio Structured not available. Install with: pip install validmind[pii-detection]" - ) - return None + return mode -def is_pii_detection_enabled_for_test_results() -> bool: - """Check if PII detection is enabled for test results and available.""" - mode = _get_pii_detection_mode() - return mode in [PIIDetectionMode.TEST_RESULTS, PIIDetectionMode.ALL] and ( - _get_presidio_structured_builder() is not None - or _get_presidio_analyzer() is not None - ) +def _is_presidio_available() -> bool: + """Check if any Presidio components are available.""" + return _get_presidio_text() is not None or _get_presidio_df() is not None -def is_pii_detection_enabled_for_test_descriptions() -> bool: - """Check if PII detection is enabled for test descriptions and available.""" - mode = _get_pii_detection_mode() - return ( - mode in [PIIDetectionMode.TEST_DESCRIPTIONS, PIIDetectionMode.ALL] - and _get_presidio_analyzer() is not None - ) +def _get_presidio_text(): + """Get Presidio analyzer for text analysis.""" + from presidio_analyzer import AnalyzerEngine + return AnalyzerEngine() -def is_pii_detection_enabled() -> bool: - """Check if PII detection is enabled for any mode and available.""" - mode = _get_pii_detection_mode() - if mode == PIIDetectionMode.DISABLED: - return False - # Either text analyzer (for descriptions) or structured (for tables) should be available - return ( - _get_presidio_analyzer() is not None - or _get_presidio_structured_builder() is not None - ) +def _get_presidio_df(): + """Get Presidio Structured 
PandasAnalysisBuilder for DataFrame analysis.""" + from presidio_structured import PandasAnalysisBuilder + return PandasAnalysisBuilder() -def detect_pii_in_text( - text: str, - entities: Optional[List[str]] = None, - language: str = "en", - threshold: float = 0.5, -) -> List[Dict]: - """ - Detect PII entities in text using Presidio analyzer. - Args: - text: The text to analyze for PII - entities: List of entity types to detect. If None, detects all supported entities - language: Language code for analysis (default: "en") - threshold: Minimum confidence score for PII detection (default: 0.5) - - Returns: - List of detected PII entities with their positions and confidence scores - """ - analyzer = _get_presidio_analyzer() - if analyzer is None: - logger.debug("PII detection skipped - Presidio not available") - return [] - - try: - # Default entities to detect common PII types - if entities is None: - entities = [ - "PERSON", - "EMAIL_ADDRESS", - "PHONE_NUMBER", - "CREDIT_CARD", - "US_SSN", - "US_DRIVER_LICENSE", - "IP_ADDRESS", - "LOCATION", - "DATE_TIME", - "US_PASSPORT", - "US_BANK_NUMBER", - "IBAN_CODE", - ] - - results = analyzer.analyze(text=text, entities=entities, language=language) - - # Filter results by confidence threshold - filtered_results = [ - { - "entity_type": result.entity_type, - "start": result.start, - "end": result.end, - "score": result.score, - "text": text[result.start : result.end], - } - for result in results - if result.score >= threshold - ] - - if filtered_results: - logger.debug(f"Detected {len(filtered_results)} PII entities in text") - - return filtered_results - - except Exception as e: - logger.warning(f"PII detection failed: {e}") - return [] - - -def scan_dataframe_for_pii( # noqa: C901 - df: pd.DataFrame, - columns: Optional[List[str]] = None, - threshold: float = 0.5, - sample_size: int = 100, -) -> Dict[str, List[Dict]]: +def scan_text(text: str) -> bool: """ - Scan a pandas DataFrame for PII content in text columns. 
- - This implementation uses Microsoft Presidio Structured to analyze tabular data - and determine which columns contain PII entities. It returns a mapping of - column names to detected entity metadata. Unlike token-level detection, the - structured analysis reports at the column level. + Scan text for PII content. Raises ValueError if PII is found. Args: - df: The DataFrame to scan - columns: List of column names to scan (if None, scans all string columns) - threshold: Minimum confidence score for PII detection (not used directly by structured analysis) - sample_size: Maximum number of rows to sample for PII detection + text: The text to scan for PII Returns: - Dictionary mapping column names to lists of detected PII entities. Each list - contains a single dict with at least the 'entity_type' key. - """ - if not ( - is_pii_detection_enabled_for_test_results() - or is_pii_detection_enabled_for_test_descriptions() - ): - return {} - - builder_cls = _get_presidio_structured_builder() - - # Determine which columns to scan - if columns is None: - # Scan all string/object columns - columns = [col for col in df.columns if df[col].dtype == "object"] - - # Limit the number of rows to scan for performance - sample_df = df.head(sample_size) if len(df) > sample_size else df - - # Prefer Presidio Structured if available - if builder_cls is not None: - try: - builder = builder_cls() - # Use mixed strategy and map our threshold parameter to the mixed threshold - tabular_analysis = builder.generate_analysis( - sample_df, - selection_strategy="mixed", - mixed_strategy_threshold=threshold, - ) - - # The analysis exposes an entity mapping of column -> entity type - entity_mapping: Dict[str, str] = getattr( - tabular_analysis, "entity_mapping", {} - ) - - pii_findings: Dict[str, List[Dict]] = {} - for column in columns: - if column in entity_mapping and entity_mapping[column]: - entity_type = entity_mapping[column] - pii_findings[column] = [ - { - "entity_type": entity_type, - 
"column": column, - } - ] - logger.info( - f"Detected PII entity '{entity_type}' in column '{column}'" - ) - - return pii_findings - except Exception as e: - logger.warning(f"PII structured analysis failed: {e}") - # fall back to token-level analyzer below - - # Fallback: use token-level Presidio Analyzer on sampled rows if available - analyzer_available = _get_presidio_analyzer() is not None - if not analyzer_available: - return {} - - pii_findings = {} - for column in columns: - column_pii: List[Dict] = [] - for idx, value in sample_df[column].dropna().items(): - if isinstance(value, str) and len(value.strip()) > 0: - pii_entities = detect_pii_in_text(text=str(value), threshold=threshold) - if pii_entities: - column_pii.extend( - [ - {**entity, "row_index": idx, "column": column} - for entity in pii_entities - ] - ) - if column_pii: - pii_findings[column] = column_pii - logger.info(f"Found {len(column_pii)} PII entities in column '{column}'") - - return pii_findings - - -def _coerce_to_dataframe(table_like: Any) -> Optional[pd.DataFrame]: # noqa: C901 - """Best-effort conversion of supported inputs into a DataFrame. 
- - Supports: - - pandas.DataFrame - - list[dict] - - objects with a `.data` attribute containing a DataFrame or list[dict] - - objects with `.serialize()` returning {"data": list[dict]} - """ - if table_like is None: - return None - - if isinstance(table_like, pd.DataFrame): - return table_like - - if isinstance(table_like, list): - return pd.DataFrame(table_like) if table_like else pd.DataFrame() - - data_attr = getattr(table_like, "data", None) - if data_attr is not None: - if isinstance(data_attr, pd.DataFrame): - return data_attr - if isinstance(data_attr, list): - return pd.DataFrame(data_attr) - - serialize_fn = getattr(table_like, "serialize", None) - if callable(serialize_fn): - try: - serialized = serialize_fn() - records = serialized.get("data") if isinstance(serialized, dict) else None - if isinstance(records, list): - return pd.DataFrame(records) - except Exception: - pass - - return None - - -def check_table_for_pii( - table_data: Union[pd.DataFrame, List[Dict], Any], - threshold: float = 0.5, - raise_on_detection: bool = True, -) -> None: - """ - Check a table (DataFrame or list of dicts) for PII content. 
- - Args: - table_data: The table data to check - threshold: Minimum confidence score for PII detection - raise_on_detection: If True, raises ValueError when PII is detected (default: True) + True if no PII is found Raises: - ValueError: If PII is detected and raise_on_detection is True + ValueError: If PII is detected """ - if not ( - is_pii_detection_enabled_for_test_results() - or is_pii_detection_enabled_for_test_descriptions() - ): - return - - df = _coerce_to_dataframe(table_data) - if df is None or df.empty: - return - - # Scan for PII - pii_findings = scan_dataframe_for_pii(df, threshold=threshold) - has_pii = bool(pii_findings) - - if has_pii and raise_on_detection: - entity_types = set() - for findings in pii_findings.values(): - entity_types.update(entity["entity_type"] for entity in findings) - + # sanity check + mode = get_pii_detection_mode() + if mode == PIIDetectionMode.DISABLED: + return True + + analyzer = _get_presidio_text() + results = analyzer.analyze(text=text, entities=DEFAULT_ENTITIES, language="en") + + # Filter results by confidence threshold + pii_entities = [ + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "text": text[result.start : result.end], + } + for result in results + if result.score >= DEFAULT_THRESHOLD + ] + + if pii_entities: + entity_types = set(entity["entity_type"] for entity in pii_entities) raise ValueError( - f"PII detected in table data. Entity types found: {', '.join(entity_types)}. " - f"Pass `unsafe=True` to bypass PII detection." + f"PII detected in text content. Entity types found: {', '.join(entity_types)}." ) - -def check_table_for_pii_in_descriptions( - table_data: Union[pd.DataFrame, List[Dict]], - threshold: float = 0.5, - raise_on_detection: bool = True, -) -> None: - """Check a table for PII when used in description generation. - - Enabled under the "test_descriptions" or "all" modes. 
Uses Presidio Structured - directly to analyze the DataFrame, independent of the test_results gating. - """ - if not is_pii_detection_enabled_for_test_descriptions(): - return - - # Convert to DataFrame if it's a list of dicts - if isinstance(table_data, list): - if not table_data: - return - df = pd.DataFrame(table_data) - else: - df = table_data - - builder_cls = _get_presidio_structured_builder() - if builder_cls is None: - # If Structured is not available, try token-level analyzer as a fallback - pii_findings = scan_dataframe_for_pii(df, threshold=threshold) - else: - try: - builder = builder_cls() - tabular_analysis = builder.generate_analysis( - df, selection_strategy="mixed", mixed_strategy_threshold=threshold - ) - entity_mapping: Dict[str, str] = getattr( - tabular_analysis, "entity_mapping", {} - ) - pii_findings = { - col: [{"entity_type": ent}] - for col, ent in entity_mapping.items() - if ent - } - except Exception as e: - logger.warning(f"PII structured analysis (descriptions) failed: {e}") - pii_findings = {} - - if pii_findings and raise_on_detection: - entity_types = set() - for findings in pii_findings.values(): - entity_types.update(entity["entity_type"] for entity in findings) - - raise ValueError( - f"PII detected in table data for description. Entity types found: {', '.join(entity_types)}." - ) + return True -def check_text_for_pii( - text: str, - entities: Optional[List[str]] = None, - language: str = "en", - threshold: float = 0.5, - raise_on_detection: bool = True, -) -> List[Dict]: +def scan_df(df: pd.DataFrame) -> bool: """ - Check text for PII content and optionally raise an exception. + Scan a pandas DataFrame for PII content. Raises ValueError if PII is found. 
Args: - text: The text to check for PII - entities: List of entity types to detect - language: Language code for analysis - threshold: Minimum confidence score for PII detection - raise_on_detection: If True, raises ValueError when PII is detected (default: True) + df: The DataFrame to scan Returns: - List of detected PII entities + True if no PII is found Raises: - ValueError: If PII is detected and raise_on_detection is True + ValueError: If PII is detected """ - if not is_pii_detection_enabled_for_test_descriptions(): - return [] + # sanity check + mode = get_pii_detection_mode() + if mode == PIIDetectionMode.DISABLED: + return True + + # Scan all string/object columns + columns = [col for col in df.columns if df[col].dtype == "object"] + + if not columns: + return True - pii_entities = detect_pii_in_text( - text=text, entities=entities, language=language, threshold=threshold + # Limit the number of rows to scan for performance + sample_df = df.head(SAMPLE_SIZE) if len(df) > SAMPLE_SIZE else df + + # Use structured analysis + builder = _get_presidio_df() + tabular_analysis = builder.generate_analysis( + sample_df, + selection_strategy="mixed", + mixed_strategy_threshold=DEFAULT_THRESHOLD, ) - if pii_entities and raise_on_detection: - entity_types = set(entity["entity_type"] for entity in pii_entities) + entity_mapping: Dict[str, str] = getattr(tabular_analysis, "entity_mapping", {}) + + pii_columns = [ + column + for column in columns + if column in entity_mapping and entity_mapping[column] + ] + + if pii_columns: + entity_types = [entity_mapping[col] for col in pii_columns] raise ValueError( - f"PII detected in text content. Entity types found: {', '.join(entity_types)}. " - f"Pass `unsafe=True` to bypass PII detection." + f"PII detected in DataFrame columns: {', '.join(pii_columns)}. " + f"Entity types found: {', '.join(entity_types)}." 
) - return pii_entities + return True diff --git a/validmind/vm_models/result/result.py b/validmind/vm_models/result/result.py index 76b059a02..db7000902 100644 --- a/validmind/vm_models/result/result.py +++ b/validmind/vm_models/result/result.py @@ -31,10 +31,10 @@ ) from ..figure import Figure, create_figure from ..input import VMInput +from .pii_filter import PIIDetectionMode, get_pii_detection_mode, scan_df, scan_text from .utils import ( AI_REVISION_NAME, DEFAULT_REVISION_NAME, - check_for_sensitive_data, figures_to_widgets, get_result_template, tables_to_widgets, @@ -222,8 +222,10 @@ def __getattribute__(self, name): description = super().__getattribute__("description") if isinstance(description, DescriptionFuture): - self._was_description_generated = True - self.description = description.get_description() + ( + self.description, + self._was_description_generated, + ) = description.get_description() return super().__getattribute__(name) @@ -554,32 +556,15 @@ def log( # noqa: C901 self.check_result_id_exist() - if not unsafe: + if not unsafe and get_pii_detection_mode() in [ + PIIDetectionMode.TEST_RESULTS, + PIIDetectionMode.ALL, + ]: for table in self.tables or []: - # Robust table PII check that accepts ResultTable directly - try: - from .pii_filter import check_table_for_pii - - check_table_for_pii(table) - except Exception: - # Fall back to prior behavior if new helper fails unexpectedly - check_for_sensitive_data(table.data) + scan_df(table.data) - # Check description text for PII when available if self.description: - try: - from .pii_filter import check_text_for_pii - - check_text_for_pii(self.description, raise_on_detection=True) - except ImportError: - logger.debug( - "PII detection not available - skipping PII check for description" - ) - except ValueError: - # Re-raise PII detection errors - raise - except Exception as e: - logger.warning(f"PII detection failed for description: {e}") + scan_text(self.description) if section_id: 
self._validate_section_id_for_block(section_id, position) diff --git a/validmind/vm_models/result/utils.py b/validmind/vm_models/result/utils.py index 6161c852c..508aac46d 100644 --- a/validmind/vm_models/result/utils.py +++ b/validmind/vm_models/result/utils.py @@ -5,7 +5,6 @@ import os from typing import TYPE_CHECKING, Dict, List, Union -import pandas as pd from ipywidgets import HTML, GridBox, Layout from jinja2 import Template @@ -50,24 +49,6 @@ async def update_metadata(content_id: str, text: str, _json: Union[Dict, List] = await api_client.alog_metadata(content_id, text, _json) -def check_for_sensitive_data(data: pd.DataFrame): - """Check if the data contains sensitive information (PII).""" - # Check for PII content - try: - from .pii_filter import check_table_for_pii - - check_table_for_pii(table_data=data, threshold=0.5, raise_on_detection=True) - - except ImportError: - logger.debug("PII filtering not installed - skipping PII check") - except ValueError as e: - # PII was detected and raise_on_detection is True - raise e - except Exception as e: - # Log other PII checking errors but don't fail the entire operation - logger.warning(f"PII checking failed: {e}") - - def tables_to_widgets(tables: List["ResultTable"]): """Convert a list of tables to ipywidgets.""" widgets = [ From bec495f1dac691c01ea006c3348e8b2d31a095c3 Mon Sep 17 00:00:00 2001 From: John Walz Date: Thu, 21 Aug 2025 11:41:25 -0400 Subject: [PATCH 11/11] chore: upgrading linter and fixing complaints --- poetry.lock | 186 ++++++++---------- pyproject.toml | 2 +- validmind/api_client.py | 2 - .../data_validation/ScoreBandDefaultRates.py | 2 +- .../embeddings/PCAComponentsPairwisePlots.py | 2 +- .../embeddings/TSNEComponentsPairwisePlots.py | 2 +- .../sklearn/RobustnessDiagnosis.py | 2 +- .../CumulativePredictionProbabilities.py | 2 +- .../PredictionProbabilitiesHistogram.py | 2 +- .../statsmodels/ScorecardHistogram.py | 2 +- .../CalibrationCurveDrift.py | 2 +- 
.../PredictionProbabilitiesHistogramDrift.py | 2 +- .../ongoing_monitoring/ScoreBandsDrift.py | 2 +- .../ScorecardHistogramDrift.py | 2 +- validmind/tests/plots/BoxPlot.py | 4 +- validmind/tests/plots/HistogramPlot.py | 8 +- validmind/tests/stats/DescriptiveStats.py | 4 +- 17 files changed, 102 insertions(+), 126 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7d985dd57..da85d1a0c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. [[package]] name = "aiodns" @@ -662,6 +662,10 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = 
"Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -674,8 +678,14 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -686,8 +696,24 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, + {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, + {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -697,6 +723,10 @@ files = [ {file = 
"Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -708,6 +738,10 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = 
"sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -720,6 +754,10 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = 
"sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -732,6 +770,10 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -1808,20 +1850,20 @@ markers = {main = "extra == \"all\" or extra == \"llm\" or extra == \"pytorch\" [[package]] name = "flake8" -version = "4.0.1" +version = "7.3.0" description = "the modular source code checker: pep8 pyflakes and co" 
optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "flake8-4.0.1-py2.py3-none-any.whl", hash = "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d"}, - {file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"}, + {file = "flake8-7.3.0-py2.py3-none-any.whl", hash = "sha256:b9696257b9ce8beb888cdbe31cf885c90d31928fe202be0889a7cdafad32f01e"}, + {file = "flake8-7.3.0.tar.gz", hash = "sha256:fe044858146b9fc69b551a4b490d69cf960fcb78ad1edcb84e7fbb1b4a8e3872"}, ] [package.dependencies] -mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.8.0,<2.9.0" -pyflakes = ">=2.4.0,<2.5.0" +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.14.0,<2.15.0" +pyflakes = ">=3.4.0,<3.5.0" [[package]] name = "fonttools" @@ -4091,14 +4133,14 @@ traitlets = "*" [[package]] name = "mccabe" -version = "0.6.1" +version = "0.7.0" description = "McCabe checker, plugin for flake8" optional = false -python-versions = "*" +python-versions = ">=3.6" groups = ["dev"] files = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] [[package]] @@ -5882,6 +5924,23 @@ cryptography = "<44.1" [package.extras] server = ["flask (>=1.1)", "gunicorn"] +[[package]] +name = "presidio-structured" +version = "0.0.4a0" +description = "Presidio structured package - analyzes and anonymizes structured and semi-structured data." 
+optional = true +python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "python_version < \"3.11\" and extra == \"pii-detection\"" +files = [ + {file = "presidio_structured-0.0.4a0-py3-none-any.whl", hash = "sha256:7cc63b48038a177684cb9512d481571814c04331a0f4ddeb09299cc76803258b"}, +] + +[package.dependencies] +pandas = ">=1.5.2" +presidio-analyzer = ">=2.2" +presidio-anonymizer = ">=2.2" + [[package]] name = "presidio-structured" version = "0.0.6" @@ -5889,7 +5948,7 @@ description = "Presidio structured package - analyzes and anonymizes structured optional = true python-versions = "<4.0,>=3.9" groups = ["main"] -markers = "extra == \"pii-detection\"" +markers = "python_version >= \"3.11\" and extra == \"pii-detection\"" files = [ {file = "presidio_structured-0.0.6-py3-none-any.whl", hash = "sha256:f3454c86857a00db9828e684895da43411bcc7d750cac0a52e15d68f6c6455a1"}, ] @@ -5898,7 +5957,6 @@ files = [ pandas = ">=1.5.2" presidio-analyzer = ">=2.2" presidio-anonymizer = ">=2.2" -spacy = {version = "<3.8.4", markers = "python_version < \"3.10\""} [[package]] name = "prometheus-client" @@ -6342,14 +6400,14 @@ all = ["matplotlib (>=2.1.0)"] [[package]] name = "pycodestyle" -version = "2.8.0" +version = "2.14.0" description = "Python style guide checker" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pycodestyle-2.8.0-py2.py3-none-any.whl", hash = "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20"}, - {file = "pycodestyle-2.8.0.tar.gz", hash = "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"}, + {file = "pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d"}, + {file = "pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783"}, ] [[package]] @@ -6543,14 +6601,14 @@ dev = ["build", "coverage", 
"furo", "invoke", "mypy", "pytest", "pytest-cov", "p [[package]] name = "pyflakes" -version = "2.4.0" +version = "3.4.0" description = "passive checker of Python programs" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "pyflakes-2.4.0-py2.py3-none-any.whl", hash = "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"}, - {file = "pyflakes-2.4.0.tar.gz", hash = "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c"}, + {file = "pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f"}, + {file = "pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58"}, ] [[package]] @@ -8021,86 +8079,6 @@ files = [ {file = "soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a"}, ] -[[package]] -name = "spacy" -version = "3.8.3" -description = "Industrial-strength Natural Language Processing (NLP) in Python" -optional = true -python-versions = "<3.13,>=3.9" -groups = ["main"] -markers = "python_version < \"3.11\" and extra == \"pii-detection\"" -files = [ - {file = "spacy-3.8.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b530a5cbb077601d03bdd71bf1ded4de4b7fb0362b5443c5183c628cfa81ffdc"}, - {file = "spacy-3.8.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b28a5f7b77400ebf7e23aa24a82a2d35f97071cd5ef1ad0f859aa9b323fff59a"}, - {file = "spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcfd24a00da30ca53570f5b1c3535c1fa95b633f2a12b3d08395c9552ffb53c"}, - {file = "spacy-3.8.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e3630ea33608a6db8045fad7e0ba22f864c61ea351445488a89af1734e434a37"}, - {file = "spacy-3.8.3-cp310-cp310-win_amd64.whl", hash = "sha256:20839fa04cc2156ab613e40db54c25031304fdc1dd369930bc01c366586d0079"}, - {file = 
"spacy-3.8.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b16b8f9c544cdccd1bd23fc6bf6e2f1d667a1ee285a9b31bdb4a89e2d61345b4"}, - {file = "spacy-3.8.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f62e45a2259acc51cd8eb185f978848928f2f698ba174b283253485fb7691b04"}, - {file = "spacy-3.8.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57a267ea25dd8b7ec3e55accd1592d2d0847f0c6277a55145af5bb08e318bab4"}, - {file = "spacy-3.8.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45bc5fc8d399089607e3e759aee98362ffb007e39386531f195f42dcddcc94dc"}, - {file = "spacy-3.8.3-cp311-cp311-win_amd64.whl", hash = "sha256:9e348359d54418a5752305975f1268013135255bd656a783aa3397b3bd4dd5e9"}, - {file = "spacy-3.8.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b01e50086515fa6d43275be11a762a3a3285d9aabbe27b4f3b98a08083f1d2a1"}, - {file = "spacy-3.8.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:187f9732362d0dc52b16c80e67decf58ff91605e34b251c50c7dc5212082fcb4"}, - {file = "spacy-3.8.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7517bc969bca924cbdba4e14e0ce16e66d32967468ad27490e95c9b4d8d8aa8"}, - {file = "spacy-3.8.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:460948437c5571367105554b1e67549f957ba8dd6ee7e1594e719f9a88c398bb"}, - {file = "spacy-3.8.3-cp312-cp312-win_amd64.whl", hash = "sha256:1f14d4e2b1e6ab144ee546236f2c32b255f91f24939e62436c3a9c2ee200c6d1"}, - {file = "spacy-3.8.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f6020603633ec47374af71e936671d5992d68e592661dffac940f5596d77696"}, - {file = "spacy-3.8.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:72b492651534460bf4fe842f7efa462887f9e215de86146b862df6238b952650"}, - {file = "spacy-3.8.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a630119aaa7a6180635eb8f21b27509654882847480c8423a657582b4a9bdd3"}, - {file = "spacy-3.8.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:8563ba9cbb71a629c7dc8c2db98f0348416dc0f0927de0e9ed8b448f707b5248"}, - {file = "spacy-3.8.3-cp39-cp39-win_amd64.whl", hash = "sha256:608beca075f7611083e93c91625d7e6c5885e2672cb5ec1b9f274cab6c82c816"}, - {file = "spacy-3.8.3.tar.gz", hash = "sha256:81a967dc3d6a5a0a9ab250559483fe2092306582a9192f98be7a63bdce2797f7"}, -] - -[package.dependencies] -catalogue = ">=2.0.6,<2.1.0" -cymem = ">=2.0.2,<2.1.0" -jinja2 = "*" -langcodes = ">=3.2.0,<4.0.0" -murmurhash = ">=0.28.0,<1.1.0" -numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} -packaging = ">=20.0" -preshed = ">=3.0.2,<3.1.0" -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" -requests = ">=2.13.0,<3.0.0" -setuptools = "*" -spacy-legacy = ">=3.0.11,<3.1.0" -spacy-loggers = ">=1.0.0,<2.0.0" -srsly = ">=2.4.3,<3.0.0" -thinc = ">=8.3.0,<8.4.0" -tqdm = ">=4.38.0,<5.0.0" -typer = ">=0.3.0,<1.0.0" -wasabi = ">=0.9.1,<1.2.0" -weasel = ">=0.1.0,<0.5.0" - -[package.extras] -apple = ["thinc-apple-ops (>=1.0.0,<2.0.0)"] -cuda = ["cupy (>=5.0.0b4,<13.0.0)"] -cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] -cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] -cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] -cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] -cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] -cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] -cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] -cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] -cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] -cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] -cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] -cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] -cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] -cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] -cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] -cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] -cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] -cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] -ja = ["sudachidict_core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] -ko = ["natto-py (>=0.9.0)"] 
-lookups = ["spacy_lookups_data (>=1.0.3,<1.1.0)"] -th = ["pythainlp (>=2.0)"] -transformers = ["spacy_transformers (>=1.1.2,<1.4.0)"] - [[package]] name = "spacy" version = "3.8.7" @@ -8108,7 +8086,7 @@ description = "Industrial-strength Natural Language Processing (NLP) in Python" optional = true python-versions = "<3.14,>=3.9" groups = ["main"] -markers = "python_version >= \"3.11\" and extra == \"pii-detection\"" +markers = "extra == \"pii-detection\"" files = [ {file = "spacy-3.8.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6ec0368ce96cd775fb14906f04b771c912ea8393ba30f8b35f9c4dc47a420b8e"}, {file = "spacy-3.8.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5672f8a0fe7a3847e925544890be60015fbf48a60a838803425f82e849dd4f18"}, @@ -10056,4 +10034,4 @@ xgboost = ["xgboost"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "5f5c67a8609d9f02cef3b073e42591def4d93b7a7c59dda5466fe797ffd89338" +content-hash = "8ee77fe173c5abeed209af25844ec7bf38da5cc43a648ef9561a2eee08cbe84c" diff --git a/pyproject.toml b/pyproject.toml index 8dc969bd7..ead49fa17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ black = "^22.1.0" click = "*" cython = "^0.29.34" docstring_parser = "*" -flake8 = "^4.0.1" +flake8 = "^7.0.0" griffe = "*" ipykernel = "^6.22.0" isort = "^5.12.0" diff --git a/validmind/api_client.py b/validmind/api_client.py index 1eb1cd5c4..0071b8884 100644 --- a/validmind/api_client.py +++ b/validmind/api_client.py @@ -40,8 +40,6 @@ @atexit.register def _close_session(): """Closes the async client session at exit.""" - global __api_session - if __api_session and not __api_session.closed: try: loop = asyncio.get_event_loop() diff --git a/validmind/tests/data_validation/ScoreBandDefaultRates.py b/validmind/tests/data_validation/ScoreBandDefaultRates.py index 5bab7303e..796728ebb 100644 --- a/validmind/tests/data_validation/ScoreBandDefaultRates.py +++ b/validmind/tests/data_validation/ScoreBandDefaultRates.py @@ -83,7 
+83,7 @@ def ScoreBandDefaultRates( # Create band labels band_labels = [ - f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + f"{score_bands[i]}-{score_bands[i + 1]}" for i in range(len(score_bands) - 1) ] band_labels.insert(0, f"<{score_bands[0]}") band_labels.append(f">{score_bands[-1]}") diff --git a/validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py b/validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py index a3cb21813..9138e5403 100644 --- a/validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +++ b/validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py @@ -75,7 +75,7 @@ def PCAComponentsPairwisePlots( # Prepare DataFrame for Plotly pca_df = pd.DataFrame( - pca_results, columns=[f"PC{i+1}" for i in range(n_components)] + pca_results, columns=[f"PC{i + 1}" for i in range(n_components)] ) # List to store each plot diff --git a/validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py b/validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py index 2e81c2637..3f830a9ad 100644 --- a/validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +++ b/validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py @@ -79,7 +79,7 @@ def TSNEComponentsPairwisePlots( # Prepare DataFrame for Plotly tsne_df = pd.DataFrame( - tsne_results, columns=[f"Component {i+1}" for i in range(n_components)] + tsne_results, columns=[f"Component {i + 1}" for i in range(n_components)] ) # List to store each plot diff --git a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py index b688d68a1..f758ac142 100644 --- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py @@ -129,7 +129,7 @@ def _plot_robustness( datasets = results["Dataset"].unique() pallete = [ - 
f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}" + f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}" for r, g, b in sns.color_palette("husl", len(datasets)) ] diff --git a/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py b/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py index bd5f1e4f4..a87481d98 100644 --- a/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +++ b/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py @@ -88,7 +88,7 @@ def _plot_cumulative_prob(df, target_col, title): classes = sorted(df[target_col].unique()) colors = [colormap(i / len(classes))[:3] for i in range(len(classes))] # RGB color_dict = { - cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})" + cls: f"rgb({int(rgb[0] * 255)}, {int(rgb[1] * 255)}, {int(rgb[2] * 255)})" for cls, rgb in zip(classes, colors) } diff --git a/validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py b/validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py index 820f7dfa5..f7fbb952c 100644 --- a/validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +++ b/validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py @@ -83,7 +83,7 @@ def _plot_prob_histogram(df, target_col, title): classes = sorted(df[target_col].unique()) colors = [colormap(i / len(classes))[:3] for i in range(len(classes))] # RGB color_dict = { - cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})" + cls: f"rgb({int(rgb[0] * 255)}, {int(rgb[1] * 255)}, {int(rgb[2] * 255)})" for cls, rgb in zip(classes, colors) } diff --git a/validmind/tests/model_validation/statsmodels/ScorecardHistogram.py b/validmind/tests/model_validation/statsmodels/ScorecardHistogram.py index 4e08b321a..0352a8028 100644 --- a/validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +++ 
b/validmind/tests/model_validation/statsmodels/ScorecardHistogram.py @@ -85,7 +85,7 @@ def _plot_score_histogram(df, score_col, target_col, title): classes = sorted(df[target_col].unique()) colors = [colormap(i / len(classes))[:3] for i in range(len(classes))] # RGB color_dict = { - cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})" + cls: f"rgb({int(rgb[0] * 255)}, {int(rgb[1] * 255)}, {int(rgb[2] * 255)})" for cls, rgb in zip(classes, colors) } diff --git a/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py index 7e2482c04..2acc551f9 100644 --- a/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +++ b/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py @@ -96,7 +96,7 @@ def CalibrationCurveDrift( # Create bin labels bin_edges = np.linspace(0, 1, n_bins + 1) - bin_labels = [f"{bin_edges[i]:.1f}-{bin_edges[i+1]:.1f}" for i in range(n_bins)] + bin_labels = [f"{bin_edges[i]:.1f}-{bin_edges[i + 1]:.1f}" for i in range(n_bins)] # Create predicted probabilities table pred_metrics = [] diff --git a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py index 78ab49c8b..3a4a34252 100644 --- a/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +++ b/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py @@ -204,7 +204,7 @@ def PredictionProbabilitiesHistogramDrift( # Add separate legend for each subplot fig.update_layout( **{ - f'legend{i+1 if i > 0 else ""}': dict( + f'legend{i + 1 if i > 0 else ""}': dict( yanchor="middle", y=1 - (i / len(classes)) - (0.5 / len(classes)), xanchor="left", diff --git a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py index 23874934f..9678d8f9d 100644 --- a/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +++ 
b/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py @@ -84,7 +84,7 @@ def ScoreBandsDrift( # Create band labels band_labels = [ - f"{score_bands[i]}-{score_bands[i+1]}" for i in range(len(score_bands) - 1) + f"{score_bands[i]}-{score_bands[i + 1]}" for i in range(len(score_bands) - 1) ] band_labels.insert(0, f"<{score_bands[0]}") band_labels.append(f">{score_bands[-1]}") diff --git a/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py index 09d18f998..9dd0979c9 100644 --- a/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +++ b/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py @@ -209,7 +209,7 @@ def ScorecardHistogramDrift( # Add separate legend for each subplot fig.update_layout( **{ - f'legend{i+1 if i > 0 else ""}': dict( + f'legend{i + 1 if i > 0 else ""}': dict( yanchor="middle", y=1 - (i / len(classes)) - (0.5 / len(classes)), xanchor="left", diff --git a/validmind/tests/plots/BoxPlot.py b/validmind/tests/plots/BoxPlot.py index 7c2861ef4..9074237d8 100644 --- a/validmind/tests/plots/BoxPlot.py +++ b/validmind/tests/plots/BoxPlot.py @@ -149,8 +149,8 @@ def _create_multiple_boxplots( text=f"No data available
for {column}", x=0.5, y=0.5, - xref=f"x{idx+1} domain" if idx > 0 else "x domain", - yref=f"y{idx+1} domain" if idx > 0 else "y domain", + xref=f"x{idx + 1} domain" if idx > 0 else "x domain", + yref=f"y{idx + 1} domain" if idx > 0 else "y domain", showarrow=False, row=row, col=col, diff --git a/validmind/tests/plots/HistogramPlot.py b/validmind/tests/plots/HistogramPlot.py index b5fbbaf35..bbec6a1aa 100644 --- a/validmind/tests/plots/HistogramPlot.py +++ b/validmind/tests/plots/HistogramPlot.py @@ -95,8 +95,8 @@ def _add_stats_annotation(fig, data, idx, row, col): text=stats_text, x=0.02, y=0.98, - xref=f"x{idx+1} domain" if idx > 0 else "x domain", - yref=f"y{idx+1} domain" if idx > 0 else "y domain", + xref=f"x{idx + 1} domain" if idx > 0 else "x domain", + yref=f"y{idx + 1} domain" if idx > 0 else "y domain", showarrow=False, align="left", bgcolor="rgba(255,255,255,0.8)", @@ -195,8 +195,8 @@ def HistogramPlot( text=f"No data available
for {column}", x=0.5, y=0.5, - xref=f"x{idx+1}" if idx > 0 else "x", - yref=f"y{idx+1}" if idx > 0 else "y", + xref=f"x{idx + 1}" if idx > 0 else "x", + yref=f"y{idx + 1}" if idx > 0 else "y", showarrow=False, row=row, col=col, diff --git a/validmind/tests/stats/DescriptiveStats.py b/validmind/tests/stats/DescriptiveStats.py index a36e61536..983080df2 100644 --- a/validmind/tests/stats/DescriptiveStats.py +++ b/validmind/tests/stats/DescriptiveStats.py @@ -96,8 +96,8 @@ def _compute_advanced_stats(column: str, data, confidence_level: float): "Skewness": skewness, "Kurtosis": kurtosis_val, "CV %": cv, - f"CI Lower ({confidence_level*100:.0f}%)": ci_lower, - f"CI Upper ({confidence_level*100:.0f}%)": ci_upper, + f"CI Lower ({confidence_level * 100:.0f}%)": ci_lower, + f"CI Upper ({confidence_level * 100:.0f}%)": ci_upper, "Normality Test": normality_test, "Normality Stat": normality_stat, "Normality p-value": normality_p,