357 changes: 26 additions & 331 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -56,7 +56,6 @@ torch = {version = ">=1.10.0", optional = true}
tqdm = "*"
transformers = {version = "^4.32.0", optional = true}
xgboost = ">=1.5.2,<3"
ydata-profiling = "*"
yfinance = "^0.2.48"

[tool.poetry.group.dev.dependencies]
14 changes: 7 additions & 7 deletions tests/unit_tests/data_validation/test_DatasetDescription.py
@@ -22,13 +22,13 @@ def setUp(self):
[True, False, True, True, False, True, False], dtype=bool
), # Explicitly boolean
"text": [
"hello",
"world",
"hello@gmail.com",
"this is a longer text",
"hello world",
"test",
"hello",
"test",
"world",
"this is a longer text",
"this is a longer text",
"another example of text",
"this is a longer text",
], # Text
"all_null": [
None,
@@ -129,7 +129,7 @@ def test_column_types_and_stats(self):

# Check text column
self.assertEqual(column_info["text"]["Type"], "Text")
self.assertEqual(column_info["text"]["Distinct"], 4) # 4 unique strings
self.assertEqual(column_info["text"]["Distinct"],4) # 4 unique strings
self.assertEqual(column_info["text"]["Missing"], 0) # No missing values
self.assertEqual(column_info["text"]["Count"], 7) # All present

25 changes: 2 additions & 23 deletions validmind/tests/data_validation/DatasetDescription.py
@@ -6,12 +6,10 @@
from collections import Counter

import numpy as np
from ydata_profiling.config import Settings
from ydata_profiling.model.typeset import ProfilingTypeSet

from validmind import RawData, tags, tasks
from validmind.errors import UnsupportedColumnTypeError
from validmind.logging import get_logger
from validmind.utils import infer_datatypes
from validmind.vm_models import VMDataset

DEFAULT_HISTOGRAM_BINS = 10
@@ -20,25 +18,6 @@
logger = get_logger(__name__)


def infer_datatypes(df):
column_type_mappings = {}
typeset = ProfilingTypeSet(Settings())
variable_types = typeset.infer_type(df)

for column, type in variable_types.items():
if str(type) == "Unsupported":
if df[column].isnull().all():
column_type_mappings[column] = {"id": column, "type": "Null"}
else:
raise UnsupportedColumnTypeError(
f"Unsupported type for column {column}. Please review all values in this dataset column."
)
else:
column_type_mappings[column] = {"id": column, "type": str(type)}

return list(column_type_mappings.values())


def get_numerical_histograms(df, column):
"""
Returns a collection of histograms for a numerical column, each one
Expand All @@ -50,7 +29,7 @@ def get_numerical_histograms(df, column):
# bins='sturges'. Cannot use 'auto' until we review and fix its performance
# on datasets with too many unique values
#
# 'sturges': Rs default method, only accounts for data size. Only optimal
# 'sturges': R's default method, only accounts for data size. Only optimal
# for gaussian data and underestimates number of bins for large non-gaussian datasets.
default_hist = np.histogram(values_cleaned, bins="sturges")

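As a quick illustration of the trade-off noted in the comment above: Sturges' rule derives the bin count from sample size alone (roughly ceil(log2(n) + 1)), which is reasonable for gaussian-like data but coarse for large non-gaussian samples. A minimal sketch with synthetic data:

```python
import numpy as np

rng = np.random.default_rng(42)
gaussian = rng.normal(size=1_000)
skewed = rng.exponential(size=100_000)

# Sturges picks ~ceil(log2(n) + 1) bins regardless of the data's shape
print(len(np.histogram(gaussian, bins="sturges")[0]))  # 11 bins for n=1e3
print(len(np.histogram(skewed, bins="sturges")[0]))    # 18 bins for n=1e5
```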
13 changes: 7 additions & 6 deletions validmind/tests/data_validation/Skewness.py
@@ -2,10 +2,8 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from ydata_profiling.config import Settings
from ydata_profiling.model.typeset import ProfilingTypeSet

from validmind import tags, tasks
from validmind.utils import infer_datatypes


@tags("data_quality", "tabular_data")
@@ -49,16 +47,19 @@ def Skewness(dataset, max_threshold=1):
- Subjective threshold for risk grading, requiring expert input and recurrent iterations for refinement.
"""

typeset = ProfilingTypeSet(Settings())
dataset_types = typeset.infer_type(dataset.df)
# Use the imported infer_datatypes function
dataset_types = infer_datatypes(dataset.df)

# Convert the list of dictionaries to a dictionary for easy access
dataset_types_dict = {item["id"]: item["type"] for item in dataset_types}

skewness = dataset.df.skew(numeric_only=True)

results_table = []
passed = True

for col in skewness.index:
if str(dataset_types[col]) != "Numeric":
if dataset_types_dict.get(col) != "Numeric":
continue

col_skewness = skewness[col]
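To make the shape change above concrete: infer_datatypes returns a list of {"id", "type"} mappings, and the new dict comprehension flattens that into a {column: type} lookup so the skewness loop can skip non-numeric columns. A small sketch with hypothetical column names:

```python
# infer_datatypes returns entries shaped like these (columns are made up)
dataset_types = [
    {"id": "age", "type": "Numeric"},
    {"id": "comment", "type": "Text"},
]

# Flatten to {column: type} for O(1) lookups in the loop above
dataset_types_dict = {item["id"]: item["type"] for item in dataset_types}
assert dataset_types_dict.get("age") == "Numeric"
assert dataset_types_dict.get("comment") != "Numeric"
```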
189 changes: 189 additions & 0 deletions validmind/utils.py
@@ -601,3 +601,192 @@ def serialize(obj):
elif isinstance(obj, (pd.DataFrame, pd.Series)):
return "" # Simple empty string for non-serializable objects
return obj


def is_text_column(series, threshold=0.05):
"""
Determines if a series is likely to contain text data using heuristics.

Args:
series (pd.Series): The pandas Series to analyze
threshold (float): The minimum threshold to classify a pattern match as significant

Returns:
bool: True if the series likely contains text data, False otherwise
"""
# Filter to non-null string values and sample if needed
string_series = series.dropna().astype(str)
if len(string_series) == 0:
return False
if len(string_series) > 1000:
string_series = string_series.sample(1000, random_state=42)

# Calculate basic metrics
total_values = len(string_series)
unique_ratio = len(string_series.unique()) / total_values if total_values > 0 else 0
avg_length = string_series.str.len().mean()
avg_words = string_series.str.split(r"\s+").str.len().mean()

# Check for special text patterns
patterns = {
"url": r"https?://\S+|www\.\S+",
"email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
"filepath": r'([a-zA-Z]:|[\\/])([\\/][^\\/:*?"<>|]+)+',
}

# Check if any special patterns exceed threshold
for pattern in patterns.values():
if string_series.str.contains(pattern, regex=True, na=False).mean() > threshold:
return True

# Calculate proportion of alphabetic characters
total_chars = string_series.str.len().sum()
if total_chars > 0:
alpha_ratio = string_series.str.count(r"[a-zA-Z]").sum() / total_chars
else:
alpha_ratio = 0

# Check for free-form text indicators
text_indicators = [
unique_ratio > 0.8 and avg_length > 20, # High uniqueness and long strings
unique_ratio > 0.4
and avg_length > 15
and string_series.str.contains(r"[.,;:!?]", regex=True, na=False).mean()
> 0.3, # Moderate uniqueness with punctuation
string_series.str.contains(
r"\b\w+\b\s+\b\w+\b\s+\b\w+\b\s+\b\w+\b", regex=True, na=False
).mean()
> 0.3, # Contains long phrases
avg_words > 5 and alpha_ratio > 0.6, # Many words with mostly letters
unique_ratio > 0.95 and avg_length > 10, # Very high uniqueness
]

return any(text_indicators)
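
A brief usage sketch of the heuristic (synthetic series; assumes pandas is imported as pd, as elsewhere in this module):

```python
import pandas as pd

free_text = pd.Series([
    "The model failed to converge after ten epochs.",
    "Please review the attached validation report carefully.",
    "Results look reasonable, but the holdout sample is small.",
])
codes = pd.Series(["A", "B", "A", "C", "B", "A"])

print(is_text_column(free_text))  # True: unique, long, multi-word strings
print(is_text_column(codes))      # False: short, repetitive codes
```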


def _get_numeric_type_detail(column, dtype, series):
"""Helper function to determine numeric type details."""
if pd.api.types.is_integer_dtype(dtype):
return {"type": "Numeric", "subtype": "Integer"}
elif pd.api.types.is_float_dtype(dtype):
return {"type": "Numeric", "subtype": "Float"}
else:
return {"type": "Numeric", "subtype": "Other"}


def _get_text_type_detail(series):
"""Helper function to determine text/categorical type details."""
string_series = series.dropna().astype(str)

if len(string_series) == 0:
return {"type": "Categorical"}

# Check for common patterns
url_pattern = r"https?://\S+|www\.\S+"
email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
filepath_pattern = r'([a-zA-Z]:|[\\/])([\\/][^\\/:*?"<>|]+)+'

url_ratio = string_series.str.contains(url_pattern, regex=True, na=False).mean()
email_ratio = string_series.str.contains(email_pattern, regex=True, na=False).mean()
filepath_ratio = string_series.str.contains(
filepath_pattern, regex=True, na=False
).mean()

# Check if general text using enhanced function
if url_ratio > 0.7:
return {"type": "Text", "subtype": "URL"}
elif email_ratio > 0.7:
return {"type": "Text", "subtype": "Email"}
elif filepath_ratio > 0.7:
return {"type": "Text", "subtype": "Path"}
elif is_text_column(series):
return {"type": "Text", "subtype": "FreeText"}

# Must be categorical
n_unique = series.nunique()
if n_unique == 2:
return {"type": "Categorical", "subtype": "Binary"}
else:
return {"type": "Categorical", "subtype": "Nominal"}


def get_column_type_detail(df, column):
"""
Get detailed column type information beyond basic type detection.
Similar to ydata-profiling's type system.

Args:
df (pd.DataFrame): DataFrame containing the column
column (str): Column name to analyze

Returns:
dict: Detailed type information including primary type and subtype
"""
series = df[column]
dtype = series.dtype

# Initialize result with id and basic type
result = {"id": column, "type": "Unknown"}

# Determine type details based on dtype
type_detail = None

    # Check bool before numeric: pandas treats bool dtypes as numeric, so
    # is_numeric_dtype would otherwise shadow the Boolean branch
    if pd.api.types.is_bool_dtype(dtype):
        type_detail = {"type": "Boolean"}
    elif pd.api.types.is_numeric_dtype(dtype):
        type_detail = _get_numeric_type_detail(column, dtype, series)
elif pd.api.types.is_datetime64_any_dtype(dtype):
type_detail = {"type": "Datetime"}
elif pd.api.types.is_categorical_dtype(dtype) or pd.api.types.is_object_dtype(
dtype
):
type_detail = _get_text_type_detail(series)

# Update result with type details
if type_detail:
result.update(type_detail)

return result
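
A hedged usage sketch of the detailed path (synthetic frame; expected results shown as comments):

```python
import pandas as pd

df = pd.DataFrame({
    "score": [0.1, 0.5, 0.9],
    "email": ["a@example.com", "b@example.org", "c@example.net"],
    "status": ["open", "closed", "open"],
})

print(get_column_type_detail(df, "score"))
# {'id': 'score', 'type': 'Numeric', 'subtype': 'Float'}
print(get_column_type_detail(df, "email"))
# {'id': 'email', 'type': 'Text', 'subtype': 'Email'}
print(get_column_type_detail(df, "status"))
# {'id': 'status', 'type': 'Categorical', 'subtype': 'Binary'}
```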


def infer_datatypes(df, detailed=False):
"""
Infer data types for columns in a DataFrame.

Args:
df (pd.DataFrame): DataFrame to analyze
detailed (bool): Whether to return detailed type information including subtypes

Returns:
list: Column type mappings
"""
if detailed:
return [get_column_type_detail(df, column) for column in df.columns]

column_type_mappings = {}
# Use pandas to infer data types
for column in df.columns:
# Check if all values are None
if df[column].isna().all():
column_type_mappings[column] = {"id": column, "type": "Null"}
continue

dtype = df[column].dtype
        # Check bool before numeric: bool dtypes also satisfy is_numeric_dtype
        if pd.api.types.is_bool_dtype(dtype):
            column_type_mappings[column] = {"id": column, "type": "Boolean"}
        elif pd.api.types.is_numeric_dtype(dtype):
            column_type_mappings[column] = {"id": column, "type": "Numeric"}
elif pd.api.types.is_datetime64_any_dtype(dtype):
column_type_mappings[column] = {"id": column, "type": "Datetime"}
elif pd.api.types.is_categorical_dtype(dtype) or pd.api.types.is_object_dtype(
dtype
):
# Check if this is more likely to be text than categorical
if is_text_column(df[column]):
column_type_mappings[column] = {"id": column, "type": "Text"}
else:
column_type_mappings[column] = {"id": column, "type": "Categorical"}
else:
column_type_mappings[column] = {"id": column, "type": "Unsupported"}

return list(column_type_mappings.values())
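
And an end-to-end sketch of the default (non-detailed) path on a synthetic frame:

```python
import pandas as pd

df = pd.DataFrame({
    "age": [25, 32, 47],
    "active": [True, False, True],
    "joined": pd.to_datetime(["2021-01-01", "2021-06-15", "2022-03-09"]),
    "notes": [
        "Customer requested a follow-up call next week.",
        "Account flagged for manual review by compliance.",
        "No outstanding issues reported this quarter.",
    ],
})

print(infer_datatypes(df))
# [{'id': 'age', 'type': 'Numeric'},
#  {'id': 'active', 'type': 'Boolean'},
#  {'id': 'joined', 'type': 'Datetime'},
#  {'id': 'notes', 'type': 'Text'}]
```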