feat: warn user on inconsistent dtypes (#128)

lukaszkolodziejczyk · web-flow · commit 13f86166d1de · 2025-03-25T15:21:37.000Z
diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py
@@ -28,6 +28,8 @@
 import logging
 import random
 import time
+from typing import Any
+from pandas.core.dtypes.common import is_numeric_dtype, is_datetime64_dtype
 
 import numpy as np
 import pandas as pd
@@ -56,6 +58,7 @@ def pull_data_for_accuracy(
     tgt_context_key: str | None = None,
     max_sample_size: int | None = None,
     setup: str | None = None,
+    trn_dtypes: dict[str, str] | None = None,
 ) -> pd.DataFrame:
     """
     Prepare single dataset for accuracy report.
@@ -130,6 +133,14 @@ def pull_data_for_accuracy(
     # harmonize dtypes
     df = df.apply(harmonize_dtype)
 
+    # coerce dtypes to trn_dtypes
+    for trn_col, trn_dtype in (trn_dtypes or {}).items():
+        if is_numeric_dtype(trn_dtype):
+            df[trn_col] = pd.to_numeric(df[trn_col], errors="coerce")
+        elif is_datetime64_dtype(trn_dtype):
+            df[trn_col] = pd.to_datetime(df[trn_col], errors="coerce")
+        df[trn_col] = df[trn_col].astype(trn_dtype)
+
     # sample tokens from text-like columns
     df = sample_text_tokens(df)
 
@@ -303,10 +314,10 @@ def calculate_embeddings(
 def sample_text_tokens(df: pd.DataFrame) -> pd.DataFrame:
     tokenizer = load_tokenizer()
 
-    def tokenize_and_sample(text: str | None) -> str | None:
+    def tokenize_and_sample(text: Any) -> str | None:
         if pd.isna(text) or text == "":
             return None
-        tokens = tokenizer.tokenize(text)
+        tokens = tokenizer.tokenize(str(text))
         tokens = (t.replace("Ġ", "▁") for t in tokens)  # replace initial space with thick underscore
         return random.choice(list(tokens))
 
@@ -337,7 +348,7 @@ def is_timestamp_dtype(x: pd.Series) -> bool:
         else:
             x = x.astype("object")
     except Exception:
-        # leave dtype as-is, but just log a warning message
+        # leave dtype as-is
         pass
     return x
 
diff --git a/mostlyai/qa/reporting.py b/mostlyai/qa/reporting.py
@@ -148,6 +148,10 @@ def report(
         if hol_ctx_data is not None and trn_ctx_data is not None:
             hol_ctx_data = hol_ctx_data[trn_ctx_data.columns]
 
+        # warn if dtypes are inconsistent across datasets
+        _warn_if_dtypes_inconsistent(syn_tgt_data, trn_tgt_data, hol_tgt_data)
+        _warn_if_dtypes_inconsistent(syn_ctx_data, trn_ctx_data, hol_ctx_data)
+
         # prepare report_path
         if report_path is None:
             report_path = Path.cwd() / "model-report.html"
@@ -200,36 +204,29 @@ def report(
         else:
             setup = "1:1"
 
-        _LOG.info("prepare synthetic data for accuracy started")
-        syn = pull_data_for_accuracy(
-            df_tgt=syn_tgt_data,
-            df_ctx=syn_ctx_data,
+        _LOG.info("prepare training data for accuracy started")
+        trn = pull_data_for_accuracy(
+            df_tgt=trn_tgt_data,
+            df_ctx=trn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_accuracy,
             setup=setup,
         )
         progress.update(completed=5, total=100)
 
-        _LOG.info("prepare training data for accuracy started")
-        trn = pull_data_for_accuracy(
-            df_tgt=trn_tgt_data,
-            df_ctx=trn_ctx_data,
+        _LOG.info("prepare synthetic data for accuracy started")
+        syn = pull_data_for_accuracy(
+            df_tgt=syn_tgt_data,
+            df_ctx=syn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_accuracy,
             setup=setup,
+            trn_dtypes=trn.dtypes.to_dict(),
         )
         progress.update(completed=10, total=100)
 
-        # coerce dtypes to match the original training data dtypes
-        for col in trn:
-            if is_numeric_dtype(trn[col]):
-                syn[col] = pd.to_numeric(syn[col], errors="coerce")
-            elif is_datetime64_dtype(trn[col]):
-                syn[col] = pd.to_datetime(syn[col], errors="coerce")
-            syn[col] = syn[col].astype(trn[col].dtype)
-
         _LOG.info("report accuracy and correlations")
         acc_uni, acc_biv, corr_trn = _report_accuracy_and_correlations(
             trn=trn,
@@ -396,6 +393,39 @@ def report(
         return report_path, metrics
 
 
+def _warn_if_dtypes_inconsistent(syn_df: pd.DataFrame | None, trn_df: pd.DataFrame | None, hol_df: pd.DataFrame | None):
+    """
+    Warn if any column shared by the provided datasets mixes dtype families.
+
+    Each dtype is classified as datetime (`is_datetime64_dtype`), numeric (`is_numeric_dtype`) or other.
+    A column is reported when the datasets that were passed in do not all fall into the same family.
+    Datasets passed as `None` are ignored. A single `UserWarning` listing all affected columns is emitted.
+    """
+    dfs = [df for df in (syn_df, trn_df, hol_df) if df is not None]
+    if not dfs:
+        return
+    # walk the first dataset's columns rather than a set intersection, so that the order of
+    # the columns named in the warning is deterministic
+    common_columns = [col for col in dfs[0].columns if all(col in df.columns for df in dfs[1:])]
+    inconsistent_columns = []
+    for col in common_columns:
+        dtypes = [df[col].dtype for df in dfs]
+        any_datetimes = any(is_datetime64_dtype(dtype) for dtype in dtypes)
+        any_numbers = any(is_numeric_dtype(dtype) for dtype in dtypes)
+        any_others = any(not is_datetime64_dtype(dtype) and not is_numeric_dtype(dtype) for dtype in dtypes)
+        # more than one family present -> the datasets disagree on this column
+        if sum([any_datetimes, any_numbers, any_others]) > 1:
+            inconsistent_columns.append(col)
+    if inconsistent_columns:
+        warnings.warn(
+            UserWarning(
+                f"The column(s) {inconsistent_columns} have inconsistent data types across `syn`, `trn`, and `hol`. "
+                "To achieve the most accurate results, please harmonize the data types of these inputs. "
+                "Proceeding with a best-effort attempt..."
+            )
+        )
+
+
 def _calculate_metrics(
     *,
     acc_uni: pd.DataFrame,
diff --git a/tests/end_to_end/test_report.py b/tests/end_to_end/test_report.py
@@ -14,11 +14,13 @@
 
 import uuid
 from pathlib import Path
+import warnings
 
 import pandas as pd
 import numpy as np
 
 from mostlyai import qa
+from datetime import datetime, timedelta
 
 
 def mock_data(n):
@@ -278,3 +280,42 @@ def test_missing(tmp_path):
         trn_tgt_data=df1,
     )
     assert metrics is not None
+
+
+def test_mixed_dtypes(tmp_path):
+    # a datetime column drawn from the same distribution but stored with different dtypes
+    # (strings in trn, datetime64 in syn) should still score reasonably well, and a warning should be issued
+
+    def generate_dates(start_date, end_date, num_samples):
+        days_range = (end_date - start_date).days
+        return [start_date + timedelta(days=int(days)) for days in np.random.randint(0, days_range, num_samples)]
+
+    num_samples = 200
+    start_date = datetime(2020, 1, 1)
+    end_date = datetime(2023, 12, 31)
+    df = pd.DataFrame(
+        {
+            "trn_dt": pd.Series(generate_dates(start_date, end_date, num_samples)).values.astype(str),
+            "syn_dt": pd.Series(generate_dates(start_date, end_date, num_samples), dtype="datetime64[ns]"),
+        }
+    )
+    trn_df, syn_df = df["trn_dt"].to_frame("dt"), df["syn_dt"].to_frame("dt")
+
+    with warnings.catch_warnings(record=True) as w:
+        # record every warning regardless of the filters active in the test session
+        warnings.simplefilter("always")
+        _, statistics = qa.report(
+            syn_tgt_data=syn_df,
+            trn_tgt_data=trn_df,
+            report_path=tmp_path / "report.html",
+        )
+        expected_warning = (
+            "The column(s) ['dt'] have inconsistent data types across `syn`, `trn`, and `hol`. "
+            "To achieve the most accurate results, please harmonize the data types of these inputs. "
+            "Proceeding with a best-effort attempt..."
+        )
+        assert any(expected_warning in str(warning.message) for warning in w), (
+            "Expected a warning about dtype mismatch for column 'dt'"
+        )
+    assert statistics.accuracy.overall > 0.6
+    assert 0.4 < statistics.similarity.discriminator_auc_training_synthetic < 0.6