Commit 11560d2
feat: switch to custom encoding space for distance metrics (#185)
1 parent 3cdd4cd commit 11560d2

11 files changed · +299 -130 lines changed

mostlyai/qa/_distances.py

Lines changed: 144 additions & 33 deletions
@@ -17,50 +17,164 @@
 import time

 import numpy as np
+import pandas as pd
+from sklearn.preprocessing import QuantileTransformer

 from mostlyai.qa._common import (
     CHARTS_COLORS,
     CHARTS_FONTS,
+    EMPTY_BIN,
+    NA_BIN,
+    RARE_BIN,
 )
 from mostlyai.qa._filesystem import TemporaryWorkspace
 from plotly import graph_objs as go

+from mostlyai.qa.assets import load_embedder
+from sklearn.decomposition import PCA
+
 _LOG = logging.getLogger(__name__)


+def encode_numerics(
+    syn: pd.DataFrame, trn: pd.DataFrame, hol: pd.DataFrame | None = None
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None]:
+    """
+    Encode numeric features by mapping them via a QuantileTransformer to a uniform distribution on [-0.5, 0.5].
+    """
+    syn_num, trn_num, hol_num = {}, {}, {}
+    if hol is None:
+        hol = pd.DataFrame(columns=trn.columns)
+    for col in trn.columns:
+        # convert to numerics
+        syn_num[col] = pd.to_numeric(syn[col], errors="coerce")
+        trn_num[col] = pd.to_numeric(trn[col], errors="coerce")
+        hol_num[col] = pd.to_numeric(hol[col], errors="coerce")
+        # retain NAs (needed for datetime)
+        syn_num[col] = syn_num[col].where(~syn[col].isna(), np.nan)
+        trn_num[col] = trn_num[col].where(~trn[col].isna(), np.nan)
+        hol_num[col] = hol_num[col].where(~hol[col].isna(), np.nan)
+        # normalize numeric features based on trn
+        qt_scaler = QuantileTransformer(
+            output_distribution="uniform",
+            random_state=42,
+            n_quantiles=min(100, len(trn) + len(hol)),
+        )
+        ori_num = pd.concat([trn_num[col], hol_num[col]]) if len(hol) > 0 else pd.DataFrame(trn_num[col])
+        qt_scaler.fit(ori_num.values.reshape(-1, 1))
+        syn_num[col] = qt_scaler.transform(syn_num[col].values.reshape(-1, 1))[:, 0] - 0.5
+        trn_num[col] = qt_scaler.transform(trn_num[col].values.reshape(-1, 1))[:, 0] - 0.5
+        hol_num[col] = qt_scaler.transform(hol_num[col].values.reshape(-1, 1))[:, 0] - 0.5 if len(hol) > 0 else None
+        # replace NAs with 0.0
+        syn_num[col] = np.nan_to_num(syn_num[col], nan=0.0)
+        trn_num[col] = np.nan_to_num(trn_num[col], nan=0.0)
+        hol_num[col] = np.nan_to_num(hol_num[col], nan=0.0)
+        # add extra columns for NAs
+        if trn[col].isna().any() or hol[col].isna().any():
+            syn_num[col + " - N/A"] = syn[col].isna().astype(float)
+            trn_num[col + " - N/A"] = trn[col].isna().astype(float)
+            hol_num[col + " - N/A"] = hol[col].isna().astype(float)
+    syn_num = pd.DataFrame(syn_num, index=syn.index)
+    trn_num = pd.DataFrame(trn_num, index=trn.index)
+    hol_num = pd.DataFrame(hol_num, index=hol.index) if len(hol) > 0 else None
+    return syn_num, trn_num, hol_num
+
+
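For intuition, here is a minimal sketch (made-up data, not part of the commit) of the quantile trick used by encode_numerics: rank-transform a column to uniform [0, 1] via its empirical CDF, then shift down by 0.5, so every numeric column lands on the same [-0.5, 0.5] scale regardless of units or outliers.

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

trn = pd.Series([1.0, 2.0, 3.0, 4.0, 100.0])  # heavy-tailed training column
qt = QuantileTransformer(output_distribution="uniform", random_state=42, n_quantiles=5)
qt.fit(trn.values.reshape(-1, 1))
encoded = qt.transform(trn.values.reshape(-1, 1))[:, 0] - 0.5
print(encoded)  # [-0.5, -0.25, 0.0, 0.25, 0.5] -- the outlier 100 is tamed to 0.5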
+def encode_strings(
+    syn: pd.DataFrame, trn: pd.DataFrame, hol: pd.DataFrame | None = None
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None]:
+    """
+    Encode string features by mapping them to a low-dimensional space using PCA of their embeddings.
+    """
+    trn_str, syn_str, hol_str = {}, {}, {}
+    if hol is None:
+        hol = pd.DataFrame(columns=trn.columns)
+    for col in trn.columns:
+        # prepare inputs
+        syn_col = syn[col].astype(str).fillna(NA_BIN).replace("", EMPTY_BIN)
+        trn_col = trn[col].astype(str).fillna(NA_BIN).replace("", EMPTY_BIN)
+        hol_col = hol[col].astype(str).fillna(NA_BIN).replace("", EMPTY_BIN)
+        # get unique original values
+        uvals = pd.concat([trn_col, hol_col]).value_counts().index.to_list()
+        # map out of range values to RARE_BIN
+        syn_col = syn_col.where(syn_col.isin(uvals), RARE_BIN)
+        # embed unique values into high-dimensional space
+        embedder = load_embedder()
+        embeds = embedder.encode(uvals + [RARE_BIN])
+        # project embeddings into a low-dimensional space
+        dims = 2  # potentially adapt to the number of unique values
+        pca_model = PCA(n_components=dims)
+        embeds = pca_model.fit_transform(embeds)
+        # create mapping from unique values to PCA
+        embeds = pd.DataFrame(embeds)
+        embeds.index = uvals + [RARE_BIN]
+        # map values to PCA
+        syn_str[col] = embeds.reindex(syn_col.values).reset_index(drop=True)
+        trn_str[col] = embeds.reindex(trn_col.values).reset_index(drop=True)
+        hol_str[col] = embeds.reindex(hol_col.values).reset_index(drop=True)
+        # assign column names
+        columns = [f"{col} - PCA {i + 1}" for i in range(dims)]
+        syn_str[col].columns = columns
+        trn_str[col].columns = columns
+        hol_str[col].columns = columns
+    syn_str = pd.concat(syn_str.values(), axis=1) if syn_str else pd.DataFrame()
+    syn_str.index = syn.index
+    trn_str = pd.concat(trn_str.values(), axis=1) if trn_str else pd.DataFrame()
+    trn_str.index = trn.index
+    if len(hol) > 0:
+        hol_str = pd.concat(hol_str.values(), axis=1) if hol_str else pd.DataFrame()
+        hol_str.index = hol.index
+    else:
+        hol_str = None
+    return syn_str, trn_str, hol_str
+
+
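A rough sketch of the string path in isolation. load_embedder is internal to mostlyai.qa and not shown in this diff; for illustration, any sentence-embedding model with an encode(list[str]) -> np.ndarray method would do (a sentence-transformers model is assumed here), and "_RARE_" stands in for the actual RARE_BIN constant.

import pandas as pd
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer  # assumed stand-in for load_embedder()

uvals = ["red", "green", "blue"]                    # unique training/holdout values
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedder choice
embeds = PCA(n_components=2).fit_transform(embedder.encode(uvals + ["_RARE_"]))
lookup = pd.DataFrame(embeds, index=uvals + ["_RARE_"])        # value -> 2-dim code
syn_col = pd.Series(["red", "blue", "purple"])
syn_col = syn_col.where(syn_col.isin(uvals), "_RARE_")         # unseen "purple" -> rare bucket
codes = lookup.reindex(syn_col.values).reset_index(drop=True)  # one 2-dim row per record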
+def encode_data(
+    syn: pd.DataFrame, trn: pd.DataFrame, hol: pd.DataFrame | None = None
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None]:
+    """
+    Encode all columns according to their data type.
+    """
+    num_dat_cols = trn.select_dtypes(include=["number", "datetime"]).columns
+    string_cols = [col for col in trn.columns if col not in num_dat_cols]
+    syn_num, trn_num, hol_num = encode_numerics(
+        syn[num_dat_cols], trn[num_dat_cols], hol[num_dat_cols] if hol is not None else None
+    )
+    syn_str, trn_str, hol_str = encode_strings(
+        syn[string_cols], trn[string_cols], hol[string_cols] if hol is not None else None
+    )
+    syn_encoded = pd.concat([syn_num, syn_str], axis=1)
+    trn_encoded = pd.concat([trn_num, trn_str], axis=1)
+    hol_encoded = pd.concat([hol_num, hol_str], axis=1) if hol is not None else None
+    return syn_encoded, trn_encoded, hol_encoded
+
+
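Assuming the functions above are importable, usage would look roughly like this: numeric and datetime columns go through the quantile path, everything else through the embedding/PCA path, and the pieces are concatenated column-wise.

import pandas as pd

trn = pd.DataFrame({"age": [25, 31, 58], "city": ["Vienna", "Graz", "Linz"]})
syn = pd.DataFrame({"age": [27, 30, 61], "city": ["Vienna", "Linz", "Linz"]})
syn_enc, trn_enc, hol_enc = encode_data(syn, trn, hol=None)
print(trn_enc.columns.tolist())  # e.g. ['age', 'city - PCA 1', 'city - PCA 2']
print(hol_enc)                   # None -- no holdout was provided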
 def calculate_dcrs_nndrs(
     data: np.ndarray | None, query: np.ndarray | None
 ) -> tuple[np.ndarray | None, np.ndarray | None]:
     """
     Calculate Distance to Closest Records (DCRs) and Nearest Neighbor Distance Ratios (NNDRs).
-
-    Args:
-        data: Embeddings of the training data.
-        query: Embeddings of the query set.
-
-    Returns:
     """
-    if data is None or query is None:
+    if data is None or query is None or data.shape[0] == 0 or query.shape[0] == 0:
         return None, None
     _LOG.info(f"calculate DCRs for {data.shape=} and {query.shape=}")
     t0 = time.time()
     data = data[data[:, 0].argsort()]  # sort data by first dimension to enforce deterministic results
+
     if platform.system() == "Linux":
         # use FAISS on Linux for best performance
         import faiss  # type: ignore

-        index = faiss.IndexFlatIP(data.shape[1])  # inner product for cosine similarity with normalized vectors
+        index = faiss.IndexFlatL2(data.shape[1])
         index.add(data)
-        similarities, _ = index.search(query, 2)
-        dcrs = np.clip(1 - similarities, 0, 1)
+        dcrs, _ = index.search(query, 2)
+        dcrs = np.sqrt(dcrs)  # FAISS returns squared distances
     else:
         # use sklearn as a fallback on non-Linux systems to avoid segfaults; these occurred when using QA as part of SDK
         from sklearn.neighbors import NearestNeighbors  # type: ignore
         from joblib import cpu_count  # type: ignore

-        index = NearestNeighbors(
-            n_neighbors=2, algorithm="auto", metric="cosine", n_jobs=min(16, max(1, cpu_count() - 1))
-        )
+        index = NearestNeighbors(n_neighbors=2, algorithm="auto", metric="l2", n_jobs=min(16, max(1, cpu_count() - 1)))
         index.fit(data)
         dcrs, _ = index.kneighbors(query)
     dcr = dcrs[:, 0]
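One detail worth flagging in the switch from IndexFlatIP to IndexFlatL2: FAISS's flat L2 index reports squared distances, which is why the new code applies np.sqrt before deriving the DCR (nearest distance) and, presumably, the NNDR (ratio of nearest to second-nearest distance, computed just after this hunk). A tiny sketch:

import faiss
import numpy as np

data = np.array([[0.0, 0.0], [3.0, 4.0]], dtype="float32")
query = np.array([[0.0, 0.0]], dtype="float32")
index = faiss.IndexFlatL2(data.shape[1])
index.add(data)
d2, _ = index.search(query, 2)  # two nearest neighbors per query row
print(d2)                       # [[ 0. 25.]] -- squared L2 distances
print(np.sqrt(d2))              # [[0. 5.]]   -- DCR = 0.0; NNDR = 0.0 / 5.0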
@@ -70,34 +184,31 @@ def calculate_dcrs_nndrs(


 def calculate_distances(
-    *, syn_embeds: np.ndarray, trn_embeds: np.ndarray, hol_embeds: np.ndarray | None
+    *, syn_encoded: np.ndarray, trn_encoded: np.ndarray, hol_encoded: np.ndarray | None
 ) -> dict[str, np.ndarray]:
     """
     Calculates distances to the closest records (DCR).
-
-    Args:
-        syn_embeds: Embeddings of synthetic data.
-        trn_embeds: Embeddings of training data.
-        hol_embeds: Embeddings of holdout data.
-
-    Returns:
-        Dictionary containing:
-        - dcr_syn_trn: DCR for synthetic to training.
-        - dcr_syn_hol: DCR for synthetic to holdout.
-        - dcr_trn_hol: DCR for training to holdout.
-        - nndr_syn_trn: NNDR for synthetic to training.
-        - nndr_syn_hol: NNDR for synthetic to holdout.
-        - nndr_trn_hol: NNDR for training to holdout.
     """
-    if hol_embeds is not None:
-        assert trn_embeds.shape == hol_embeds.shape
+    assert syn_encoded.shape == trn_encoded.shape
+    if hol_encoded is not None and hol_encoded.shape[0] > 0:
+        assert trn_encoded.shape == hol_encoded.shape
+
+    # cap dimensionality of encoded data
+    max_dims = 256
+    if trn_encoded.shape[1] > max_dims:
+        _LOG.info(f"capping dimensionality of encoded data from {trn_encoded.shape[1]} to {max_dims}")
+        pca_model = PCA(n_components=max_dims)
+        pca_model.fit(np.vstack((trn_encoded, hol_encoded)))
+        trn_encoded = pca_model.transform(trn_encoded)
+        hol_encoded = pca_model.transform(hol_encoded)
+        syn_encoded = pca_model.transform(syn_encoded)

     # calculate DCR / NNDR for synthetic to training
-    dcr_syn_trn, nndr_syn_trn = calculate_dcrs_nndrs(data=trn_embeds, query=syn_embeds)
+    dcr_syn_trn, nndr_syn_trn = calculate_dcrs_nndrs(data=trn_encoded, query=syn_encoded)
     # calculate DCR / NNDR for synthetic to holdout
-    dcr_syn_hol, nndr_syn_hol = calculate_dcrs_nndrs(data=hol_embeds, query=syn_embeds)
+    dcr_syn_hol, nndr_syn_hol = calculate_dcrs_nndrs(data=hol_encoded, query=syn_encoded)
     # calculate DCR / NNDR for holdout to training
-    dcr_trn_hol, nndr_trn_hol = calculate_dcrs_nndrs(data=trn_embeds, query=hol_embeds)
+    dcr_trn_hol, nndr_trn_hol = calculate_dcrs_nndrs(data=trn_encoded, query=hol_encoded)

     # log statistics
     def deciles(x):
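As a hedged sketch of how such outputs are commonly read (a generic privacy heuristic, not necessarily the exact check this repo performs): if the synthetic data does not memorize training records, dcr_syn_trn should not be systematically smaller than dcr_syn_hol, so the share of synthetic records closer to training than to holdout should hover around 0.5.

import numpy as np

def dcr_share(dcr_syn_trn: np.ndarray, dcr_syn_hol: np.ndarray) -> float:
    # split ties evenly so identical distance distributions yield ~0.5
    closer = np.mean(dcr_syn_trn < dcr_syn_hol)
    ties = np.mean(dcr_syn_trn == dcr_syn_hol)
    return float(closer + 0.5 * ties)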

mostlyai/qa/_sampling.py

Lines changed: 0 additions & 17 deletions
@@ -25,11 +25,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import datetime
 import logging
 import random
 import time
-import string
 import xxhash
 from typing import Any
 from pandas.core.dtypes.common import is_numeric_dtype, is_datetime64_dtype
@@ -230,7 +228,6 @@ def pull_data_for_embeddings(
     ctx_primary_key: str | None = None,
     tgt_context_key: str | None = None,
     max_sample_size: int | None = None,
-    bins: dict[str, list] | None = None,
 ) -> list[str]:
     _LOG.info("pulling data for embeddings")
     t0 = time.time()
@@ -265,20 +262,6 @@ def pull_data_for_embeddings(
         df_tgt = df_tgt.rename(columns={tgt_context_key: key})
         tgt_context_key = key

-    # bin columns; also to prevent distortion of embeddings by adding extra precision or unknown values
-    bins = bins or {}
-    df_tgt.columns = [TGT_COLUMN_PREFIX + c if c != key else c for c in df_tgt.columns]
-    df_tgt, _ = bin_data(df_tgt, bins=bins, non_categorical_label_style="lower")
-    # add some prefix to make numeric and date values unique in the embedding space
-    for col in df_tgt.columns:
-        if col in bins:
-            if isinstance(
-                bins[col][0], (int, float, np.integer, np.floating, datetime.date, datetime.datetime, np.datetime64)
-            ):
-                prefixes = string.ascii_lowercase + string.ascii_uppercase
-                prefix = prefixes[xxhash.xxh32_intdigest(col) % len(prefixes)]
-                df_tgt[col] = prefix + df_tgt[col].astype(str)
-
     # split into chunks while keeping groups together and process in parallel
     n_jobs = min(16, max(1, cpu_count() - 1))
     hash_ids = df_tgt[tgt_context_key].apply(lambda x: xxhash.xxh32_intdigest(str(x))) % n_jobs
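For context, the deleted block implemented a small disambiguation trick that the new encoding space makes unnecessary: each binned numeric/datetime column got a deterministic one-letter prefix, hashed from the column name, so that the same value in two different columns would embed to different points. A minimal sketch of that trick:

import string
import xxhash

prefixes = string.ascii_lowercase + string.ascii_uppercase
for col in ("age", "income"):  # hypothetical column names
    prefix = prefixes[xxhash.xxh32_intdigest(col) % len(prefixes)]
    print(col, "->", prefix + "42")  # same value, column-specific prefix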

mostlyai/qa/assets/html/head.html

Lines changed: 7 additions & 0 deletions
@@ -20,6 +20,13 @@
     font-size: normal;
     color: var(--muted-color);
 }
+.ref-metric {
+    color: var(--muted-color);
+    margin-top: -2px;
+    font-size: 0.8em;
+    height: 24px;
+    font-weight: normal;
+}
 </style>
 <script>{{ html_assets['bootstrap-5.3.3.bundle.min.js'] }}</script>
 <script>{{ html_assets['plotly-3.0.1.min.js'] }}</script>
