
Commit de46272

fix compatibility with pandas 2.0
1 parent 8b3c5ce commit de46272

7 files changed: +81 -38 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,12 @@ The latest release of `mostlyai-qa` can be installed via pip:
 pip install -U mostlyai-qa
 ```
 
+The latest development version can be installed directly from GitHub:
+
+```bash
+pip install -U git+https://github.com/mostly-ai/mostlyai-qa.git@main
+```
+
 ## Quick Start
 
 ```python

mostlyai/qa/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,7 @@
 import os
 
 import pandas as pd
+from packaging.version import Version
 
 from mostlyai.qa.report import report
 from mostlyai.qa.report_from_statistics import report_from_statistics
@@ -23,4 +24,5 @@
 __version__ = "1.3.0"
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-pd.set_option("future.no_silent_downcasting", True)
+if Version(pd.__version__) >= Version("2.2.0"):
+    pd.set_option("future.no_silent_downcasting", True)
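The guard matters because the `future.no_silent_downcasting` option was only introduced in pandas 2.2.0; on pandas 2.0/2.1 the unconditional `set_option` call raises `pandas.errors.OptionError`. A minimal sketch of the pattern, assuming nothing beyond pandas and packaging being installed:

```python
import pandas as pd
from packaging.version import Version

# Opt in to the future behavior only where the option exists;
# pandas < 2.2.0 has no such option and set_option would raise
# pandas.errors.OptionError.
if Version(pd.__version__) >= Version("2.2.0"):
    pd.set_option("future.no_silent_downcasting", True)
```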

mostlyai/qa/report.py

Lines changed: 37 additions & 23 deletions
@@ -233,40 +233,54 @@ def report(
         hol_sample_size or float("inf"),
     )
 
-    if max_sample_size_embeddings_final >= 10_000 and max_sample_size_embeddings is None:
+    if max_sample_size_embeddings_final > 10_000 and max_sample_size_embeddings is None:
         warnings.warn(
             UserWarning(
                 "More than 10k embeddings will be calculated per dataset. "
                 "Consider setting a limit via `max_sample_size_embeddings`."
             )
         )
 
-    def _calc_pull_embeds(
-        df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, progress_from: int, progress_to: int
-    ) -> np.ndarray:
-        strings = pull_data_for_embeddings(
-            df_tgt=df_tgt,
-            df_ctx=df_ctx,
+    _LOG.info("calculate embeddings for synthetic")
+    syn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=syn_tgt_data,
+            df_ctx=syn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_embeddings_final,
-        )
-        # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
-        buckets = np.array_split(strings, progress_to - progress_from)
-        buckets = [b for b in buckets if len(b) > 0]
-        embeds = []
-        for i, bucket in enumerate(buckets, 1):
-            embeds += [calculate_embeddings(bucket.tolist())]
-            progress.update(completed=progress_from + i, total=100)
-        progress.update(completed=progress_to, total=100)
-        embeds = np.concatenate(embeds, axis=0)
-        _LOG.info(f"calculated embeddings {embeds.shape}")
-        return embeds
-
-    syn_embeds = _calc_pull_embeds(df_tgt=syn_tgt_data, df_ctx=syn_ctx_data, progress_from=20, progress_to=40)
-    trn_embeds = _calc_pull_embeds(df_tgt=trn_tgt_data, df_ctx=trn_ctx_data, progress_from=40, progress_to=60)
+        ),
+        progress=progress,
+        progress_from=20,
+        progress_to=40,
+    )
+    _LOG.info("calculate embeddings for training")
+    trn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=trn_tgt_data,
+            df_ctx=trn_ctx_data,
+            ctx_primary_key=ctx_primary_key,
+            tgt_context_key=tgt_context_key,
+            max_sample_size=max_sample_size_embeddings_final,
+        ),
+        progress=progress,
+        progress_from=40,
+        progress_to=60,
+    )
     if hol_tgt_data is not None:
-        hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, progress_from=60, progress_to=80)
+        _LOG.info("calculate embeddings for holdout")
+        hol_embeds = calculate_embeddings(
+            strings=pull_data_for_embeddings(
+                df_tgt=hol_tgt_data,
+                df_ctx=hol_ctx_data,
+                ctx_primary_key=ctx_primary_key,
+                tgt_context_key=tgt_context_key,
+                max_sample_size=max_sample_size_embeddings_final,
+            ),
+            progress=progress,
+            progress_from=60,
+            progress_to=80,
+        )
     else:
         hol_embeds = None
     progress.update(completed=80, total=100)
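Two things change here: the local `_calc_pull_embeds` helper is dissolved into direct `calculate_embeddings` calls (the bucketing and progress logic now lives in `sampling.py`), and the warning condition flips from `>=` to `>`, so a run of exactly 10,000 embeddings no longer triggers the "More than 10k" message. A toy boundary check with hypothetical values, mirroring the predicate in `report()` (the helper name is illustrative, not an import):

```python
# Standalone sketch of the new warning condition.
def should_warn(final_size: int, user_limit: int | None) -> bool:
    return final_size > 10_000 and user_limit is None

assert should_warn(10_000, None) is False   # boundary: no warning (was True with >=)
assert should_warn(10_001, None) is True    # strictly more than 10k warns
assert should_warn(20_000, 20_000) is False # an explicit user limit silences it
```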

mostlyai/qa/report_from_statistics.py

Lines changed: 5 additions & 2 deletions
@@ -107,13 +107,16 @@ def report_from_statistics(
 
     _LOG.info("calculate embeddings for synthetic")
     syn_embeds = calculate_embeddings(
-        pull_data_for_embeddings(
+        strings=pull_data_for_embeddings(
             df_tgt=syn_tgt_data,
             df_ctx=syn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_embeddings,
-        )
+        ),
+        progress=progress,
+        progress_from=30,
+        progress_to=50,
     )
 
     _LOG.info("report similarity")
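The statistics-based report gets the same treatment as `report.py`: the pulled strings move to an explicit `strings=` keyword, and the call now threads `progress` through so the synthetic embeddings report completion over the 30–50 window of the overall progress scale.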

mostlyai/qa/sampling.py

Lines changed: 26 additions & 7 deletions
@@ -40,6 +40,7 @@
     NXT_COLUMN_PREFIX,
     COUNT_COLUMN,
     ACCURACY_MAX_COLUMNS,
+    ProgressCallbackWrapper,
 )
 from mostlyai.qa.assets import load_embedder, load_tokenizer
 
@@ -221,8 +222,9 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
         return ", ".join(sequence.apply(row_to_string, axis=1))
 
     strings = (
-        df_tgt.groupby(tgt_context_key)
-        .apply(sequence_to_string, include_groups=False)
+        df_tgt.set_index(tgt_context_key)
+        .groupby(tgt_context_key)
+        .apply(sequence_to_string)
         .sample(frac=1)
         .reset_index(drop=True)
     )
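The hunk above is the core pandas 2.0 fix: the `include_groups` keyword of `DataFrameGroupBy.apply` only exists from pandas 2.2 onward, so the old call failed on pandas 2.0/2.1. Moving the key into the index before grouping keeps it out of the applied frame on every version. A toy sketch of the portable pattern, with hypothetical names standing in for `df_tgt` and `tgt_context_key`:

```python
import pandas as pd

# "key" plays the role of tgt_context_key.
df = pd.DataFrame({"key": [1, 1, 2], "val": ["a", "b", "c"]})

# Portable across pandas 2.x: the group key lives in the index, so apply()
# never sees it as a column and no include_groups argument is needed.
out = (
    df.set_index("key")
    .groupby("key")
    .apply(lambda g: ", ".join(g["val"]))
)
print(out.to_list())  # ['a, b', 'c']
```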
@@ -233,13 +235,30 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
     return strings.to_list()
 
 
-def calculate_embeddings(strings: list[str]) -> np.ndarray:
+def calculate_embeddings(
+    strings: list[str],
+    progress: ProgressCallbackWrapper | None = None,
+    progress_from: int | None = None,
+    progress_to: int | None = None,
+) -> np.ndarray:
     t0 = time.time()
+    # load embedder
     embedder = load_embedder(device="cuda" if torch.cuda.is_available() else "cpu")
-    embeddings = embedder.encode(strings)
-    time_elapsed = time.time() - t0
-    _LOG.info(f"created embeddings for {len(strings):,} records ({time_elapsed=:.2f}s)")
-    return embeddings
+    # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
+    steps = progress_to - progress_from if progress_to is not None and progress_from is not None else 1
+    buckets = np.array_split(strings, steps)
+    buckets = [b for b in buckets if len(b) > 0]
+    # calculate embeddings for each bucket
+    embeds = []
+    for i, bucket in enumerate(buckets, 1):
+        embeds += [embedder.encode(bucket.tolist(), show_progress_bar=False)]
+        if progress is not None:
+            progress.update(completed=progress_from + i, total=100)
+    if progress is not None:
+        progress.update(completed=progress_to, total=100)
+    embeds = np.concatenate(embeds, axis=0)
+    _LOG.info(f"calculated embeddings {embeds.shape} in {time.time() - t0:.2f}s")
+    return embeds
 
 
 def sample_text_tokens(df: pd.DataFrame) -> pd.DataFrame:
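Since all three progress parameters default to `None`, callers without a progress wrapper, like the unit test below, keep the old one-argument call shape: everything lands in a single bucket (`steps = 1`). A usage sketch, assuming an environment with this commit's `mostlyai-qa` installed:

```python
from mostlyai.qa.sampling import calculate_embeddings

# No progress arguments: one bucket, one encode() call.
embeds = calculate_embeddings(["apple pie", "car maintenance"])
print(embeds.shape)  # (2, <output dimension of the bundled embedder>)
```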

mostlyai/qa/similarity.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def calculate_mean_auc(embeds1, embeds2):
 
     # calculate the AUC score
     auc_score = roc_auc_score(y_holdout, y_holdout_pred)
-    auc_scores.append(auc_score)
+    auc_scores.append(round(auc_score, 4))
 
     _LOG.info(f"{auc_scores=}")
 
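Rounding each appended AUC to four decimals keeps the `auc_scores=` log line compact; at that precision the change is far below anything that would visibly move the reported similarity metric.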
tests/unit/test_similarity.py

Lines changed: 3 additions & 4 deletions
@@ -13,18 +13,17 @@
 # limitations under the License.
 
 import numpy as np
-import pandas as pd
 
 from mostlyai.qa.similarity import calculate_cosine_similarities, calculate_discriminator_auc
 from mostlyai.qa.sampling import calculate_embeddings
 
 
 def test_calculate_embeddings():
-    trn = pd.Series(["apple recipe", "car engine repair", "apple recipe"])
+    trn = ["apple recipe", "car engine repair", "apple recipe"]
     # semantically close synthetic data
-    syn_close = pd.Series(["apple pie", "car maintenance"])
+    syn_close = ["apple pie", "car maintenance"]
     # semantically distant synthetic data
-    syn_distant = pd.Series(["quantum physics theory", "deep space exploration"])
+    syn_distant = ["quantum physics theory", "deep space exploration"]
 
     trn_embeds = calculate_embeddings(trn)
     syn_close_embeds = calculate_embeddings(syn_close)
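The fixtures become plain lists to match the `strings: list[str]` annotation of the reworked `calculate_embeddings`, which also lets the now-unused `pandas` import go.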
