
Commit de46272

fix compatibility with pandas 2.0
1 parent 8b3c5ce commit de46272

7 files changed: +81 -38 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,12 @@ The latest release of `mostlyai-qa` can be installed via pip:
 pip install -U mostlyai-qa
 ```
 
+The latest development version can be installed directly from GitHub:
+
+```bash
+pip install -U git+https://github.com/mostly-ai/mostlyai-qa.git@main
+```
+
 ## Quick Start
 
 ```python

mostlyai/qa/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -15,6 +15,7 @@
 import os
 
 import pandas as pd
+from packaging.version import Version
 
 from mostlyai.qa.report import report
 from mostlyai.qa.report_from_statistics import report_from_statistics
@@ -23,4 +24,5 @@
 __version__ = "1.3.0"
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-pd.set_option("future.no_silent_downcasting", True)
+if Version(pd.__version__) >= Version("2.2.0"):
+    pd.set_option("future.no_silent_downcasting", True)
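The guard matters because the `future.no_silent_downcasting` option was only introduced in pandas 2.2.0; on pandas 2.0/2.1 the unconditional `set_option` call raises `pandas.errors.OptionError`. A minimal sketch of the pattern, assuming nothing beyond pandas and packaging being installed:

```python
import pandas as pd
from packaging.version import Version

# Opt in to the future behavior only where the option exists;
# pandas < 2.2.0 has no such option and set_option would raise
# pandas.errors.OptionError.
if Version(pd.__version__) >= Version("2.2.0"):
    pd.set_option("future.no_silent_downcasting", True)
```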

mostlyai/qa/report.py

Lines changed: 37 additions & 23 deletions
@@ -233,40 +233,54 @@ def report(
         hol_sample_size or float("inf"),
     )
 
-    if max_sample_size_embeddings_final >= 10_000 and max_sample_size_embeddings is None:
+    if max_sample_size_embeddings_final > 10_000 and max_sample_size_embeddings is None:
         warnings.warn(
             UserWarning(
                 "More than 10k embeddings will be calculated per dataset. "
                 "Consider setting a limit via `max_sample_size_embeddings`."
             )
         )
 
-    def _calc_pull_embeds(
-        df_tgt: pd.DataFrame, df_ctx: pd.DataFrame, progress_from: int, progress_to: int
-    ) -> np.ndarray:
-        strings = pull_data_for_embeddings(
-            df_tgt=df_tgt,
-            df_ctx=df_ctx,
+    _LOG.info("calculate embeddings for synthetic")
+    syn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=syn_tgt_data,
+            df_ctx=syn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_embeddings_final,
-        )
-        # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
-        buckets = np.array_split(strings, progress_to - progress_from)
-        buckets = [b for b in buckets if len(b) > 0]
-        embeds = []
-        for i, bucket in enumerate(buckets, 1):
-            embeds += [calculate_embeddings(bucket.tolist())]
-            progress.update(completed=progress_from + i, total=100)
-        progress.update(completed=progress_to, total=100)
-        embeds = np.concatenate(embeds, axis=0)
-        _LOG.info(f"calculated embeddings {embeds.shape}")
-        return embeds
-
-    syn_embeds = _calc_pull_embeds(df_tgt=syn_tgt_data, df_ctx=syn_ctx_data, progress_from=20, progress_to=40)
-    trn_embeds = _calc_pull_embeds(df_tgt=trn_tgt_data, df_ctx=trn_ctx_data, progress_from=40, progress_to=60)
+        ),
+        progress=progress,
+        progress_from=20,
+        progress_to=40,
+    )
+    _LOG.info("calculate embeddings for training")
+    trn_embeds = calculate_embeddings(
+        strings=pull_data_for_embeddings(
+            df_tgt=trn_tgt_data,
+            df_ctx=trn_ctx_data,
+            ctx_primary_key=ctx_primary_key,
+            tgt_context_key=tgt_context_key,
+            max_sample_size=max_sample_size_embeddings_final,
+        ),
+        progress=progress,
+        progress_from=40,
+        progress_to=60,
+    )
     if hol_tgt_data is not None:
-        hol_embeds = _calc_pull_embeds(df_tgt=hol_tgt_data, df_ctx=hol_ctx_data, progress_from=60, progress_to=80)
+        _LOG.info("calculate embeddings for holdout")
+        hol_embeds = calculate_embeddings(
+            strings=pull_data_for_embeddings(
+                df_tgt=hol_tgt_data,
+                df_ctx=hol_ctx_data,
+                ctx_primary_key=ctx_primary_key,
+                tgt_context_key=tgt_context_key,
+                max_sample_size=max_sample_size_embeddings_final,
+            ),
+            progress=progress,
+            progress_from=60,
+            progress_to=80,
+        )
     else:
         hol_embeds = None
     progress.update(completed=80, total=100)
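Two things change here: the local `_calc_pull_embeds` helper is dissolved into direct `calculate_embeddings` calls (the bucketing and progress logic now lives in `sampling.py`), and the warning condition flips from `>=` to `>`, so a run of exactly 10,000 embeddings no longer triggers the "More than 10k" message. A toy boundary check with hypothetical values, mirroring the predicate in `report()` (the helper name is illustrative, not an import):

```python
# Standalone sketch of the new warning condition.
def should_warn(final_size: int, user_limit: int | None) -> bool:
    return final_size > 10_000 and user_limit is None

assert should_warn(10_000, None) is False   # boundary: no warning (was True with >=)
assert should_warn(10_001, None) is True    # strictly more than 10k warns
assert should_warn(20_000, 20_000) is False # an explicit user limit silences it
```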

mostlyai/qa/report_from_statistics.py

Lines changed: 5 additions & 2 deletions
@@ -107,13 +107,16 @@ def report_from_statistics(
 
     _LOG.info("calculate embeddings for synthetic")
     syn_embeds = calculate_embeddings(
-        pull_data_for_embeddings(
+        strings=pull_data_for_embeddings(
             df_tgt=syn_tgt_data,
             df_ctx=syn_ctx_data,
             ctx_primary_key=ctx_primary_key,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_embeddings,
-        )
+        ),
+        progress=progress,
+        progress_from=30,
+        progress_to=50,
     )
 
     _LOG.info("report similarity")
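The statistics-based report gets the same treatment as `report.py`: the pulled strings move to an explicit `strings=` keyword, and the call now threads `progress` through so the synthetic embeddings report completion over the 30–50 window of the overall progress scale.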

mostlyai/qa/sampling.py

Lines changed: 26 additions & 7 deletions
@@ -40,6 +40,7 @@
     NXT_COLUMN_PREFIX,
     COUNT_COLUMN,
     ACCURACY_MAX_COLUMNS,
+    ProgressCallbackWrapper,
 )
 from mostlyai.qa.assets import load_embedder, load_tokenizer
 
@@ -221,8 +222,9 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
         return ", ".join(sequence.apply(row_to_string, axis=1))
 
     strings = (
-        df_tgt.groupby(tgt_context_key)
-        .apply(sequence_to_string, include_groups=False)
+        df_tgt.set_index(tgt_context_key)
+        .groupby(tgt_context_key)
+        .apply(sequence_to_string)
         .sample(frac=1)
         .reset_index(drop=True)
     )
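The hunk above is the core pandas 2.0 fix: the `include_groups` keyword of `DataFrameGroupBy.apply` only exists from pandas 2.2 onward, so the old call failed on pandas 2.0/2.1. Moving the key into the index before grouping keeps it out of the applied frame on every version. A toy sketch of the portable pattern, with hypothetical names standing in for `df_tgt` and `tgt_context_key`:

```python
import pandas as pd

# "key" plays the role of tgt_context_key.
df = pd.DataFrame({"key": [1, 1, 2], "val": ["a", "b", "c"]})

# Portable across pandas 2.x: the group key lives in the index, so apply()
# never sees it as a column and no include_groups argument is needed.
out = (
    df.set_index("key")
    .groupby("key")
    .apply(lambda g: ", ".join(g["val"]))
)
print(out.to_list())  # ['a, b', 'c']
```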
@@ -233,13 +235,30 @@ def sequence_to_string(sequence: pd.DataFrame) -> str:
     return strings.to_list()
 
 
-def calculate_embeddings(strings: list[str]) -> np.ndarray:
+def calculate_embeddings(
+    strings: list[str],
+    progress: ProgressCallbackWrapper | None = None,
+    progress_from: int | None = None,
+    progress_to: int | None = None,
+) -> np.ndarray:
     t0 = time.time()
+    # load embedder
     embedder = load_embedder(device="cuda" if torch.cuda.is_available() else "cpu")
-    embeddings = embedder.encode(strings)
-    time_elapsed = time.time() - t0
-    _LOG.info(f"created embeddings for {len(strings):,} records ({time_elapsed=:.2f}s)")
-    return embeddings
+    # split into buckets for calculating embeddings to avoid memory issues and report continuous progress
+    steps = progress_to - progress_from if progress_to is not None and progress_from is not None else 1
+    buckets = np.array_split(strings, steps)
+    buckets = [b for b in buckets if len(b) > 0]
+    # calculate embeddings for each bucket
+    embeds = []
+    for i, bucket in enumerate(buckets, 1):
+        embeds += [embedder.encode(bucket.tolist(), show_progress_bar=False)]
+        if progress is not None:
+            progress.update(completed=progress_from + i, total=100)
+    if progress is not None:
+        progress.update(completed=progress_to, total=100)
+    embeds = np.concatenate(embeds, axis=0)
+    _LOG.info(f"calculated embeddings {embeds.shape} in {time.time() - t0:.2f}s")
+    return embeds
 
 
 def sample_text_tokens(df: pd.DataFrame) -> pd.DataFrame:
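Since all three progress parameters default to `None`, callers without a progress wrapper, like the unit test below, keep the old one-argument call shape: everything lands in a single bucket (`steps = 1`). A usage sketch, assuming an environment with this commit's `mostlyai-qa` installed:

```python
from mostlyai.qa.sampling import calculate_embeddings

# No progress arguments: one bucket, one encode() call.
embeds = calculate_embeddings(["apple pie", "car maintenance"])
print(embeds.shape)  # (2, <output dimension of the bundled embedder>)
```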

mostlyai/qa/similarity.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def calculate_mean_auc(embeds1, embeds2):
 
     # calculate the AUC score
     auc_score = roc_auc_score(y_holdout, y_holdout_pred)
-    auc_scores.append(auc_score)
+    auc_scores.append(round(auc_score, 4))
 
     _LOG.info(f"{auc_scores=}")
 
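Rounding each appended AUC to four decimals keeps the `auc_scores=` log line compact; at that precision the change is far below anything that would visibly move the reported similarity metric.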
tests/unit/test_similarity.py

Lines changed: 3 additions & 4 deletions
@@ -13,18 +13,17 @@
 # limitations under the License.
 
 import numpy as np
-import pandas as pd
 
 from mostlyai.qa.similarity import calculate_cosine_similarities, calculate_discriminator_auc
 from mostlyai.qa.sampling import calculate_embeddings
 
 
 def test_calculate_embeddings():
-    trn = pd.Series(["apple recipe", "car engine repair", "apple recipe"])
+    trn = ["apple recipe", "car engine repair", "apple recipe"]
     # semantically close synthetic data
-    syn_close = pd.Series(["apple pie", "car maintenance"])
+    syn_close = ["apple pie", "car maintenance"]
     # semantically distant synthetic data
-    syn_distant = pd.Series(["quantum physics theory", "deep space exploration"])
+    syn_distant = ["quantum physics theory", "deep space exploration"]
 
     trn_embeds = calculate_embeddings(trn)
     syn_close_embeds = calculate_embeddings(syn_close)
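The fixtures become plain lists to match the `strings: list[str]` annotation of the reworked `calculate_embeddings`, which also lets the now-unused `pandas` import go.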
