feat: follow-up to reproducibility PR (#199)

lukaszkolodziejczyk · web-flow · commit 82cb0cf5eda4 · 2025-05-14T14:42:01.000+02:00
diff --git a/mostlyai/qa/_distances.py b/mostlyai/qa/_distances.py
@@ -16,7 +16,6 @@
 import time
 import numpy as np
 import networkx as nx
-import xxhash
 from sklearn.neighbors import NearestNeighbors
 from joblib import cpu_count
 
@@ -159,13 +158,9 @@ def split_columns_into_random_groups(X, k):
     """
     n_cols = X.shape[1]
 
-    # create a deterministic seed based on the input matrix
-    seed = xxhash.xxh32(X.sum()).intdigest()
-    rng = np.random.default_rng(seed)
-
     # shuffle all column indices
     all_indices = np.arange(n_cols)
-    rng.shuffle(all_indices)
+    np.random.shuffle(all_indices)
 
     # evenly divide shuffled indices into k groups
     base_size = n_cols // k
@@ -228,7 +223,6 @@ def correlation_graph(X):
         n_clusters=k,
         affinity="precomputed",  # uses adj_matrix directly as similarity
         assign_labels="kmeans",  # clustering on the embedding
-        random_state=42,
     )
     try:
         labels = sc.fit_predict(adj_matrix)
diff --git a/mostlyai/qa/_embeddings.py b/mostlyai/qa/_embeddings.py
@@ -52,7 +52,6 @@ def encode_numerics(
         # normalize numeric features based on trn
         qt_scaler = QuantileTransformer(
             output_distribution="uniform",
-            random_state=42,
             n_quantiles=min(100, len(trn) + len(hol)),
         )
         ori_num = pd.concat([trn_num[col], hol_num[col]]) if len(hol) > 0 else pd.DataFrame(trn_num[col])
diff --git a/mostlyai/qa/_similarity.py b/mostlyai/qa/_similarity.py
@@ -82,7 +82,7 @@ def calculate_mean_auc(embeds1, embeds2):
         y = np.hstack((labels1, labels2))
 
         # initialize the cross-validator
-        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
+        kf = StratifiedKFold(n_splits=10, shuffle=True)
 
         # initialize a list to store AUC scores
         auc_scores = []
@@ -99,7 +99,6 @@ def calculate_mean_auc(embeds1, embeds2):
                     max_depth=10,
                     min_samples_leaf=5,
                     max_features=0.5,
-                    random_state=42,
                 )
                 clf.fit(X_train, y_train)
 

Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,6 @@ def encode_numerics(`
`52`	`52`	`# normalize numeric features based on trn`
`53`	`53`	`qt_scaler = QuantileTransformer(`
`54`	`54`	`output_distribution="uniform",`
`55`		`- random_state=42,`
`56`	`55`	`n_quantiles=min(100, len(trn) + len(hol)),`
`57`	`56`	`)`
`58`	`57`	`ori_num = pd.concat([trn_num[col], hol_num[col]]) if len(hol) > 0 else pd.DataFrame(trn_num[col])`