@@ -181,6 +181,8 @@ def report(
             check_min_sample_size(trn_sample_size, 90, "training")
             if hol_tgt_data is not None:
                 check_min_sample_size(hol_sample_size, 10, "holdout")
+            if trn_tgt_data.shape[1] == 0 or syn_tgt_data.shape[1] == 0:
+                raise PrerequisiteNotMetError("Provided data has no columns.")
         except PrerequisiteNotMetError as err:
             _LOG.info(err)
             statistics.mark_early_exit()
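This hunk adds an empty-column guard that is routed through the same PrerequisiteNotMetError / early-exit path as the minimum-sample-size checks. A minimal sketch of that pattern in isolation, using a stand-in exception class and a hypothetical helper rather than the library's own check_min_sample_size and statistics objects:

import logging

import pandas as pd

_LOG = logging.getLogger(__name__)


class PrerequisiteNotMetError(Exception):
    """Stand-in for the library's prerequisite error."""


def prerequisites_met(trn_tgt_data: pd.DataFrame, syn_tgt_data: pd.DataFrame) -> bool:
    # Hypothetical helper: returns False to signal the caller to mark an early exit.
    try:
        if trn_tgt_data.shape[1] == 0 or syn_tgt_data.shape[1] == 0:
            raise PrerequisiteNotMetError("Provided data has no columns.")
    except PrerequisiteNotMetError as err:
        _LOG.info(err)
        return False
    return True


# a frame with rows but no columns trips the new guard
assert prerequisites_met(pd.DataFrame(index=range(5)), pd.DataFrame({"a": [1]})) is False
assert prerequisites_met(pd.DataFrame({"a": [1]}), pd.DataFrame({"b": [2]})) is True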
@@ -205,7 +207,6 @@ def report(
         else:
             setup = "1:1"
 
-        _LOG.info("prepare training data for accuracy")
         trn = prepare_data_for_accuracy(
             df_tgt=trn_tgt_data,
             df_ctx=trn_ctx_data,
@@ -214,8 +215,8 @@ def report(
             max_sample_size=max_sample_size_accuracy,
             setup=setup,
         )
+        _LOG.info(f"prepared training data for accuracy: {trn.shape}")
         if hol_tgt_data is not None:
-            _LOG.info("prepare holdout data for accuracy")
             hol = prepare_data_for_accuracy(
                 df_tgt=hol_tgt_data,
                 df_ctx=hol_ctx_data,
@@ -225,13 +226,13 @@ def report(
                 setup=setup,
                 ori_dtypes=trn.dtypes.to_dict(),
             )
+            _LOG.info(f"prepared holdout data for accuracy: {hol.shape}")
             ori = pd.concat([trn, hol], axis=0, ignore_index=True)
         else:
             hol = None
             ori = trn
         progress.update(completed=5, total=100)
 
-        _LOG.info("prepare synthetic data for accuracy")
         syn = prepare_data_for_accuracy(
             df_tgt=syn_tgt_data,
             df_ctx=syn_ctx_data,
@@ -241,29 +242,29 @@ def report(
             setup=setup,
             ori_dtypes=trn.dtypes.to_dict(),
         )
+        _LOG.info(f"prepared synthetic data for accuracy: {syn.shape}")
         progress.update(completed=10, total=100)
 
         # do coherence analysis only if there are non-fk columns in the target data
         do_coherence = setup == "1:N" and len(trn_tgt_data.columns) > 1
         if do_coherence:
-            _LOG.info("prepare original data for coherence started")
             ori_coh, ori_coh_bins = prepare_data_for_coherence(
                 df_tgt=pd.concat([trn_tgt_data, hol_tgt_data]) if hol_tgt_data is not None else trn_tgt_data,
                 tgt_context_key=tgt_context_key,
                 max_sample_size=max_sample_size_coherence,
             )
-            _LOG.info("prepare synthetic data for coherence started ")
+            _LOG.info(f"prepared original data for coherence: {ori_coh.shape}")
             syn_coh, _ = prepare_data_for_coherence(
                 df_tgt=syn_tgt_data,
                 tgt_context_key=tgt_context_key,
                 bins=ori_coh_bins,
                 max_sample_size=max_sample_size_coherence,
             )
-            _LOG.info("store bins used for training data for coherence")
+            _LOG.info(f"prepared synthetic data for coherence: {syn_coh.shape}")
             statistics.store_coherence_bins(bins=ori_coh_bins)
+            _LOG.info("stored bins used for training data for coherence")
         progress.update(completed=15, total=100)
 
-        _LOG.info("calculate embeddings")
         syn_embeds, trn_embeds, hol_embeds = prepare_data_for_embeddings(
             syn_tgt_data=syn_tgt_data,
             trn_tgt_data=trn_tgt_data,
@@ -275,6 +276,9 @@ def report(
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_embeddings,
         )
+        _LOG.info(
+            f"calculated embeddings: syn={syn_embeds.shape}, trn={trn_embeds.shape}, hol={hol_embeds.shape if hol_embeds is not None else None}"
+        )
         progress.update(completed=20, total=100)
 
         ## 1. ACCURACY ##
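The remaining hunks all make the same change: the "prepare ..." log lines that ran before each step are replaced by log lines after the step that report the shape of the prepared result, with a None-safe format for the optional holdout embeddings. A small sketch of that logging style, using placeholder arrays instead of the real prepare_data_for_embeddings output:

import logging

import numpy as np

logging.basicConfig(level=logging.INFO)
_LOG = logging.getLogger(__name__)

# placeholders standing in for the prepared embeddings
syn_embeds = np.zeros((100, 8))
trn_embeds = np.zeros((90, 8))
hol_embeds = None  # holdout data is optional, so this may be None

_LOG.info(
    f"calculated embeddings: syn={syn_embeds.shape}, trn={trn_embeds.shape}, "
    f"hol={hol_embeds.shape if hol_embeds is not None else None}"
)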