
Commit 1f6a5ff

Author: sfluegel
Message: improve semantic loss eval
Parent: 3dd1d39

File tree: 3 files changed, +178 −39 lines

chebai/result/analyse_sem.py

Lines changed: 165 additions & 30 deletions
@@ -1,6 +1,6 @@
 import pandas as pd
 import sys
-
+import traceback
 from datetime import datetime
 from chebai.loss.semantic import DisjointLoss
 from chebai.preprocessing.datasets.chebi import ChEBIOver100
@@ -59,15 +59,15 @@ def _sort_results_by_label(n_labels, results, filter):
 def get_best_epoch(run):
     files = run.files()
     best_ep = None
-    best_val_loss = 0
+    best_micro_f1 = 0
     for file in files:
         if file.name.startswith("checkpoints/best_epoch"):
-            val_loss = float(file.name.split("=")[2].split("_")[0])
-            if val_loss < best_val_loss or best_ep is None:
+            micro_f1 = float(file.name.split("=")[-1][:-5])
+            if micro_f1 > best_micro_f1 or best_ep is None:
                 best_ep = int(file.name.split("=")[1].split("_")[0])
-                best_val_loss = val_loss
+                best_micro_f1 = micro_f1
     if best_ep is None:
-        raise Exception("Could not find any 'best' checkpoint")
+        raise Exception(f"Could not find any 'best' checkpoint for run {run.name}")
     else:
         print(f"Best epoch for run {run.name}: {best_ep}")
     return best_ep
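
Checkpoint selection now keys on micro-F1 (higher is better) instead of validation loss. A standalone sketch of the new filename parsing, assuming checkpoint names of the form implied by the split indices above (the exact naming pattern is an assumption, not part of this commit):

    # Hypothetical checkpoint name following a "best_epoch=<ep>_..._<metric>=<value>.ckpt" scheme.
    name = "checkpoints/best_epoch=85_val_micro-f1=0.9021.ckpt"

    epoch = int(name.split("=")[1].split("_")[0])  # "85_val_micro-f1" -> 85
    micro_f1 = float(name.split("=")[-1][:-5])     # "0.9021.ckpt" -> 0.9021 ([:-5] strips ".ckpt")

    assert (epoch, micro_f1) == (85, 0.9021)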
@@ -88,7 +88,42 @@ def load_preds_labels_from_wandb(
         f"{data_module.__class__.__name__}_{kind}",
     )
 
-    model = get_checkpoint_from_wandb(epoch, run)
+    model = get_checkpoint_from_wandb(epoch, run, map_device_to="cuda:0")
+    print(f"Calculating predictions...")
+    evaluate_model(
+        model,
+        data_module,
+        buffer_dir=buffer_dir,
+        filename=f"{kind}.pt",
+        skip_existing_preds=True,
+    )
+    preds, labels = load_results_from_buffer(buffer_dir, device=DEVICE)
+    del model
+    gc.collect()
+
+    return preds, labels
+
+
+def load_preds_labels_from_nonwandb(
+    name, epoch, chebi_version, test_on_data_cls=ChEBIOver100, kind="test"
+):
+    data_module = test_on_data_cls(chebi_version=chebi_version)
+
+    buffer_dir = os.path.join(
+        "results_buffer",
+        f"{name}_ep{epoch}",
+        f"{data_module.__class__.__name__}_{kind}",
+    )
+    ckpt_path = None
+    for file in os.listdir(os.path.join("logs", "downloaded_ckpts", name)):
+        if file.startswith(f"best_epoch={epoch}"):
+            ckpt_path = os.path.join(
+                os.path.join("logs", "downloaded_ckpts", name, file)
+            )
+    assert (
+        ckpt_path is not None
+    ), f"Could not find ckpt for epoch {epoch} in directory {os.path.join('logs', 'downloaded_ckpts', name)}"
+    model = Electra.load_from_checkpoint(ckpt_path, map_location="cuda:0", strict=False)
     print(f"Calculating predictions...")
     evaluate_model(
         model,
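
The new load_preds_labels_from_nonwandb mirrors the wandb path for local checkpoints stored under logs/downloaded_ckpts/<name>/. A usage sketch with a hypothetical run name (only the commented-out run in run_semloss_eval below hints at the real naming):

    # Hypothetical local run; expects a checkpoint file matching
    # logs/downloaded_ckpts/my_local_run/best_epoch=195*.ckpt
    preds, labels = load_preds_labels_from_nonwandb(
        name="my_local_run",
        epoch=195,
        chebi_version=231,
        test_on_data_cls=ChEBIOver100,
        kind="test",
    )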
@@ -130,7 +165,6 @@ def analyse_run(
         (dl.implication_filter_l, dl.implication_filter_r, "impl"),
         (dl.disjoint_filter_l, dl.disjoint_filter_r, "disj"),
     ]:
-        print(f"Calculating on {filter_type} loss")
         # prepare predictions
         n_loss_terms = dl_filter_l.shape[0]
         preds_exp = preds.unsqueeze(2).expand((-1, -1, n_loss_terms)).swapaxes(1, 2)
@@ -218,34 +252,135 @@ def analyse_run(
     gc.collect()
 
 
-def run_all(run_ids, datasets=None, chebi_version=231):
+def run_all(
+    run_ids,
+    datasets=None,
+    chebi_version=231,
+    skip_analyse=False,
+    skip_preds=False,
+    nonwandb_runs=None,
+):
     # evaluate a list of runs on Hazardous and ChEBIOver100 datasets
     if datasets is None:
         datasets = [(Hazardous, "all"), (ChEBIOver100, "test")]
     timestamp = datetime.now().strftime("%y%m%d-%H%M")
     results_path = os.path.join(
         "_semloss_eval", f"semloss_results_pc-dis-200k_{timestamp}.csv"
     )
-
+    api = wandb.Api()
     for run_id in run_ids:
+        try:
+            run = api.run(f"chebai/chebai/{run_id}")
+            epoch = get_best_epoch(run)
+            for test_on, kind in datasets:
+                df = {
+                    "run-id": run_id,
+                    "epoch": int(epoch),
+                    "kind": kind,
+                    "data_module": test_on.__name__,
+                    "chebi_version": chebi_version,
+                }
+                if not skip_preds:
+                    preds, labels = load_preds_labels_from_wandb(
+                        run, epoch, chebi_version, test_on, kind
+                    )
+                else:
+                    buffer_dir = os.path.join(
+                        "results_buffer",
+                        f"{run.name}_ep{epoch}",
+                        f"{test_on.__name__}_{kind}",
+                    )
+                    preds, labels = load_results_from_buffer(buffer_dir, device=DEVICE)
+                if not skip_analyse:
+                    print(
+                        f"Calculating metrics for run {run.name} on {test_on.__name__} ({kind})"
+                    )
+                    analyse_run(
+                        preds,
+                        labels,
+                        df_hyperparams=df,
+                        chebi_version=chebi_version,
+                        results_path=results_path,
+                    )
+        except Exception as e:
+            print(f"Failed for run {run_id}: {e}")
+            print(traceback.format_exc())
+
+    if nonwandb_runs:
+        for run_name, epoch in nonwandb_runs:
+            try:
+                for test_on, kind in datasets:
+                    df = {
+                        "run-id": run_name,
+                        "epoch": int(epoch),
+                        "kind": kind,
+                        "data_module": test_on.__name__,
+                        "chebi_version": chebi_version,
+                    }
+                    if not skip_preds:
+                        preds, labels = load_preds_labels_from_nonwandb(
+                            run_name, epoch, chebi_version, test_on, kind
+                        )
+                    else:
+                        buffer_dir = os.path.join(
+                            "results_buffer",
+                            f"{run_name}_ep{epoch}",
+                            f"{test_on.__name__}_{kind}",
+                        )
+                        preds, labels = load_results_from_buffer(
+                            buffer_dir, device=DEVICE
+                        )
+                    if not skip_analyse:
+                        print(
+                            f"Calculating metrics for run {run_name} on {test_on.__name__} ({kind})"
+                        )
+                        analyse_run(
+                            preds,
+                            labels,
+                            df_hyperparams=df,
+                            chebi_version=chebi_version,
+                            results_path=results_path,
+                        )
+            except Exception as e:
+                print(f"Failed for run {run_name}: {e}")
+                print(traceback.format_exc())
+
+
+def run_semloss_eval(mode="eval"):
+    non_wandb_runs = (
+        []
+    )  # ("chebi100_semprodk2_weighted_v231_pc_200k_dis_24042-2000", 195)]
+    if mode == "preds":
         api = wandb.Api()
-        run = api.run(f"chebai/chebai/{run_id}")
-        epoch = get_best_epoch(run)
-        for test_on, kind in datasets:
-            df = {
-                "run-id": run_id,
-                "epoch": int(epoch),
-                "kind": kind,
-                "data_module": test_on.__class__.__name__,
-                "chebi_version": chebi_version,
-            }
-            preds, labels = load_preds_labels_from_wandb(
-                run, epoch, chebi_version, test_on, kind
-            )
-            analyse_run(
-                preds,
-                labels,
-                df_hyperparams=df,
-                chebi_version=chebi_version,
-                results_path=results_path,
-            )
+        runs = api.runs("chebai/chebai", filters={"tags": "eval_semloss_paper"})
+        print(f"Found {len(runs)} tagged wandb runs")
+        ids = [run.id for run in runs]
+        run_all(ids, skip_analyse=True, nonwandb_runs=non_wandb_runs)
+
+    if mode == "eval":
+        new_14 = [
+            "e4ba0ff8",
+            "5ko8knb4",
+            "hk8555ff",
+            "r50ioujs",
+            "w0h3zr5s",
+            "e0lxw8py",
+            "0c0s48nh",
+            "lfg384bp",
+            "75o8bc3h",
+            "lig23cmg",
+            "qeghvubh",
+            "uke62a8m",
+            "061fd85t",
+            "tk15yznc",
+        ]
+        baseline = ["i4wtz1k4", "zd020wkv", "rc1q3t49"]
+        ids = baseline
+        run_all(ids, skip_preds=True, nonwandb_runs=non_wandb_runs)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        run_semloss_eval(sys.argv[1])
+    else:
+        run_semloss_eval()
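
With the new __main__ hook the evaluation splits into two passes selected by the first CLI argument. A plausible invocation from the repository root (the script path follows the file location shown above):

    python chebai/result/analyse_sem.py preds   # download checkpoints, buffer predictions
    python chebai/result/analyse_sem.py eval    # compute metrics from the buffered predictions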

chebai/result/utils.py

Lines changed: 12 additions & 6 deletions
@@ -9,7 +9,11 @@
 
 
 def get_checkpoint_from_wandb(
-    epoch, run, root=os.path.join("logs", "downloaded_ckpts"), model_class=None
+    epoch,
+    run,
+    root=os.path.join("logs", "downloaded_ckpts"),
+    model_class=None,
+    map_device_to=None,
 ):
     """Gets wandb checkpoint based on run and epoch, downloads it if necessary"""
     api = wandb.Api()
@@ -26,7 +30,7 @@ def get_checkpoint_from_wandb(
         print(f"Downloading checkpoint to {dest_path}")
         wandb_util.download_file_from_url(dest_path, file.url, api.api_key)
         return model_class.load_from_checkpoint(
-            dest_path, strict=False, map_location="cuda:0"
+            dest_path, strict=False, map_location=map_device_to
         )
     print(f"No model found for epoch {epoch}")
     return None
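
map_device_to is forwarded straight to load_from_checkpoint as map_location, so passing None keeps the device placement stored in the checkpoint. A hedged calling sketch (the run id is reused from the eval list above; the epoch and the Electra import path are assumptions):

    import wandb
    from chebai.models.electra import Electra  # import path assumed

    run = wandb.Api().run("chebai/chebai/e4ba0ff8")
    model = get_checkpoint_from_wandb(
        epoch=85,  # hypothetical epoch
        run=run,
        model_class=Electra,
        map_device_to="cpu",  # e.g. for a GPU-less machine; None keeps checkpoint devices
    )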
@@ -54,8 +58,9 @@ def evaluate_model(
         os.makedirs(buffer_dir, exist_ok=True)
         save_ind = 0
         save_batch_size = 4
-        n_saved = 0
+        n_saved = 1
 
+    print(f"")
     for i in tqdm.tqdm(range(0, len(data_list), batch_size)):
         if not (
             skip_existing_preds
@@ -74,7 +79,6 @@ def evaluate_model(
             preds_list.append(preds)
             labels_list.append(labels)
             if buffer_dir is not None:
-                n_saved += 1
                 if n_saved >= save_batch_size:
                     torch.save(
                         torch.cat(preds_list),
@@ -87,8 +91,10 @@ def evaluate_model(
                     )
                     preds_list = []
                     labels_list = []
-                    save_ind += 1
-                    n_saved = 0
+                if n_saved >= save_batch_size:
+                    save_ind += 1
+                    n_saved = 0
+                n_saved += 1
 
     if buffer_dir is None:
         test_preds = torch.cat(preds_list)
configs/data/chebi100.yml

Lines changed: 1 addition & 3 deletions
@@ -1,3 +1 @@
-class_path: chebai.preprocessing.datasets.chebi.ChEBIOver100
-init_args:
-  chebi_version: 231
+class_path: chebai.preprocessing.datasets.chebi.ChEBIOver100
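
With the pinned init_args gone, the data module falls back to its class default unless a version is supplied by the caller, which is what the eval script now does. A minimal sketch of the programmatic path used in analyse_sem.py:

    from chebai.preprocessing.datasets.chebi import ChEBIOver100

    # Version supplied by the caller instead of the yml config.
    data_module = ChEBIOver100(chebi_version=231)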
