Skip to content

Commit 8c63a63

Browse files
analysis: also consider f1-score
1 parent 23125e6 commit 8c63a63

File tree

4 files changed

+31
-13
lines changed

4 files changed

+31
-13
lines changed

src/main/python/result_analysis/eval_setup.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,7 @@ def __repr__(self):
 
 
 class Metric(Enum):
-    Precision = "precision"
-    Recall = "recall"
+    F1Score = "f1_score"
     Automation = "patch_automation"
     EditDistance = "avg_edit_distance"
     Runtime = "avg_runtime"
@@ -188,8 +187,7 @@ def __repr__(self):
 
     def nice_name(self):
         return {
-            Metric.Precision: "Precision",
-            Metric.Recall: "Recall",
+            Metric.F1Score: "F1 Score",
            Metric.Automation: "Autom. (\\%)",
            Metric.EditDistance: "Req. Fixes",
            Metric.Runtime: "Time (s)",
@@ -202,22 +200,31 @@ def __init__(
         patcher: Patcher,
         precision: float,
         recall: float,
+        f1_score: float,
         patch_automation: float,
         avg_edit_distance: float,
         avg_runtime: float,
     ):
         self.precision = np.array([precision])
         self.recall = np.array([recall])
+        self.f1_score = np.array([f1_score])
         self.patcher = patcher
         self.patch_automation = np.array([patch_automation])
         self.avg_edit_distance = np.array([avg_edit_distance])
         self.avg_runtime = np.array([avg_runtime])
 
     def add_data(
-        self, precision, recall, patch_automation, avg_edit_distance, avg_runtime
+        self,
+        precision,
+        recall,
+        f1_score,
+        patch_automation,
+        avg_edit_distance,
+        avg_runtime,
     ):
         self.precision = np.append(self.precision, precision)
         self.recall = np.append(self.recall, recall)
+        self.f1_score = np.append(self.f1_score, f1_score)
         self.patch_automation = np.append(self.patch_automation, patch_automation)
         self.avg_edit_distance = np.append(self.avg_edit_distance, avg_edit_distance)
         self.avg_runtime = np.append(self.avg_runtime, avg_runtime)

src/main/python/result_analysis/latex.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def generate_metrics_result_table(
     # Write the multi-rows and their corresponding rows
     file.write("\\toprule\n")
     for metric in Metric:
-        if metric != Metric.Precision:
+        if metric != Metric.F1Score:
             file.write("\\midrule\n")
         file.write(
             "\\multirow{"
@@ -52,9 +52,7 @@ def generate_metrics_result_table(
                 results = results_per_patcher[patcher][language].per_patch
                 value = np.nanmean(results.get(metric))
                 postfix = ""
-                if metric == Metric.Precision:
-                    best_type = "max"
-                elif metric == Metric.Recall:
+                if metric == Metric.F1Score:
                     best_type = "max"
                 elif metric == Metric.Automation:
                     value = 100 * value
@@ -125,9 +123,7 @@ def determine_best_average(differences, patcher_names, metric):
         results = differences[patcher][metric][0]
         values.append(results)
 
-        if metric == Metric.Precision:
-            best_type = "max"
-        elif metric == Metric.Recall:
+        if metric == Metric.F1Score:
             best_type = "max"
         elif metric == Metric.Automation:
             best_type = "max"

src/main/python/result_analysis/metrics.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,11 @@ def calculate_precision_recall(tp, fp, fn):
     precision = tp / float(tp + fp) if (tp + fp) > 0 else np.nan
     recall = tp / float(tp + fn) if (tp + fn) > 0 else np.nan
     return precision, recall
+
+
+def calculate_f1_score(precision, recall):
+    return (
+        2 * (precision * recall) / (precision + recall)
+        if (precision + recall) > 0
+        else np.nan
+    )

src/main/python/result_analysis/result_handling.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
 from result_analysis.eval_setup import Patcher
 from result_analysis.eval_setup import RQ3PatcherData
 from result_analysis.io import load_all_results
-from result_analysis.metrics import calculate_precision_recall
+from result_analysis.metrics import calculate_f1_score, calculate_precision_recall
 from collections import defaultdict
 import numpy as np
 
1313

@@ -188,6 +188,7 @@ def accumulate_data_per_patcher(
             oc.fp(),
             oc.fn(),
         )
+        f1 = calculate_f1_score(p, r)
         # if np.isnan(p) or np.isnan(r):
         #     continue
         num_incorrect = oc.num_incorrect()
@@ -200,6 +201,7 @@ def accumulate_data_per_patcher(
             patcher=patcher,
             precision=p,
             recall=r,
+            f1_score=f1,
             patch_automation=a,
             avg_edit_distance=num_incorrect,
             avg_runtime=res.patch_duration,
@@ -208,6 +210,7 @@ def accumulate_data_per_patcher(
         data_per_patch.add_data(
             precision=p,
             recall=r,
+            f1_score=f1,
             patch_automation=a,
             avg_edit_distance=num_incorrect,
             avg_runtime=res.patch_duration,
@@ -219,6 +222,8 @@ def accumulate_data_per_patcher(
             fn=fn,
         )
 
+    f1_score = calculate_f1_score(precision, recall)
+
     oa = overall_automation(results)
     (average_ed, _) = edit_distance(results)
     (average_run, _) = runtime(results)
@@ -230,6 +235,7 @@ def accumulate_data_per_patcher(
     accumulated_data.accumulated_data.add_data(
         precision=precision,
         recall=recall,
+        f1_score=f1_score,
         patch_automation=oa,
         avg_edit_distance=average_ed,
         avg_runtime=average_run,
@@ -240,6 +246,7 @@ def accumulate_data_per_patcher(
         patcher=patcher,
         precision=precision,
         recall=recall,
+        f1_score=f1_score,
         patch_automation=oa,
         avg_edit_distance=average_ed,
         avg_runtime=average_run,

0 commit comments

Comments
 (0)