Skip to content

Commit 8c63a63

Browse files
analysis: also consider f1-score
1 parent 23125e6 commit 8c63a63

File tree

4 files changed

+31
-13
lines changed

4 files changed

+31
-13
lines changed

src/main/python/result_analysis/eval_setup.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,7 @@ def __repr__(self):
 
 
 class Metric(Enum):
-    Precision = "precision"
-    Recall = "recall"
+    F1Score = "f1_score"
     Automation = "patch_automation"
     EditDistance = "avg_edit_distance"
     Runtime = "avg_runtime"
@@ -188,8 +187,7 @@ def __repr__(self):
 
     def nice_name(self):
         return {
-            Metric.Precision: "Precision",
-            Metric.Recall: "Recall",
+            Metric.F1Score: "F1 Score",
            Metric.Automation: "Autom. (\\%)",
            Metric.EditDistance: "Req. Fixes",
            Metric.Runtime: "Time (s)",
@@ -202,22 +200,31 @@ def __init__(
         patcher: Patcher,
         precision: float,
         recall: float,
+        f1_score: float,
         patch_automation: float,
         avg_edit_distance: float,
         avg_runtime: float,
     ):
         self.precision = np.array([precision])
         self.recall = np.array([recall])
+        self.f1_score = np.array([f1_score])
         self.patcher = patcher
         self.patch_automation = np.array([patch_automation])
         self.avg_edit_distance = np.array([avg_edit_distance])
         self.avg_runtime = np.array([avg_runtime])
 
     def add_data(
-        self, precision, recall, patch_automation, avg_edit_distance, avg_runtime
+        self,
+        precision,
+        recall,
+        f1_score,
+        patch_automation,
+        avg_edit_distance,
+        avg_runtime,
     ):
         self.precision = np.append(self.precision, precision)
         self.recall = np.append(self.recall, recall)
+        self.f1_score = np.append(self.f1_score, f1_score)
         self.patch_automation = np.append(self.patch_automation, patch_automation)
         self.avg_edit_distance = np.append(self.avg_edit_distance, avg_edit_distance)
         self.avg_runtime = np.append(self.avg_runtime, avg_runtime)

src/main/python/result_analysis/latex.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def generate_metrics_result_table(
     # Write the multi-rows and their corresponding rows
     file.write("\\toprule\n")
     for metric in Metric:
-        if metric != Metric.Precision:
+        if metric != Metric.F1Score:
             file.write("\\midrule\n")
         file.write(
             "\\multirow{"
@@ -52,9 +52,7 @@ def generate_metrics_result_table(
                 results = results_per_patcher[patcher][language].per_patch
                 value = np.nanmean(results.get(metric))
                 postfix = ""
-                if metric == Metric.Precision:
-                    best_type = "max"
-                elif metric == Metric.Recall:
+                if metric == Metric.F1Score:
                     best_type = "max"
                 elif metric == Metric.Automation:
                     value = 100 * value
@@ -125,9 +123,7 @@ def determine_best_average(differences, patcher_names, metric):
         results = differences[patcher][metric][0]
         values.append(results)
 
-        if metric == Metric.Precision:
-            best_type = "max"
-        elif metric == Metric.Recall:
+        if metric == Metric.F1Score:
             best_type = "max"
         elif metric == Metric.Automation:
             best_type = "max"

src/main/python/result_analysis/metrics.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,11 @@ def calculate_precision_recall(tp, fp, fn):
     precision = tp / float(tp + fp) if (tp + fp) > 0 else np.nan
     recall = tp / float(tp + fn) if (tp + fn) > 0 else np.nan
     return precision, recall
+
+
+def calculate_f1_score(precision, recall):
+    return (
+        2 * (precision * recall) / (precision + recall)
+        if (precision + recall) > 0
+        else np.nan
+    )

src/main/python/result_analysis/result_handling.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
 from result_analysis.eval_setup import Patcher
 from result_analysis.eval_setup import RQ3PatcherData
 from result_analysis.io import load_all_results
-from result_analysis.metrics import calculate_precision_recall
+from result_analysis.metrics import calculate_f1_score, calculate_precision_recall
 from collections import defaultdict
 import numpy as np
 
1313

@@ -188,6 +188,7 @@ def accumulate_data_per_patcher(
             oc.fp(),
             oc.fn(),
         )
+        f1 = calculate_f1_score(p, r)
         # if np.isnan(p) or np.isnan(r):
         #     continue
         num_incorrect = oc.num_incorrect()
@@ -200,6 +201,7 @@ def accumulate_data_per_patcher(
             patcher=patcher,
             precision=p,
             recall=r,
+            f1_score=f1,
             patch_automation=a,
             avg_edit_distance=num_incorrect,
             avg_runtime=res.patch_duration,
@@ -208,6 +210,7 @@ def accumulate_data_per_patcher(
         data_per_patch.add_data(
             precision=p,
             recall=r,
+            f1_score=f1,
             patch_automation=a,
             avg_edit_distance=num_incorrect,
             avg_runtime=res.patch_duration,
@@ -219,6 +222,8 @@ def accumulate_data_per_patcher(
             fn=fn,
         )
 
+    f1_score = calculate_f1_score(precision, recall)
+
     oa = overall_automation(results)
     (average_ed, _) = edit_distance(results)
     (average_run, _) = runtime(results)
@@ -230,6 +235,7 @@ def accumulate_data_per_patcher(
     accumulated_data.accumulated_data.add_data(
         precision=precision,
         recall=recall,
+        f1_score=f1_score,
         patch_automation=oa,
         avg_edit_distance=average_ed,
         avg_runtime=average_run,
@@ -240,6 +246,7 @@ def accumulate_data_per_patcher(
         patcher=patcher,
         precision=precision,
         recall=recall,
+        f1_score=f1_score,
         patch_automation=oa,
         avg_edit_distance=average_ed,
         avg_runtime=average_run,

0 commit comments

Comments
 (0)