
Commit fc31867

chore: allows evaluator to run on existing predictions (#734)
# Motivation

Allows evaluation to be run on an existing predictions jsonl file.

# Content

- Modified the logic that loads predictions so it checks for a consolidated jsonl file before creating one.

# Testing

Tested by running locally.

# Please check the following before marking your PR as ready for review

- [x] I have added tests for my changes
- [x] I have updated the documentation or added new documentation as needed
1 parent e23dac4 commit fc31867
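The change in a nutshell: the report step now reuses a consolidated `all_preds.jsonl` in the predictions directory when one already exists, instead of always rebuilding it from the individual `*.json` prediction files. A minimal sketch of that idea is below; the helper name `resolve_predictions_jsonl` and the `consolidate` callback are illustrative only, not the repo's actual API (the real logic lives in `generate_report`, see the report.py diff further down).

```python
from pathlib import Path
from typing import Callable


def resolve_predictions_jsonl(
    predictions_dir: Path,
    consolidate: Callable[[Path], Path],
) -> Path:
    """Illustrative helper: prefer an existing consolidated predictions file."""
    predictions_jsonl = predictions_dir / "all_preds.jsonl"
    if predictions_jsonl.exists():
        # Re-running the evaluator on existing predictions: skip consolidation.
        return predictions_jsonl
    # First run: build all_preds.jsonl from the per-example prediction files
    # (in the repo this is handled by preds_to_jsonl in report.py).
    return consolidate(predictions_dir)
```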

File tree

2 files changed: +20 -20 lines changed


codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -278,14 +278,14 @@ async def run_eval(
         "verified": SWEBenchDataset.VERIFIED,
     }
     dataset_enum = dataset_dict[dataset]
-    print(repo)
+
     examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
-    print(f"Examples:\n{'\n'.join([f'{e.instance_id} - {e.repo} - {e.base_commit}' for e in examples])}")
 
     try:
         if use_existing_preds is None:
+            print(f"Repo: {repo}")
+            print(f"Examples:\n{'\n'.join([f'{e.instance_id} - {e.repo} - {e.base_commit}' for e in examples])}")
             print(f"Processing {len(examples)} examples...")
-
             # Create output directory if it doesn't exist
             predictions_dir.mkdir(exist_ok=True, parents=True)
```

src/codegen/extensions/swebench/report.py

Lines changed: 17 additions & 17 deletions
```diff
@@ -113,6 +113,8 @@ def generate_report(predictions_dir: Path, logs_dir: Path, dataset: SWEBenchData
         print(f"Directory does not exist: {predictions_dir}")
         return 1
 
+    predictions_jsonl = predictions_dir / "all_preds.jsonl"
+    existing_preds = predictions_jsonl.exists()
     prediction_files = list(predictions_dir.glob("*.json"))
     print(f"Found {len(prediction_files)} prediction files")
 
@@ -126,29 +128,27 @@ def generate_report(predictions_dir: Path, logs_dir: Path, dataset: SWEBenchData
         except json.JSONDecodeError:
             print(f"Error reading JSON from {file_path}")
             continue
+    if not existing_preds:
+        if not predictions:
+            print("No valid predictions found")
+            return 1
 
-    print(f"Successfully loaded {len(predictions)} predictions")
+        print(f"Successfully loaded {len(predictions)} predictions")
 
-    if predictions:
-        # Create predictions JSONL file
         predictions_jsonl = preds_to_jsonl(predictions, predictions_dir)
-        print(f"\nCreated predictions JSONL: {predictions_jsonl}")
 
-        # Setup log directory
-        log_dir = logs_dir / "results"
-        log_dir.mkdir(exist_ok=True, parents=True)
-        print(f"Using log directory: {log_dir}")
+    # Setup log directory
+    log_dir = logs_dir / "results"
+    log_dir.mkdir(exist_ok=True, parents=True)
+    print(f"Using log directory: {log_dir}")
 
-        # Run evaluations
-        run_evals(predictions_jsonl, logs_dir, dataset, run_id)
+    # Run evaluations
+    run_evals(predictions_jsonl, logs_dir, dataset, run_id)
 
-        # Get and display report
-        report = get_report(predictions_jsonl, logs_dir)
+    # Get and display report
+    report = get_report(predictions_jsonl, logs_dir)
 
-        # Update prediction JSONs with results
-        predictions = update_pred_json(predictions, report, predictions_dir)
-    else:
-        print("No valid predictions found")
-        return 1
+    # Update prediction JSONs with results
+    predictions = update_pred_json(predictions, report, predictions_dir)
 
     return 0
```
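
Read together, the body of `generate_report` after this commit looks roughly like the sketch below. It is reconstructed from the diff above; the function signature is abbreviated, and the loop that fills `predictions` from the individual files (unchanged by this commit) is elided and assumed.

```python
def generate_report(predictions_dir, logs_dir, dataset, run_id):
    # ... existence check on predictions_dir (unchanged code) ...

    predictions_jsonl = predictions_dir / "all_preds.jsonl"
    existing_preds = predictions_jsonl.exists()
    prediction_files = list(predictions_dir.glob("*.json"))
    print(f"Found {len(prediction_files)} prediction files")

    predictions = []
    # ... each prediction file is parsed and appended to `predictions` (unchanged code) ...

    if not existing_preds:
        if not predictions:
            print("No valid predictions found")
            return 1

        print(f"Successfully loaded {len(predictions)} predictions")

        predictions_jsonl = preds_to_jsonl(predictions, predictions_dir)

    # Setup log directory
    log_dir = logs_dir / "results"
    log_dir.mkdir(exist_ok=True, parents=True)
    print(f"Using log directory: {log_dir}")

    # Run evaluations
    run_evals(predictions_jsonl, logs_dir, dataset, run_id)

    # Get and display report
    report = get_report(predictions_jsonl, logs_dir)

    # Update prediction JSONs with results
    predictions = update_pred_json(predictions, report, predictions_dir)

    return 0
```

When `all_preds.jsonl` is already present, the function skips straight to `run_evals`, which is what allows the evaluator to re-run on existing predictions.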

0 commit comments
