From fa45591ccd334f0060db2dee27ab1c3160ba7019 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Fri, 3 Oct 2025 13:32:13 +0300 Subject: [PATCH 1/6] Enable summarization by subsets and groups Signed-off-by: Jonathan Bnayahu --- src/unitxt/evaluate_cli.py | 128 +++++++++++++++++++++++++++---------- 1 file changed, 96 insertions(+), 32 deletions(-) diff --git a/src/unitxt/evaluate_cli.py b/src/unitxt/evaluate_cli.py index 2e416a3bd6..5052db0752 100644 --- a/src/unitxt/evaluate_cli.py +++ b/src/unitxt/evaluate_cli.py @@ -7,7 +7,7 @@ import platform import subprocess import sys -from datetime import datetime +from datetime import datetime, timezone from functools import partial from typing import Any, Dict, List, Optional, Tuple, Union @@ -691,9 +691,8 @@ def _save_results_to_disk( "results": global_scores, } - # prepend to the results_path name the time in a wat like this: 2025-04-04T11:37:32 - - timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + # prepend the timestamp in UTC (e.g., 2025-01-18T11-37-32) to the file names + timestamp = datetime.now().astimezone(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S") results_path = prepend_timestamp_to_path(results_path, timestamp) samples_path = prepend_timestamp_to_path(samples_path, timestamp) @@ -836,47 +835,112 @@ def main(): logger.info("Unitxt Evaluation CLI finished successfully.") -def extract_scores(directory): # pragma: no cover +def extract_scores(folder: str, subset: str, group: str): # pragma: no cover import pandas as pd - data = [] + def safe_score(d: dict, key="score"): + na = "N/A" + return d.get(key, na) if isinstance(d, dict) else na - for filename in sorted(os.listdir(directory)): - if filename.endswith("evaluation_results.json"): - file_path = os.path.join(directory, filename) - try: - with open(file_path, encoding="utf-8") as f: - content = json.load(f) + def extract_subset(results: dict, subset: str, group: str): + subset_results = results.get(subset, {}) + row = {subset: safe_score(subset_results)} + + groups = subset_results.get("groups", {}) + + if not groups: + return row - env_info = content.get("environment_info", {}) - timestamp = env_info.get("timestamp_utc", "N/A") - model = env_info.get("parsed_arguments", {}).get("model", "N/A") - results = content.get("results", {}) + group_results = groups.get(group) if group else next(iter(groups.values()), {}) - row = {} - row["Model"] = model - row["Timestamp"] = timestamp - row["Average"] = results.get("score", "N/A") + if not isinstance(group_results, dict): + return row - for key in results.keys(): - if isinstance(results[key], dict): - score = results[key].get("score", "N/A") - row[key] = score + row.update( + {k: safe_score(v) for k, v in group_results.items() if isinstance(v, dict)} + ) + return row + + def extract_all(results: dict): + row = {"Average": safe_score(results)} + row.update( + {k: safe_score(v) for k, v in results.items() if isinstance(v, dict)} + ) + return row + + data = [] - data.append(row) - except Exception as e: - logger.error(f"Error parsing results file {filename}: {e}.") + for filename in sorted(os.listdir(folder)): + if not filename.endswith("evaluation_results.json"): + continue + + file_path = os.path.join(folder, filename) + try: + with open(file_path, encoding="utf-8") as f: + content = json.load(f) + + env_info = content.get("environment_info", {}) + row = { + "Timestamp": safe_score(env_info, "timestamp_utc"), + "Model": safe_score(env_info.get("parsed_arguments", {}), "model"), + } + + results = content.get("results", {}) + + extra = ( + extract_subset(results, subset, group) + if subset + else extract_all(results) + ) + row.update(extra) + data.append(row) + except Exception as e: + logger.error(f"Error parsing results file {filename}: {e}.") return pd.DataFrame(data).sort_values(by="Timestamp", ascending=True) +def setup_summarization_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter, + description="CLI utility for summarizing evaluation results.", + ) + + parser.add_argument( + "--folder", + "-f", + dest="folder", + type=str, + default=".", + help="Directory containing evaluation results json files. Default: current folder.\n", + ) + + parser.add_argument( + "--subset", + "-s", + type=str, + dest="subset", + default=None, + help="Subset to filter results by. Default: none.", + ) + + parser.add_argument( + "--group", + "-g", + type=str, + dest="group", + default=None, + help="Group to filter results to. Requires specifying a subset. Default: first group.", + ) + + return parser + + def summarize_cli(): - if len(sys.argv) != 2: - logger.error("Usage: python summarize_cli_results.py ") - sys.exit(1) - directory = sys.argv[1] - df = extract_scores(directory) + parser = setup_summarization_parser() + args = parser.parse_args() + df = extract_scores(args.folder, args.subset, args.group) logger.info(df.to_markdown(index=False)) From 2bd30d4e0f7adbd48665058b6e5efd4046074499 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Sun, 5 Oct 2025 09:37:45 +0300 Subject: [PATCH 2/6] add output formats. Fix column order. Signed-off-by: Jonathan Bnayahu --- src/unitxt/evaluate_cli.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/unitxt/evaluate_cli.py b/src/unitxt/evaluate_cli.py index 5052db0752..b16b7b01f3 100644 --- a/src/unitxt/evaluate_cli.py +++ b/src/unitxt/evaluate_cli.py @@ -881,8 +881,8 @@ def extract_all(results: dict): env_info = content.get("environment_info", {}) row = { - "Timestamp": safe_score(env_info, "timestamp_utc"), "Model": safe_score(env_info.get("parsed_arguments", {}), "model"), + "Timestamp": safe_score(env_info, "timestamp_utc"), } results = content.get("results", {}) @@ -933,6 +933,16 @@ def setup_summarization_parser() -> argparse.ArgumentParser: help="Group to filter results to. Requires specifying a subset. Default: first group.", ) + parser.add_argument( + "--output", + "-o", + type=str, + choices=["markdown", "csv"], + dest="output", + default="markdown", + help="Output format. Can be markdown or csv. Default: markdown", + ) + return parser @@ -941,7 +951,13 @@ def summarize_cli(): args = parser.parse_args() df = extract_scores(args.folder, args.subset, args.group) - logger.info(df.to_markdown(index=False)) + + if args.output == "markdown": + logger.info(df.to_markdown(index=False)) + elif args.output == "csv": + logger.info(df.to_csv(index=False)) + else: + logger.error(f"Unsupported output format: {args.output}") if __name__ == "__main__": From 085b4a9399f7676952161119e7f9c359d7b5721d Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Sun, 5 Oct 2025 10:40:13 +0300 Subject: [PATCH 3/6] update the example script Signed-off-by: Jonathan Bnayahu --- examples/evaluate_bluebench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluate_bluebench.sh b/examples/evaluate_bluebench.sh index 181d6068a0..d05d4ccadd 100644 --- a/examples/evaluate_bluebench.sh +++ b/examples/evaluate_bluebench.sh @@ -28,4 +28,4 @@ unitxt-evaluate \ --batch_size 8 \ --verbosity ERROR -unitxt-summarize ./results/bluebench +unitxt-summarize --folder ./results/bluebench From 241e2b3acc3b1b9a59886556ade9eb5c69352668 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Thu, 30 Oct 2025 15:08:48 +0200 Subject: [PATCH 4/6] fix test to use UTC time Signed-off-by: Jonathan Bnayahu --- tests/library/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/library/test_cli.py b/tests/library/test_cli.py index 75c2a86378..199b011b30 100644 --- a/tests/library/test_cli.py +++ b/tests/library/test_cli.py @@ -750,7 +750,7 @@ def test_save_results_to_disk_summary_only( """Test saving only the summary results file (log_samples=False).""" # --- Arrange --- # (Arrange section remains the same as previous version) - mock_timestamp = "2025-04-14T10:00:00" + mock_timestamp = "2025-04-14T08:00:00" mock_now = MagicMock() mock_now.strftime.return_value = mock_timestamp mock_datetime.now.return_value = mock_now From 37c29b58463c8e0ee3599b56c201da556056a380 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Thu, 30 Oct 2025 16:17:30 +0200 Subject: [PATCH 5/6] further fix to the test Signed-off-by: Jonathan Bnayahu --- tests/library/test_cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/library/test_cli.py b/tests/library/test_cli.py index 199b011b30..1301e13c68 100644 --- a/tests/library/test_cli.py +++ b/tests/library/test_cli.py @@ -750,13 +750,15 @@ def test_save_results_to_disk_summary_only( """Test saving only the summary results file (log_samples=False).""" # --- Arrange --- # (Arrange section remains the same as previous version) - mock_timestamp = "2025-04-14T08:00:00" + mock_timestamp = "2025-04-14T10:00:00" mock_now = MagicMock() mock_now.strftime.return_value = mock_timestamp mock_datetime.now.return_value = mock_now mock_utcnow = MagicMock() mock_utcnow.isoformat.return_value = "2025-04-14T08:00:00" mock_datetime.utcnow.return_value = mock_utcnow + mock_astimezone = MagicMock() + mock_astimezone.strftime.return_value = "2025-04-14T08:00:00" args = argparse.Namespace( log_samples=False, From 4b606b7a8b9f9859e60e32cbc2ec41cc894674fe Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Thu, 30 Oct 2025 16:40:12 +0200 Subject: [PATCH 6/6] now actually fixed Signed-off-by: Jonathan Bnayahu --- tests/library/test_cli.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/library/test_cli.py b/tests/library/test_cli.py index 1301e13c68..dc5e922d9b 100644 --- a/tests/library/test_cli.py +++ b/tests/library/test_cli.py @@ -751,14 +751,16 @@ def test_save_results_to_disk_summary_only( # --- Arrange --- # (Arrange section remains the same as previous version) mock_timestamp = "2025-04-14T10:00:00" + mock_timestamp_utc = "2025-04-14T08:00:00" mock_now = MagicMock() mock_now.strftime.return_value = mock_timestamp + mock_astimezone = MagicMock() + mock_astimezone.strftime.return_value = mock_timestamp_utc + mock_now.astimezone.return_value = mock_astimezone mock_datetime.now.return_value = mock_now mock_utcnow = MagicMock() mock_utcnow.isoformat.return_value = "2025-04-14T08:00:00" mock_datetime.utcnow.return_value = mock_utcnow - mock_astimezone = MagicMock() - mock_astimezone.strftime.return_value = "2025-04-14T08:00:00" args = argparse.Namespace( log_samples=False, @@ -786,7 +788,9 @@ def test_save_results_to_disk_summary_only( } base_results_path = "/out/results_prefix.json" base_samples_path = "/out/results_prefix_samples.json" - expected_timestamped_results_path = f"/out/{mock_timestamp}_results_prefix.json" + expected_timestamped_results_path = ( + f"/out/{mock_timestamp_utc}_results_prefix.json" + ) # --- Act --- cli._save_results_to_disk( @@ -846,7 +850,7 @@ def test_save_results_to_disk_summary_only( ) log_calls = [call[0][0] for call in mock_logger.info.call_args_list] expected_timestamped_samples_path = ( - f"/out/{mock_timestamp}_results_prefix_samples.json" + f"/out/{mock_timestamp_utc}_results_prefix_samples.json" ) self.assertNotIn( f"Saving detailed samples to: {expected_timestamped_samples_path}",