Commits (25)
5d58213  local files for benchmarking (chandra-siri, Dec 3, 2025)
c797586  Merge branch 'main' of github.com:googleapis/python-storage into bench (chandra-siri, Dec 23, 2025)
20d2d2d  add test_reads.py for microbenchmarking reads (chandra-siri, Dec 27, 2025)
f493bd8  push local files (chandra-siri, Dec 27, 2025)
68c8ba0  1p 1c working copy (chandra-siri, Dec 28, 2025)
9e2afa8  Add microbenchmarking tests and utility functions for performance ana… (chandra-siri, Dec 28, 2025)
3ffc98d  Update microbenchmark configuration and tests for improved performanc… (chandra-siri, Dec 28, 2025)
bef9dcb  upload local changes (chandra-siri, Dec 29, 2025)
75007a7  just upload one (chandra-siri, Dec 30, 2025)
a85fff1  Refactor get_persisted_size_async to improve logging and update get_p… (chandra-siri, Dec 31, 2025)
4c24f66  working copy (chandra-siri, Jan 2, 2026)
e216644  add regional tests (chandra-siri, Jan 3, 2026)
80120a1  Add JSON to CSV conversion script and update benchmark tests for mult… (chandra-siri, Jan 3, 2026)
99bc3eb  Refactor benchmark configuration and cleanup unused code in test_read… (chandra-siri, Jan 3, 2026)
f4a622b  Merge branch 'main' of github.com:googleapis/python-storage into bench (chandra-siri, Jan 3, 2026)
af98e0e  Implement write benchmarks (chandra-siri, Jan 3, 2026)
1405e92  Merge branch 'main' of github.com:googleapis/python-storage into bench (chandra-siri, Jan 4, 2026)
3c7e7af  Merge branch 'bench' of github.com:googleapis/python-storage into bench (chandra-siri, Jan 4, 2026)
970b162  working copy (chandra-siri, Jan 8, 2026)
0bf17c7  Add benchmarks for downloading and uploading large objects, and impro… (chandra-siri, Jan 11, 2026)
a7309ac  revert changes in `samples/snippets/storage_list_files_with_prefix.py` (chandra-siri, Jan 11, 2026)
829f0f4  Remove unused test utility file in asyncio tests (chandra-siri, Jan 11, 2026)
72e98d6  Remove deprecated benchmark scripts for downloading and uploading obj… (chandra-siri, Jan 11, 2026)
5586aa6  Remove crc32 benchmark script (chandra-siri, Jan 11, 2026)
e3797e4  add 'read_rand_multi_coros' in `test_downloads_single_proc_multi_coro` (chandra-siri, Jan 11, 2026)
3 changes: 3 additions & 0 deletions .gitignore
@@ -62,3 +62,6 @@ system_tests/local_test_setup
# Make sure a generated file isn't accidentally committed.
pylintrc
pylintrc.test

# Benchmarking results and logs
__benchmark_results__/**
191 changes: 191 additions & 0 deletions json_to_csv.py
@@ -0,0 +1,191 @@
import json
import csv
import argparse
import logging
import os
import numpy as np

MB = 1024 * 1024


def _process_benchmark_result(bench, headers, extra_info_headers, stats_headers):
"""
Process a single benchmark result and prepare it for CSV reporting.

This function extracts relevant statistics and metadata from a benchmark
run, calculates derived metrics like percentiles and throughput, and
formats it as a dictionary.

Args:
bench (dict): The dictionary for a single benchmark from the JSON output.
headers (list): The list of all header names for the CSV.
extra_info_headers (list): Headers from the 'extra_info' section.
stats_headers (list): Headers from the 'stats' section.

"""
row = {h: "" for h in headers}
row["name"] = bench.get("name", "")
row["group"] = bench.get("group", "")

extra_info = bench.get("extra_info", {})

# Populate extra_info and stats
for key in extra_info_headers:
row[key] = extra_info.get(key)
for key in stats_headers:
row[key] = bench.get("stats", {}).get(key)

# Handle threads/coros mapping
if "threads" in row:
row["threads"] = extra_info.get("num_coros", extra_info.get("coros"))
Comment on lines +39 to +40
Contributor
medium

With the suggested change in _utils.py to use the threads key directly, this special handling for mapping coroutines to threads is no longer necessary and can be removed. The generic loop that populates row from extra_info will handle this automatically.
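For concreteness, a sketch of what remains once that suggestion is applied (assuming `_utils.py` writes the standardized `threads` key directly): the special case is deleted and only the generic loop is needed.

    for key in extra_info_headers:
        row[key] = extra_info.get(key)  # now also copies "threads" directly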


    # Calculate percentiles
    timings = bench.get("stats", {}).get("data")
    if timings:
        row["p90"] = np.percentile(timings, 90)
        row["p95"] = np.percentile(timings, 95)
        row["p99"] = np.percentile(timings, 99)

    # Calculate max throughput
    file_size = extra_info.get("file_size_bytes", extra_info.get("file_size", 0))
Contributor
medium

Following the suggested standardization to use file_size_bytes as the key, this line can be simplified.

Suggested change
    file_size = extra_info.get("file_size_bytes", extra_info.get("file_size", 0))
    file_size = extra_info.get("file_size_bytes", 0)

    num_files = extra_info.get("num_files", 1)
    total_bytes = file_size * num_files

    min_time = bench.get("stats", {}).get("min")
    if min_time and min_time > 0:
        row["max_throughput_mb_s"] = (total_bytes / min_time) / MB
    else:
        row["max_throughput_mb_s"] = 0.0

    return row


def _generate_report(json_path, csv_path):
"""Generate a CSV summary report from the pytest-benchmark JSON output.

Args:
json_path (str): The path to the JSON file containing benchmark results.
csv_path (str): The path where the CSV report will be saved.

Returns:
str: The path to the generated CSV report file.

"""
logging.info(f"Generating CSV report from {json_path}")

with open(json_path, "r") as f:
data = json.load(f)

benchmarks = data.get("benchmarks", [])
if not benchmarks:
logging.warning("No benchmarks found in the JSON file.")
return

    # Preferred header order; any columns not listed here are appended
    # afterwards.
    ordered_headers = [
        "name",
        "group",
        "block_size",
        "bucket_name",
        "bucket_type",
        "chunk_size",
        "cpu_max_global",
        "file_size",
Comment on lines +92 to +94
Contributor
medium

To be consistent with the standardized keys suggested for _utils.py, chunk_size and file_size in ordered_headers should be updated to chunk_size_bytes and file_size_bytes respectively.

Suggested change
"chunk_size",
"cpu_max_global",
"file_size",
"chunk_size_bytes",
"cpu_max_global",
"file_size_bytes",

"mem_max",
"net_throughput_mb_s",
"num_files",
"pattern",
"processes",
"rounds",
"threads",
"vcpus",
"min",
"max",
"mean",
"median",
"stddev",
"p90",
"p95",
"p99",
"max_throughput_mb_s",
]

# Gather all available headers from the data
all_available_headers = set(["name", "group"])
stats_headers = ["min", "max", "mean", "median", "stddev"]
custom_headers = ["p90", "p95", "p99", "max_throughput_mb_s"]

all_available_headers.update(stats_headers)
all_available_headers.update(custom_headers)

extra_info_keys = set()
for bench in benchmarks:
if "extra_info" in bench and isinstance(bench["extra_info"], dict):
extra_info_keys.update(bench["extra_info"].keys())
all_available_headers.update(extra_info_keys)

# Construct the final header list
final_headers = list(ordered_headers)

# Add any headers from the data that are not in the ordered list
for header in sorted(list(all_available_headers)):
if header not in final_headers:
final_headers.append(header)

# We still need the full list of extra_info headers for _process_benchmark_result
extra_info_headers = sorted(list(extra_info_keys))

with open(csv_path, "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(final_headers)

for bench in benchmarks:
row = _process_benchmark_result(
bench, final_headers, extra_info_headers, stats_headers
)
writer.writerow([row.get(h, "") for h in final_headers])

logging.info(f"CSV report generated at {csv_path}")
return csv_path
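For orientation, a minimal sketch (as a Python literal, values illustrative) of the pytest-benchmark JSON shape `_generate_report` consumes; only the keys this script reads are shown:

    data = {
        "benchmarks": [
            {
                "name": "test_read_seq",  # illustrative test name
                "group": "read",
                "stats": {
                    "min": 1.2, "max": 2.0, "mean": 1.5, "median": 1.4,
                    "stddev": 0.1, "data": [1.2, 1.4, 2.0],
                },
                "extra_info": {"file_size": 1048576, "num_files": 4},
            }
        ]
    }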


def main():
"""
Converts a JSON benchmark file to a CSV file.

The CSV file will contain the 'name' of each benchmark and all fields
from the 'extra_info' section.
"""
parser = argparse.ArgumentParser(description="Convert benchmark JSON to CSV.")
parser.add_argument(
"--input_file",
nargs="?",
default="output.json",
help="Path to the input JSON file (default: output.json)",
)
parser.add_argument(
"--output_file",
nargs="?",
default="output.csv",
help="Path to the output CSV file (default: output.csv)",
)
args = parser.parse_args()

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

try:
_generate_report(args.input_file, args.output_file)
print(f"Successfully converted {args.input_file} to {args.output_file}")
except FileNotFoundError:
logging.error(f"Error: Input file not found at {args.input_file}")
except json.JSONDecodeError:
logging.error(f"Error: Could not decode JSON from {args.input_file}")
except Exception as e:
logging.error(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    main()
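For reference, a typical invocation (the file names are the argparse defaults defined above):

    python json_to_csv.py --input_file output.json --output_file output.csv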
Empty file added tests/perf/__init__.py
Empty file.
Empty file.
85 changes: 85 additions & 0 deletions tests/perf/microbenchmarks/_utils.py
@@ -0,0 +1,85 @@
from typing import Any, List
import statistics


def publish_benchmark_extra_info(
    benchmark: Any,
    params: Any,
    benchmark_group: str = "read",
    true_times: List[float] = [],
) -> None:
"""
Helper function to publish benchmark parameters to the extra_info property.
"""

benchmark.extra_info["num_files"] = params.num_files
benchmark.extra_info["file_size"] = params.file_size_bytes
benchmark.extra_info["chunk_size"] = params.chunk_size_bytes
if benchmark_group == "write":
benchmark.extra_info["pattern"] = "seq"
else:
benchmark.extra_info["pattern"] = params.pattern
benchmark.extra_info["coros"] = params.num_coros
benchmark.extra_info["rounds"] = params.rounds
benchmark.extra_info["bucket_name"] = params.bucket_name
benchmark.extra_info["bucket_type"] = params.bucket_type
benchmark.extra_info["processes"] = params.num_processes
Comment on lines +15 to +26
Contributor
medium

To improve consistency with the parameter names and the CSV conversion script, I suggest standardizing the keys used in extra_info.

  • file_size should be file_size_bytes.
  • chunk_size should be chunk_size_bytes.
  • coros should be threads to match the CSV header in json_to_csv.py.

These changes will simplify the logic in json_to_csv.py.

Suggested change
benchmark.extra_info["num_files"] = params.num_files
benchmark.extra_info["file_size"] = params.file_size_bytes
benchmark.extra_info["chunk_size"] = params.chunk_size_bytes
if benchmark_group == "write":
benchmark.extra_info["pattern"] = "seq"
else:
benchmark.extra_info["pattern"] = params.pattern
benchmark.extra_info["coros"] = params.num_coros
benchmark.extra_info["rounds"] = params.rounds
benchmark.extra_info["bucket_name"] = params.bucket_name
benchmark.extra_info["bucket_type"] = params.bucket_type
benchmark.extra_info["processes"] = params.num_processes
benchmark.extra_info["num_files"] = params.num_files
benchmark.extra_info["file_size_bytes"] = params.file_size_bytes
benchmark.extra_info["chunk_size_bytes"] = params.chunk_size_bytes
if benchmark_group == "write":
benchmark.extra_info["pattern"] = "seq"
else:
benchmark.extra_info["pattern"] = params.pattern
benchmark.extra_info["threads"] = params.num_coros
benchmark.extra_info["rounds"] = params.rounds
benchmark.extra_info["bucket_name"] = params.bucket_name
benchmark.extra_info["bucket_type"] = params.bucket_type
benchmark.extra_info["processes"] = params.num_processes

    benchmark.group = benchmark_group

    object_size = params.file_size_bytes
    num_files = params.num_files
    min_throughput = (object_size / (1024 * 1024) * num_files) / benchmark.stats["max"]
    max_throughput = (object_size / (1024 * 1024) * num_files) / benchmark.stats["min"]
    mean_throughput = (object_size / (1024 * 1024) * num_files) / benchmark.stats["mean"]
    median_throughput = (
        object_size / (1024 * 1024) * num_files
    ) / benchmark.stats["median"]

    benchmark.extra_info["throughput_MiB_s_min"] = min_throughput
    benchmark.extra_info["throughput_MiB_s_max"] = max_throughput
    benchmark.extra_info["throughput_MiB_s_mean"] = mean_throughput
    benchmark.extra_info["throughput_MiB_s_median"] = median_throughput

    print("\nThroughput Statistics (MiB/s):")
    print(f"  Min: {min_throughput:.2f} (from max time)")
    print(f"  Max: {max_throughput:.2f} (from min time)")
    print(f"  Mean: {mean_throughput:.2f} (approx, from mean time)")
    print(f"  Median: {median_throughput:.2f} (approx, from median time)")

    if true_times:
        throughputs = [(object_size / (1024 * 1024) * num_files) / t for t in true_times]
        true_min_throughput = min(throughputs)
        true_max_throughput = max(throughputs)
        true_mean_throughput = statistics.mean(throughputs)
        true_median_throughput = statistics.median(throughputs)

        benchmark.extra_info["true_throughput_MiB_s_min"] = true_min_throughput
        benchmark.extra_info["true_throughput_MiB_s_max"] = true_max_throughput
        benchmark.extra_info["true_throughput_MiB_s_mean"] = true_mean_throughput
        benchmark.extra_info["true_throughput_MiB_s_median"] = true_median_throughput

        print("\nThroughput Statistics from true_times (MiB/s):")
        print(f"  Min: {true_min_throughput:.2f}")
        print(f"  Max: {true_max_throughput:.2f}")
        print(f"  Mean: {true_mean_throughput:.2f}")
        print(f"  Median: {true_median_throughput:.2f}")

    # Get benchmark name, rounds, and iterations
    name = benchmark.name
    rounds = benchmark.stats["rounds"]
    iterations = benchmark.stats["iterations"]

    # Header for throughput table
    header = "\n\n" + "-" * 125 + "\n"
    header += "Throughput Benchmark (MiB/s)\n"
    header += "-" * 125 + "\n"
    header += f"{'Name':<50} {'Min':>10} {'Max':>10} {'Mean':>10} {'StdDev':>10} {'Median':>10} {'Rounds':>8} {'Iterations':>12}\n"
    header += "-" * 125

    # Data row for throughput table.
    # The Min/Max/Mean/Median columns refer to the throughput values.
    row = f"{name:<50} {min_throughput:>10.4f} {max_throughput:>10.4f} {mean_throughput:>10.4f} {'N/A':>10} {median_throughput:>10.4f} {rounds:>8} {iterations:>12}"
Contributor
medium

The throughput table header includes StdDev, but the value is hardcoded to N/A. Since true_times are available, the standard deviation of throughput can be calculated and displayed. This would provide more insight into the variability of the performance.

You can calculate it in the if true_times: block (lines 49-65) and then use the result here. For example:

# Inside `if true_times:` (a sketch; stdev needs at least two samples)
if len(throughputs) > 1:
    stddev_throughput = statistics.stdev(throughputs)
else:
    stddev_throughput = float("nan")
# ... store it in extra_info and format it into `row` in place of "N/A"


    print(header)
    print(row)
    print("-" * 125)