diff --git a/benchmarks/output_branch_size_scan/config.yml b/benchmarks/output_branch_size_scan/config.yml
new file mode 100644
index 00000000..952e1480
--- /dev/null
+++ b/benchmarks/output_branch_size_scan/config.yml
@@ -0,0 +1,26 @@
+# CI jobs for the output branch size scan benchmark.
+# NOTE(review): script paths are relative to the job's working directory;
+# confirm that .det_benchmark runs from this benchmark's folder.
+sim:output_branch_size_scan:
+  stage: simulate
+  extends: .det_benchmark
+  script:
+    bash generate.sh
+
+bench:output_branch_size_scan:
+  stage: benchmarks
+  extends: .det_benchmark
+  needs:
+    - ["sim:output_branch_size_scan"]
+  script:
+    bash output_branch_size_scan.sh
+
+results:output_branch_size_scan:
+  stage: collect
+  extends: .det_benchmark
+  needs:
+    - ["bench:output_branch_size_scan"]
+  # A job without a script is rejected by GitLab CI, so emit a placeholder
+  # until there is something to collect.
+  script:
+    echo "No results to collect for output_branch_size_scan yet"
diff --git a/benchmarks/output_branch_size_scan/generate.sh b/benchmarks/output_branch_size_scan/generate.sh
new file mode 100644
index 00000000..6a7d4db8
--- /dev/null
+++ b/benchmarks/output_branch_size_scan/generate.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -Euo pipefail
+trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR
+IFS=$'\n\t'
+
+# Simulate and reconstruct a small DIS sample; the reconstructed file is the
+# input of the branch size scan.
+
+NUM_EVENTS=400
+INPUT_FILE=root://dtn-eic.jlab.org//work/eic2/EPIC/EVGEN/DIS/NC/18x275/minQ2=1/pythia8NCDIS_18x275_minQ2=1_beamEffects_xAngle=-0.025_hiDiv_1.hepmc3.tree.root
+
+DETECTOR_CONFIG=epic_craterlake
+EBEAM=18
+PBEAM=275
+
+# DETECTOR_PATH must come from the environment (set -u aborts if it is unset).
+npsim \
+  --runType batch \
+  --random.seed 1 \
+  --random.enableEventSeed \
+  --printLevel WARNING \
+  --skipNEvents 0 \
+  --numberOfEvents "${NUM_EVENTS}" \
+  --filter.tracker 'edep0' \
+  --hepmc3.useHepMC3 true \
+  --compactFile "${DETECTOR_PATH}/${DETECTOR_CONFIG}${EBEAM:+${PBEAM:+_${EBEAM}x${PBEAM}}}.xml" \
+  --inputFiles "${INPUT_FILE}" \
+  --outputFile current_campaign.edm4hep.root
+
+eicrecon \
+  -Ppodio:output_file="current_campaign.eicrecon.tree.edm4eic.root" \
+  -Pjana:warmup_timeout=0 -Pjana:timeout=0 \
+  -Pplugins=janadot \
+  "current_campaign.edm4hep.root"
diff --git a/benchmarks/output_branch_size_scan/output_branch_size_scan.sh b/benchmarks/output_branch_size_scan/output_branch_size_scan.sh
new file mode 100644
index 00000000..4c1f908c
--- /dev/null
+++ b/benchmarks/output_branch_size_scan/output_branch_size_scan.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Dump "SIZE NAME" (total branch size, branch name) for every branch of the
+# events tree in the current file and in a reference file, then compare them.
+
+# ROOT one-liner run against the attached file.
+DUMP_BRANCH_SIZES='for (auto b : *events->GetListOfLeaves()) { if (events->GetBranch(b->GetName()) == nullptr) continue; cout << events->GetBranch(b->GetName())->GetTotalSize() << " " << b->GetName() << endl; }'
+
+# Must match the podio:output_file written by generate.sh.
+CURRENT_FILE="current_campaign.eicrecon.tree.edm4eic.root"
+# NOTE(review): the default below is a directory, not a ROOT file; set
+# DEFAULT_FILE to a concrete reference file before relying on the comparison.
+DEFAULT_FILE="${DEFAULT_FILE:-root://dtn-eic.jlab.org//work/eic2/EPIC/RECO/}"
+
+root -q -b "${CURRENT_FILE}" -e "${DUMP_BRANCH_SIZES}" | sort -n > branch_size_current.txt
+root -q -b "${DEFAULT_FILE}" -e "${DUMP_BRANCH_SIZES}" | sort -n > branch_size_default.txt
+python "$(dirname "${BASH_SOURCE[0]}")/plot.py" -c branch_size_current.txt -d branch_size_default.txt
diff --git a/benchmarks/output_branch_size_scan/plot.py b/benchmarks/output_branch_size_scan/plot.py
new file mode 100644
index 00000000..ddfd0486
--- /dev/null
+++ b/benchmarks/output_branch_size_scan/plot.py
@@ -0,0 +1,100 @@
+"""Compare the sizes of output branches between two campaigns.
+
+Each input file holds one "SIZE NAME" record per line (size in bytes).
+The script plots both sets of sizes and writes a table of the per-branch
+size differences, largest magnitude first.
+"""
+import argparse
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+def parse_args():
+    """Return the parsed command-line arguments."""
+    parser = argparse.ArgumentParser(prog='Plot output branch sizes', description='Plot output branch sizes')
+    parser.add_argument("-c", dest="current_campaign_file", action="store", required=True, help="Enter the current campaign file")
+    parser.add_argument("-d", dest="default_file", action="store", required=True, help="Enter the default file")
+    return parser.parse_args()
+
+
+def resolve_input(name):
+    """Return the input path, also accepting a name given without '.txt'."""
+    path = Path(name)
+    if not path.exists() and Path(name + '.txt').exists():
+        return Path(name + '.txt')
+    return path
+
+
+def load_branch_sizes(path):
+    """Read "SIZE NAME" records into a DataFrame with columns name, size.
+
+    Lines that are not an integer followed by a single name are skipped, so
+    stray messages mixed into the listing do not break the parsing.
+    """
+    records = []
+    with open(path, encoding="utf-8") as stream:
+        for line in stream:
+            fields = line.split()
+            if len(fields) == 2 and fields[0].isdigit():
+                records.append((fields[1], int(fields[0])))
+    frame = pd.DataFrame(records, columns=['name', 'size'])
+    # A name listed twice would multiply rows in the merge; keep the first.
+    return frame.drop_duplicates('name').reset_index(drop=True)
+
+
+def plot_sizes(current, default, output):
+    """Scatter both sets of branch sizes on a log scale and save the figure."""
+    plt.figure(figsize=(10, 6))
+    plt.scatter(current['name'], current['size'], label='current')
+    plt.scatter(default['name'], default['size'], label='default')
+    plt.title("Branch Sizes (Bytes) vs Branch Names")
+    plt.xticks(rotation=90, fontsize=4)
+    plt.yscale('log')
+    plt.legend()
+    plt.tight_layout()
+    plt.savefig(output)
+
+
+def size_differences(current, default):
+    """Return per-branch size differences (current - default), largest first.
+
+    Only branches present in both inputs are compared.
+    """
+    merged = pd.merge(
+        current.rename(columns={'size': 'current'}),
+        default.rename(columns={'size': 'default'}),
+        on='name',
+    )
+    merged['Difference'] = merged['current'] - merged['default']
+    result = merged[['name', 'Difference']]
+    order = result['Difference'].abs().sort_values(ascending=False).index
+    return result.reindex(order)
+
+
+def main():
+    """Load both listings, plot them and report the size differences."""
+    args = parse_args()
+    current_path = resolve_input(args.current_campaign_file)
+    default_path = resolve_input(args.default_file)
+    # Name the outputs after the inputs without directory or extension.
+    campaign1 = current_path.stem
+    campaign2 = default_path.stem
+
+    df1 = load_branch_sizes(current_path)
+    df2 = load_branch_sizes(default_path)
+    print(df1)
+    print(df2)
+
+    plot_sizes(df1, df2, campaign1 + '_vs_' + campaign2 + '.png')
+
+    sorted_df = size_differences(df1, df2)
+    print(sorted_df)
+    # The 20 branches whose size changed the most.
+    print(sorted_df.head(20))
+    sorted_df.to_csv(f"{campaign1}_vs_{campaign2}_difference.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()