From 91114655c7da745a78485c858a135623cb42ab56 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 8 Aug 2025 13:07:04 +0200 Subject: [PATCH 01/15] [NFC] Remove trailing spaces in RInterface.hxx --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index bd662bc633802..ff48cb48c1df0 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -882,7 +882,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -988,7 +988,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1036,7 +1036,7 @@ public: /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1091,7 +1091,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1126,7 +1126,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec or RVecs containing the varied /// values for the specified columns. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time @@ -1163,7 +1163,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -2362,13 +2362,13 @@ public: /// auto myGAE2 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); /// ~~~ /// - /// `GraphAssymErrors` should also be used for the cases in which values associated only with - /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. - /// In such cases, user should do the following: + /// `GraphAssymErrors` should also be used for the cases in which values associated only with + /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. + /// In such cases, user should do the following: /// ~~~{.cpp} /// // Create a column of zeros in RDataFrame - /// auto rdf_withzeros = rdf.Define("zero", "0"); - /// // or alternatively: + /// auto rdf_withzeros = rdf.Define("zero", "0"); + /// // or alternatively: /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;}); /// // Create the graph with y errors only /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh"); From 4a5d8a6b28db6387831a46b4f357a7230ac98dd6 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 8 Aug 2025 13:07:44 +0200 Subject: [PATCH 02/15] [RDF][Docs] Move main snapshot documentation to non-template overload. Due to the deprecation marker, the snapshot documentation is parsed incorrectly in doxygen. Here, the main documentation is moved to the non-template overload, and the other overloads are referring to this one using the full argument list. "See above" typically doesn't work, since doxygen sorts the functions. --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 47 +++++++++++----------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index ff48cb48c1df0..56ae9c5f8da44 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1251,6 +1251,27 @@ public: /// \param[in] options RSnapshotOptions struct with extra options to pass to the output TFile and TTree/RNTuple. /// \return a `RDataFrame` that wraps the snapshotted dataset. /// + template + R__DEPRECATED( + 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + const ColumnNames_t &columnList, + const RSnapshotOptions &options = RSnapshotOptions()) + { + return Snapshot(treename, filename, columnList, options); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. + /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnList The list of names of the columns/branches/fields to be written. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. + /// \return a `RDataFrame` that wraps the snapshotted dataset. + /// + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. + /// /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.') /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot. /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also @@ -1306,28 +1327,6 @@ public: /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts); /// ~~~ - template - R__DEPRECATED( - 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - const ColumnNames_t &columnList, - const RSnapshotOptions &options = RSnapshotOptions()) - { - return Snapshot(treename, filename, columnList, options); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. - /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. - /// \param[in] columnList The list of names of the columns/branches/fields to be written. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. - /// - /// See above for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options = RSnapshotOptions()) @@ -1464,7 +1463,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp = "", const RSnapshotOptions &options = RSnapshotOptions()) @@ -1507,7 +1506,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::initializer_list columnList, const RSnapshotOptions &options = RSnapshotOptions()) From 20c5c852274c3137152dbf1b8a356c6fd279a088 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 13:19:14 +0200 Subject: [PATCH 03/15] [RDF] Share column readers for Defines not affected by variations. When a Define is added to a branch that gets varied, a new column reader was created for every variation and for every slot, irrespective of whether the column gets varied or not. For snapshot with variations, this column would be written for every variation, although it always evaluates to the same value. To prevent this duplication, the column readers that point to the same Define are shared after this commit. The readers are still cloned per slot though, so the Define's value caches remain thread safe. This allows SnapshotWithVariations to detect that columns are identical, so they are not written multiple times. The memory savings amount to 24 bytes per suppressed column reader,so they won't have a notable impact. --- tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx | 2 +- tree/dataframe/src/RDefineReader.cxx | 37 ++++++++++++------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx index 34192c0ad90ee..754d4cdfcb5fe 100644 --- a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx @@ -62,7 +62,7 @@ class RDefinesWithReaders { // (see BookDefineJit). it is never null. std::shared_ptr fDefine; // Column readers per variation (in the map) per slot (in the vector). - std::vector>> fReadersPerVariation; + std::vector>> fReadersPerVariation; // Strings that were already used to represent column names in this RDataFrame instance. ROOT::Internal::RDF::RStringCache &fCachedColNames; diff --git a/tree/dataframe/src/RDefineReader.cxx b/tree/dataframe/src/RDefineReader.cxx index e374219a41b2b..2126037e70248 100644 --- a/tree/dataframe/src/RDefineReader.cxx +++ b/tree/dataframe/src/RDefineReader.cxx @@ -32,19 +32,28 @@ ROOT::Internal::RDF::RDefinesWithReaders::GetReader(unsigned int slot, std::stri if (it != defineReaders.end()) return *it->second; + std::shared_ptr readerToReturn; auto *define = fDefine.get(); - if (*nameIt != "nominal") - define = &define->GetVariedDefine(std::string(variationName)); - -#if !defined(__clang__) && __GNUC__ >= 7 && __GNUC_MINOR__ >= 3 - const auto insertion = - defineReaders.insert({*nameIt, std::make_unique(slot, *define)}); - return *insertion.first->second; -#else - // gcc < 7.3 has issues with passing the non-movable std::pair temporary into the insert call - auto reader = std::make_unique(slot, *define); - auto &ret = *reader; - defineReaders[*nameIt] = std::move(reader); - return ret; -#endif + if (*nameIt == "nominal") { + readerToReturn = std::make_shared(slot, *define); + } else { + auto *variedDefine = &define->GetVariedDefine(std::string(variationName)); + if (variedDefine == define) { + // The column in not affected by variations. We can return the same reader as for nominal + if (auto nominalReaderIt = defineReaders.find("nominal"); nominalReaderIt != defineReaders.end()) { + readerToReturn = nominalReaderIt->second; + } else { + // The nominal reader doesn't exist yet + readerToReturn = std::make_shared(slot, *define); + auto nominalNameIt = fCachedColNames.Insert("nominal"); + defineReaders.insert({*nominalNameIt, readerToReturn}); + } + } else { + readerToReturn = std::make_shared(slot, *variedDefine); + } + } + + defineReaders.insert({*nameIt, readerToReturn}); + + return *readerToReturn; } From 453a817e21fea366f4de9edeb183b8c28d2c29f1 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 20 Aug 2025 11:02:51 +0200 Subject: [PATCH 04/15] [RDF] Keep RJittedDefine pointers invariant when columns don't change. In order to not write a column multiple times in a snapshot with variations, the RDefineBase pointer needs to stay invariant when a variation is generated. However, the RJittedDefine was returning the pointer to its member, so it looked like the column was affected by variations. Here, this is changed to returning "this" when no variations are in effect. --- tree/dataframe/src/RJittedDefine.cxx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/src/RJittedDefine.cxx b/tree/dataframe/src/RJittedDefine.cxx index 99c369a2fb847..0ead1a3b0ce1d 100644 --- a/tree/dataframe/src/RJittedDefine.cxx +++ b/tree/dataframe/src/RJittedDefine.cxx @@ -66,5 +66,10 @@ void RJittedDefine::MakeVariations(const std::vector &variations) RDefineBase &RJittedDefine::GetVariedDefine(const std::string &variationName) { assert(fConcreteDefine != nullptr); - return fConcreteDefine->GetVariedDefine(variationName); + + auto &variedDefine = fConcreteDefine->GetVariedDefine(variationName); + if (&variedDefine == fConcreteDefine.get()) + return *this; // Ensures that the pointer is the same across all variations + else + return variedDefine; } From 387c01b950d5dfac9329e0f1fe233aaa4fdd7adf Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Thu, 21 Aug 2025 15:54:44 +0200 Subject: [PATCH 05/15] [RDF] Forbid calling VariationsFor on snapshots. Given that snapshots with variations will be enabled via RSnapshotOptions, the snapshot overload for Vary can be removed, and a static_assert can be used to signal the failure. --- tree/dataframe/inc/ROOT/RDFHelpers.hxx | 7 ++++--- tree/dataframe/src/RDFHelpers.cxx | 15 +++------------ tree/dataframe/test/dataframe_vary.cxx | 17 ----------------- 3 files changed, 7 insertions(+), 32 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDFHelpers.hxx b/tree/dataframe/inc/ROOT/RDFHelpers.hxx index 27fbcef39a794..34ae4f129d769 100644 --- a/tree/dataframe/inc/ROOT/RDFHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDFHelpers.hxx @@ -222,6 +222,10 @@ namespace Experimental { template RResultMap VariationsFor(RResultPtr resPtr) { + using SnapshotResult_t = ROOT::RDF::RInterface; + static_assert(!std::is_same_v, + "Snapshot with variations only can be enabled via RSnapshotOptions."); + R__ASSERT(resPtr != nullptr && "Calling VariationsFor on an empty RResultPtr"); // populate parts of the computation graph for which we only have "empty shells", e.g. RJittedActions and @@ -270,9 +274,6 @@ RResultMap VariationsFor(RResultPtr resPtr) *resPtr.fLoopManager, std::move(nominalAction), std::move(variedAction)); } -using SnapshotPtr_t = ROOT::RDF::RResultPtr>; -SnapshotPtr_t VariationsFor(SnapshotPtr_t resPtr); - /// \brief Add ProgressBar to a ROOT::RDF::RNode /// \param[in] df RDataFrame node at which ProgressBar is called. /// diff --git a/tree/dataframe/src/RDFHelpers.cxx b/tree/dataframe/src/RDFHelpers.cxx index a4d8ad422c623..dc4c5d7515937 100644 --- a/tree/dataframe/src/RDFHelpers.cxx +++ b/tree/dataframe/src/RDFHelpers.cxx @@ -141,15 +141,7 @@ unsigned int ROOT::RDF::RunGraphs(std::vector handles) return uniqueLoops.size(); } -ROOT::RDF::Experimental::SnapshotPtr_t ROOT::RDF::Experimental::VariationsFor(ROOT::RDF::Experimental::SnapshotPtr_t) -{ - throw std::logic_error("Varying a Snapshot result is not implemented yet."); -} - -namespace ROOT { -namespace RDF { - -namespace Experimental { +namespace ROOT::RDF::Experimental { void ThreadsPerTH3(unsigned int N) { @@ -398,6 +390,5 @@ void AddProgressBar(ROOT::RDataFrame dataframe) auto node = ROOT::RDF::AsRNode(dataframe); ROOT::RDF::Experimental::AddProgressBar(node); } -} // namespace Experimental -} // namespace RDF -} // namespace ROOT + +} // namespace ROOT::RDF::Experimental diff --git a/tree/dataframe/test/dataframe_vary.cxx b/tree/dataframe/test/dataframe_vary.cxx index f881329c5a099..85e6cc74d2f90 100644 --- a/tree/dataframe/test/dataframe_vary.cxx +++ b/tree/dataframe/test/dataframe_vary.cxx @@ -1557,23 +1557,6 @@ TEST_P(RDFVary, VaryTake) EXPECT_EQ(sorted(rs["x:1"]), std::vector({1, 2, 3})); } -TEST_P(RDFVary, VarySnapshot) -{ - const auto fname = "dummy.root"; - auto h = ROOT::RDataFrame(10) - .Define("x", [](ULong64_t e) { return int(e); }, {"rdfentry_"}) - .Vary( - "x", [](int x) { return ROOT::RVecI{x - 1, x + 1}; }, {"x"}, 2) - .Snapshot("t", fname, {"x"}); - EXPECT_THROW( - try { VariationsFor(h); } catch (const std::logic_error &err) { - const auto msg = "Varying a Snapshot result is not implemented yet."; - EXPECT_STREQ(err.what(), msg); - throw; - }, - std::logic_error); -} - // this is a regression test, we used to read from wrong addresses in this case TEST_P(RDFVary, MoreVariedColumnsThanVariations) { From 5bd53a7962a2f0ab80cd697cc2061bdf44000ef6 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Tue, 5 Aug 2025 13:52:51 +0200 Subject: [PATCH 06/15] [RDF] Unify members of Snapshot helpers into structs. In order to implement snapshots with systematic variations, it is beneficial to place all members related to output branches into structs. At present, this only constitutes a move from member of SnapshotHelper to member of RBranchData struct. The TTree snapshot helpers are changed accordingly. --- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 48 ++- tree/dataframe/src/RDFSnapshotHelpers.cxx | 322 +++++++++++------- 2 files changed, 225 insertions(+), 145 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index e7ebca8041e95..6f938ded33f28 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -102,6 +102,34 @@ public: UntypedSnapshotRNTupleHelper MakeNew(void *newName); }; +/// Stores properties of each output branch in a Snapshot. +struct RBranchData { + std::string fInputBranchName; // This contains resolved aliases + std::string fOutputBranchName; + const std::type_info *fInputTypeID = nullptr; + TBranch *fOutputBranch = nullptr; + void *fBranchAddressForCArrays = nullptr; // Used to detect if branch addresses need to be updated + + std::unique_ptr> fEmptyInstance; + bool fIsCArray = false; + bool fIsDefine = false; + + RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, const std::type_info *typeID, + TBranch *outputBranch = nullptr) + : fInputBranchName{std::move(inputBranchName)}, + fOutputBranchName{std::move(outputBranchName)}, + fInputTypeID{typeID}, + fOutputBranch{outputBranch}, + fIsDefine(isDefine) + { + } + void ClearBranchPointers() + { + fOutputBranch = nullptr; + fBranchAddressForCArrays = nullptr; + } +}; + class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionImpl { std::string fFileName; std::string fDirName; @@ -110,17 +138,10 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionIm std::unique_ptr fOutputFile; std::unique_ptr fOutputTree; // must be a ptr because TTrees are not copy/move constructible bool fBranchAddressesNeedReset{true}; - ColumnNames_t fInputBranchNames; // This contains the resolved aliases - ColumnNames_t fOutputBranchNames; TTree *fInputTree = nullptr; // Current input tree. Set at initialization time (`InitTask`) - // TODO we might be able to unify fBranches, fBranchAddresses and fOutputBranches - std::vector fBranches; // Addresses of branches in output, non-null only for the ones holding C arrays - std::vector fBranchAddresses; // Addresses of objects associated to output branches - RBranchSet fOutputBranches; - std::vector fIsDefine; + std::vector fBranchData; // Information for all output branches ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputColumnTypeIDs; // Types for the input columns public: UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, @@ -169,11 +190,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::vector> fOutputTrees; std::vector fBranchAddressesNeedReset; // vector does not allow concurrent writing of different elements std::vector fInputTrees; // Current input trees, one per slot. Set at initialization time (`InitTask`) - // Addresses of branches in output per slot, non-null only for the ones holding C arrays - std::vector> fBranches; - // Addresses of objects associated to output branches per slot, non-null only for the ones holding C arrays - std::vector> fBranchAddresses; - std::vector fOutputBranches; // Unique set of output branches, one per slot. + std::vector> fBranchData; // Information for all output branches of each slot // Attributes of the output TTree @@ -182,16 +199,11 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::string fTreeName; TFile *fOutputFile; // Non-owning view on the output file RSnapshotOptions fOptions; - std::vector fOutputBranchNames; // Attributes related to the computation graph ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputBranchNames; // This contains the resolved aliases - std::vector fInputColumnTypeIDs; // Types for the input columns - - std::vector fIsDefine; public: UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 94c254f4d7078..fc2bf7eb7dab2 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -34,8 +34,45 @@ #include #include +#include +#include +#include + +using ROOT::Internal::RDF::RBranchData; + namespace { +void AssertNoNullBranchAddresses(std::vector const &branches) +{ + std::vector branchesWithNullAddress; + for (auto const &branchData : branches) { + if (branchData.fOutputBranch->GetAddress() == nullptr) + branchesWithNullAddress.push_back(branchData.fOutputBranch); + } + + if (branchesWithNullAddress.empty()) + return; + + // otherwise build error message and throw + std::vector missingBranchNames; + std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), + std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); + std::string msg = "RDataFrame::Snapshot:"; + if (missingBranchNames.size() == 1) { + msg += " branch " + missingBranchNames[0] + + " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " + "it is"; + } else { + msg += " branches "; + for (const auto &bName : missingBranchNames) + msg += bName + ", "; + msg.resize(msg.size() - 2); // remove last ", " + msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; + } + msg += " not part of the set of branches that are being written out."; + throw std::runtime_error(msg); +} + TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) { if (inputTree) { @@ -49,35 +86,51 @@ TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) return nullptr; } -void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, TBranch *inputBranch, - const std::string &outputBranchName, int basketSize, void *address) +std::vector::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector &outputBranches, + std::vector::iterator thisBranch, + TBranch *inputBranch, int basketSize, void *address) { if (!inputBranch) - return; + return thisBranch; const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName()); if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec) - return; + return thisBranch; // must construct the leaflist for the output branch and create the branch in the output tree const auto *leaf = static_cast(inputBranch->GetListOfLeaves()->UncheckedAt(0)); if (!leaf) - return; + return thisBranch; const auto bname = leaf->GetName(); auto *sizeLeaf = leaf->GetLeafCount(); const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic()); // We proceed only if branch is a fixed-or-variable-sized array if (sizeLeaf || leaf->GetLenStatic() > 1) { - if (sizeLeaf && !outputBranches.Get(sizeLeafName)) { - // The output array branch `bname` has dynamic size stored in leaf `sizeLeafName`, but that leaf has not been - // added to the output tree yet. However, the size leaf has to be available for the creation of the array - // branch to be successful. So we create the size leaf here. - const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); - // Use Original basket size for Existing Branches otherwise use Custom basket Size. - const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); - // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for `sizeLeafName` - auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), - (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); - outputBranches.Insert(sizeLeafName, outputBranch); + if (sizeLeaf) { + // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's + // in the output tree. + auto sizeLeafIt = + std::find_if(outputBranches.begin(), outputBranches.end(), + [&sizeLeafName](RBranchData const &bd) { return bd.fOutputBranchName == sizeLeafName; }); + if (sizeLeafIt == outputBranches.end()) { + // The size leaf is not part of the output branches yet, so emplace an empty slot for it. + // This means that iterators need to be updated in case the container reallocates. + const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch); + outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr, + /*outputBranch=*/nullptr); + thisBranch = outputBranches.begin() + indexBeforeEmplace; + sizeLeafIt = outputBranches.end() - 1; + } + if (!sizeLeafIt->fOutputBranch) { + // The size leaf was emplaced, but not initialised yet + const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); + // Use Original basket size for Existing Branches otherwise use Custom basket Size. + const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); + // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for + // `sizeLeafName` + auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), + (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); + sizeLeafIt->fOutputBranch = outputBranch; + } } const auto btype = leaf->GetTypeName(); @@ -87,7 +140,7 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The " "leaf is of type '%s'. This column will not be written out.", bname, btype); - return; + return thisBranch; } const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype; @@ -102,23 +155,26 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet } return nullptr; }(); - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); - outputBranch->SetTitle(inputBranch->GetTitle()); - outputBranches.Insert(outputBranchName, outputBranch, true); + thisBranch->fOutputBranch = + outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); + thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle()); + thisBranch->fIsCArray = true; } + + return thisBranch; } -void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&outputBranchAddress, bool isCArray, - void *valueAddress) +void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress) { const static TClassRef TBOClRef("TBranchObject"); if (inputBranch && inputBranch->IsA() == TBOClRef) { - outputBranch.SetAddress(reinterpret_cast(inputBranch->GetAddress())); - } else if (outputBranch.IsA() != TBranch::Class()) { - outputBranchAddress = valueAddress; - outputBranch.SetAddress(&outputBranchAddress); + branchData.fOutputBranch->SetAddress(reinterpret_cast(inputBranch->GetAddress())); + } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) { + // This is a relatively rare case of a fixed-size array getting redefined + branchData.fBranchAddressForCArrays = valueAddress; + branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays); } else { - void *correctAddress = [valueAddress, isCArray]() -> void * { + void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * { if (isCArray) { // Address here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer @@ -127,29 +183,26 @@ void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&output } return valueAddress; }(); - outputBranch.SetAddress(correctAddress); - outputBranchAddress = valueAddress; + branchData.fOutputBranch->SetAddress(correctAddress); + branchData.fBranchAddressForCArrays = valueAddress; } } -void CreateFundamentalTypeBranch(TTree &outputTree, const std::string &outputBranchName, void *valueAddress, - const std::type_info &valueTypeID, ROOT::Internal::RDF::RBranchSet &outputBranches, - int bufSize) +void CreateFundamentalTypeBranch(TTree &outputTree, RBranchData &bd, void *valueAddress, int bufSize) { // Logic taken from // TTree::BranchImpRef( // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel) - auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(valueTypeID); + auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(*bd.fInputTypeID); if (rootTypeChar == ' ') { Warning("Snapshot", "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This " "column will not be written out.", - outputBranchName.c_str()); + bd.fOutputBranchName.c_str()); return; } - std::string leafList{outputBranchName + '/' + rootTypeChar}; - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar}; + bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); } /// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the @@ -238,13 +291,18 @@ void EnsureValidSnapshotRNTupleOutput(const ROOT::RDF::RSnapshotOptions &opts, c } } -void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, - int basketSize, const std::string &inputBranchName, const std::string &outputBranchName, - const std::type_info &valueTypeID, void *valueAddress, TBranch *&actionHelperBranchPtr, - void *&actionHelperBranchPtrAddress, bool isDefine) +void SetBranchesHelper(TTree *inputTree, TTree &outputTree, + std::vector &allBranchData, std::size_t currentIndex, + int basketSize, void *valueAddress) { + auto branchData = allBranchData.begin() + currentIndex; + auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName); - auto *inputBranch = isDefine ? nullptr : SearchForBranch(inputTree, inputBranchName); + if (branchData->fOutputBranch && valueAddress) { + // The output branch was already created, we just need to (re)set its address + SetBranchAddress(inputBranch, *branchData, valueAddress); + return; + } // Respect the original bufsize and splitlevel arguments // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid @@ -254,55 +312,47 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF: const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000); const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99; - if (auto *outputBranch = outputBranches.Get(outputBranchName); outputBranch && valueAddress) { - // The output branch was already created, we just need to (re)set its address - SetBranchAddress(inputBranch, *outputBranch, actionHelperBranchPtrAddress, - outputBranches.IsCArray(outputBranchName), valueAddress); - return; - } - - auto *dictionary = TDictionary::GetDictionary(valueTypeID); + auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID); if (dynamic_cast(dictionary)) { // Branch of fundamental type - CreateFundamentalTypeBranch(outputTree, outputBranchName, valueAddress, valueTypeID, outputBranches, bufSize); + CreateFundamentalTypeBranch(outputTree, *branchData, valueAddress, bufSize); return; } - if (!isDefine) { + if (!branchData->fIsDefine) { // Cases where we need a leaflist (e.g. C-style arrays) // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec). - CreateCStyleArrayBranch(outputTree, outputBranches, inputBranch, outputBranchName, bufSize, valueAddress); + branchData = CreateCStyleArrayBranch(outputTree, allBranchData, branchData, inputBranch, bufSize, valueAddress); } - if (auto *arrayBranch = outputBranches.Get(outputBranchName)) { + if (branchData->fOutputBranch) { // A branch was created in the previous function call - actionHelperBranchPtr = arrayBranch; if (valueAddress) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(valueAddress); - actionHelperBranchPtrAddress = rawRVec->data(); + branchData->fBranchAddressForCArrays = rawRVec->data(); } return; } if (auto *classPtr = dynamic_cast(dictionary)) { - TBranch *outputBranch{}; // Case of unsplit object with polymorphic type if (inputBranch && dynamic_cast(inputBranch) && valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImp(outputTree, outputBranchName.c_str(), classPtr, - inputBranch->GetAddress(), bufSize, splitLevel); + branchData->fOutputBranch = + ROOT::Internal::TreeUtils::CallBranchImp(outputTree, branchData->fOutputBranchName.c_str(), classPtr, + inputBranch->GetAddress(), bufSize, splitLevel); // General case, with valid address else if (valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef(outputTree, outputBranchName.c_str(), classPtr, - TDataType::GetType(valueTypeID), valueAddress, - bufSize, splitLevel); + branchData->fOutputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef( + outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID), + valueAddress, bufSize, splitLevel); // No value was passed, we're just creating a hollow branch to populate the dataset schema else - outputBranch = outputTree.Branch(outputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + branchData->fOutputBranch = + outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); return; } @@ -388,16 +438,16 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( fDirName(dirname), fTreeName(treename), fOptions(options), - fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), - fBranches(vbnames.size(), nullptr), - fBranchAddresses(vbnames.size(), nullptr), - fIsDefine(std::move(isDefine)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputColumnTypeIDs(colTypeIDs) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]); + } } // Define special member methods here where the definition of all the data member types is available @@ -449,17 +499,16 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches.size()); + assert(values.size() <= fBranchData.size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[i] && fOutputBranches.IsCArray(fOutputBranchNames[i])) { + if (fBranchData[i].fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[i] != data) { - // reset the branch address - fBranches[i]->SetAddress(data); - fBranchAddresses[i] = data; + if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) { + fBranchData[i].fOutputBranch->SetAddress(data); + fBranchData[i].fBranchAddressForCArrays = data; } } } @@ -468,26 +517,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetBranches(const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTree, *fOutputTree, fOutputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], fBranches[i], fBranchAddresses[i], - fIsDefine[i]); + assert(fBranchData.size() == values.size()); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches.AssertNoNullBranchAddresses(); + AssertNoNullBranchAddresses(fBranchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -551,16 +592,29 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelper{finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + inputColumnTypeIDs}; } ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( @@ -573,21 +627,25 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( fOutputTrees(fNSlots), fBranchAddressesNeedReset(fNSlots, 1), fInputTrees(fNSlots), - fBranches(fNSlots, std::vector(vbnames.size(), nullptr)), - fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), - fOutputBranches(fNSlots), fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputBranchNames(vbnames), - fInputColumnTypeIDs(colTypeIDs), - fIsDefine(std::move(isDefine)) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(fNSlots); + for (unsigned int slot = 0; slot < fNSlots; ++slot) { + fBranchData.emplace_back(); + auto &thisSlot = fBranchData.back(); + thisSlot.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]); + } + } } // Define special member methods here where the definition of all the data member types is available @@ -647,9 +705,10 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::FinalizeTask(unsigned in { if (fOutputTrees[slot]->GetEntries() > 0) fOutputFiles[slot]->Write(); + for (auto &branchData : fBranchData[slot]) + branchData.ClearBranchPointers(); // Pointers might go to an old tree, so they are stale now // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones) fOutputTrees[slot].reset(nullptr); - fOutputBranches[slot].Clear(); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector &values) @@ -674,17 +733,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UpdateCArraysPtrs(unsign // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches[slot].size()); + assert(values.size() <= fBranchData[slot].size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[slot][i] && fOutputBranches[slot].IsCArray(fOutputBranchNames[i])) { + auto &branchData = fBranchData[slot][i]; + if (branchData.fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[slot][i] != data) { + if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) { // reset the branch address - fBranches[slot][i]->SetAddress(data); - fBranchAddresses[slot][i] = data; + branchData.fOutputBranch->SetAddress(data); + branchData.fBranchAddressForCArrays = data; } } } @@ -694,26 +754,21 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetBranches(unsigned int const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], fOutputBranches[slot], fOptions.fBasketSize, - fInputBranchNames[i], fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], - fBranches[slot][i], fBranchAddresses[slot][i], fIsDefine[i]); + auto &branchData = fBranchData[slot]; + assert(branchData.size() == values.size()); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches[slot].AssertNoNullBranchAddresses(); + + AssertNoNullBranchAddresses(branchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + auto &branchData = fBranchData.front(); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, branchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -785,17 +840,30 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData.front()) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT{fNSlots, finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + std::move(inputColumnTypeIDs)}; } ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( From aed8ece889dc720065d8f630a306cb6041df411d Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 6 Aug 2025 11:53:09 +0200 Subject: [PATCH 07/15] [RDF] Remove RBranchSet from SnapshotHelpers. It is superseded by RBranchData. --- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 13 ---- tree/dataframe/src/RDFSnapshotHelpers.cxx | 67 ------------------- 2 files changed, 80 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 6f938ded33f28..69ad0634af14e 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -38,19 +38,6 @@ class TBufferMergerFile; namespace ROOT::Internal::RDF { -class RBranchSet { - std::vector fBranches; - std::vector fNames; - std::vector fIsCArray; - -public: - TBranch *Get(const std::string &name) const; - bool IsCArray(const std::string &name) const; - void Insert(const std::string &name, TBranch *address, bool isCArray = false); - void Clear(); - void AssertNoNullBranchAddresses(); -}; - class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RActionImpl { std::string fFileName; std::string fDirName; diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index fc2bf7eb7dab2..c874f8e9c3c5d 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -362,73 +362,6 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, } } // namespace -TBranch *ROOT::Internal::RDF::RBranchSet::Get(const std::string &name) const -{ - auto it = std::find(fNames.begin(), fNames.end(), name); - if (it == fNames.end()) - return nullptr; - return fBranches[std::distance(fNames.begin(), it)]; -} - -bool ROOT::Internal::RDF::RBranchSet::IsCArray(const std::string &name) const -{ - if (auto it = std::find(fNames.begin(), fNames.end(), name); it != fNames.end()) - return fIsCArray[std::distance(fNames.begin(), it)]; - return false; -} - -void ROOT::Internal::RDF::RBranchSet::Insert(const std::string &name, TBranch *address, bool isCArray) -{ - if (address == nullptr) { - throw std::logic_error("Trying to insert a null branch address."); - } - if (std::find(fBranches.begin(), fBranches.end(), address) != fBranches.end()) { - throw std::logic_error("Trying to insert a branch address that's already present."); - } - if (std::find(fNames.begin(), fNames.end(), name) != fNames.end()) { - throw std::logic_error("Trying to insert a branch name that's already present."); - } - fNames.emplace_back(name); - fBranches.emplace_back(address); - fIsCArray.push_back(isCArray); -} - -void ROOT::Internal::RDF::RBranchSet::Clear() -{ - fBranches.clear(); - fNames.clear(); - fIsCArray.clear(); -} - -void ROOT::Internal::RDF::RBranchSet::AssertNoNullBranchAddresses() -{ - std::vector branchesWithNullAddress; - std::copy_if(fBranches.begin(), fBranches.end(), std::back_inserter(branchesWithNullAddress), - [](TBranch *b) { return b->GetAddress() == nullptr; }); - - if (branchesWithNullAddress.empty()) - return; - - // otherwise build error message and throw - std::vector missingBranchNames; - std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), - std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); - std::string msg = "RDataFrame::Snapshot:"; - if (missingBranchNames.size() == 1) { - msg += " branch " + missingBranchNames[0] + - " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " - "it is"; - } else { - msg += " branches "; - for (const auto &bName : missingBranchNames) - msg += bName + ", "; - msg.resize(msg.size() - 2); // remove last ", " - msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; - } - msg += " not part of the set of branches that are being written out."; - throw std::runtime_error(msg); -} - ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine, From 12127fcb9b3f8a1bef970ca5e20efa90e2b14706 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Mon, 21 Jul 2025 15:25:15 +0200 Subject: [PATCH 08/15] [roottest] Allow a single ninja compilation to run in parallel with tests. In roottest, some targets are added to the Ninja build graph. These are compiled as part of the test suite, but this would create race conditions when many builds are started in parallel. Therefore, all these builds were flagged with RUN_SERIAL, significantly reducing the parallelism of roottest. Now, the tests that compile have "RESOURCE_LOCK NINJA_COMPILATION", so only a single test that compiles exectables can run, but it can run in parallel with tests that don't compile. --- cmake/modules/RootMacros.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/modules/RootMacros.cmake b/cmake/modules/RootMacros.cmake index 1824c07d18f63..29f197c8b8c51 100644 --- a/cmake/modules/RootMacros.cmake +++ b/cmake/modules/RootMacros.cmake @@ -2369,7 +2369,7 @@ macro(ROOTTEST_COMPILE_MACRO filename) endif() set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY @@ -2475,7 +2475,7 @@ macro(ROOTTEST_GENERATE_DICTIONARY dictname) set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) @@ -2592,7 +2592,7 @@ macro(ROOTTEST_GENERATE_REFLEX_DICTIONARY dictionary) set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) @@ -2714,7 +2714,7 @@ macro(ROOTTEST_GENERATE_EXECUTABLE executable) endif() if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_EXECUTABLE_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_EXECUTABLE_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if(MSVC AND NOT CMAKE_GENERATOR MATCHES Ninja) From 2a3ffe4906632db33090c93ac5c0573852d72dab Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Thu, 7 Aug 2025 09:20:06 +0200 Subject: [PATCH 09/15] [CMake] Improve parallelism of RDF 10[0-9] tutorials. In addition to correctly specifying the number of CPUs in CMake, the tutorials were also setting a resource lock, which prevented them from running in parallel with other RDF tutorials. Since ROOT honours ROOT_MAX_THREADS, this resource lock can be removed. --- tutorials/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tutorials/CMakeLists.txt b/tutorials/CMakeLists.txt index 600533d283035..f2f6ab3f5a436 100644 --- a/tutorials/CMakeLists.txt +++ b/tutorials/CMakeLists.txt @@ -1030,10 +1030,8 @@ if(ROOT_pyroot_FOUND) ${py_will_fail}) if(${t} IN_LIST multithreaded) - # Makes sure that this doesn't run in parallel with other multithreaded tutorials, and that cmake doesn't start too - # many other tests. That we use 4 processors is actually a lie, because IMT takes whatever it finds. - # However, even this poor indication of MT behaviour is a good hint for cmake to reduce congestion. - set_tests_properties(${tutorial_name} PROPERTIES RESOURCE_LOCK multithreaded PROCESSORS ${NProcessors}) + # Makes sure that this doesn't run in parallel with too many other multithreaded tutorials. + set_tests_properties(${tutorial_name} PROPERTIES PROCESSORS ${NProcessors}) endif() if(${t} IN_LIST distrdf_spark_tutorials) From 00d6747672c91822c8dd91a9f92f0910f0ac6024 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 14:48:10 +0200 Subject: [PATCH 10/15] [RDF] Prepare RActionSnapshot for variations. Convert the fPreviousNode pointer into a vector of pointers. For now, it will only have a single entry, but this will allow for extending RActionSnapshot to systematic variations later. --- .../inc/ROOT/RDF/RActionSnapshot.hxx | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx index 86cfd9243d741..19d176cb75a66 100644 --- a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx @@ -36,7 +36,7 @@ class R__CLING_PTRCHECK(off) RActionSnapshot final : public RActionBase { Helper fHelper; /// Pointer to the previous node in this branch of the computation graph - std::shared_ptr fPrevNode; + std::vector> fPrevNodes; /// Column readers per slot and per input column std::vector> fValues; @@ -54,8 +54,8 @@ public: const std::vector &colTypeIDs, std::shared_ptr pd, const RColumnRegister &colRegister) : RActionBase(pd->GetLoopManagerUnchecked(), columns, colRegister, pd->GetVariations()), - fHelper(std::forward(h)), - fPrevNode(std::move(pd)), + fHelper(std::move(h)), + fPrevNodes{std::move(pd)}, fValues(GetNSlots()), fColTypeIDs(colTypeIDs) { @@ -94,6 +94,8 @@ public: void *GetValue(unsigned int slot, std::size_t readerIdx, Long64_t entry) { + assert(slot < fValues.size()); + assert(readerIdx < fValues[slot].size()); if (auto *val = fValues[slot][readerIdx]->template TryGet(entry)) return val; @@ -118,11 +120,15 @@ public: void Run(unsigned int slot, Long64_t entry) final { // check if entry passes all filters - if (fPrevNode->CheckFilters(slot, entry)) + if (fPrevNodes.front()->CheckFilters(slot, entry)) CallExec(slot, entry); } - void TriggerChildrenCount() final { fPrevNode->IncrChildrenCount(); } + void TriggerChildrenCount() final + { + for (auto const &node : fPrevNodes) + node->IncrChildrenCount(); + } /// Clean-up operations to be performed at the end of a task. void FinalizeSlot(unsigned int slot) final @@ -142,18 +148,19 @@ public: std::shared_ptr GetGraph(std::unordered_map> &visitedMap) final { - auto prevNode = fPrevNode->GetGraph(visitedMap); - const auto &prevColumns = prevNode->GetDefinedColumns(); - // Action nodes do not need to go through CreateFilterNode: they are never common nodes between multiple branches const auto nodeType = HasRun() ? GraphDrawing::ENodeType::kUsedAction : GraphDrawing::ENodeType::kAction; auto thisNode = std::make_shared(fHelper.GetActionName(), visitedMap.size(), nodeType); visitedMap[(void *)this] = thisNode; - auto upmostNode = AddDefinesToGraph(thisNode, GetColRegister(), prevColumns, visitedMap); + for (auto const &node : fPrevNodes) { + auto prevNode = node->GetGraph(visitedMap); + const auto &prevColumns = prevNode->GetDefinedColumns(); + auto upmostNode = AddDefinesToGraph(thisNode, GetColRegister(), prevColumns, visitedMap); - thisNode->AddDefinedColumns(GetColRegister().GenerateColumnNames()); - upmostNode->SetPrevNode(prevNode); + thisNode->AddDefinedColumns(GetColRegister().GenerateColumnNames()); + upmostNode->SetPrevNode(prevNode); + } return thisNode; } @@ -175,8 +182,8 @@ public: */ std::unique_ptr CloneAction(void *newResult) final { - return std::make_unique(fHelper.CallMakeNew(newResult), GetColumnNames(), fColTypeIDs, fPrevNode, - GetColRegister()); + return std::make_unique(fHelper.CallMakeNew(newResult), GetColumnNames(), fColTypeIDs, + fPrevNodes.front(), GetColRegister()); } }; From f381609b71fc60715bd72b9e297b160072394c9b Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 22 Aug 2025 11:50:12 +0200 Subject: [PATCH 11/15] [RDF] Add SnapshotHelperWithVariations. Add a new action helper that allows for storing variations of branches alongside the classic snapshot functionality. Branches are only stored if their column readers are different from the nominal column readers. SnapshotWithVariations writes the same types as the input tree. When a branch with a systematic uncertainty doesn't pass a filter, a default-constructed value will be written. Reading this branch, however, might yield invalid values. Therefore, an additional branch with a bitmask is written, which can be used to figure out if the values in a specific column are valid. Later on, a column reader will be provided which will allow for masking the branches whose values are invalid. --- core/clingutils/src/unordered_mapLinkdef.h | 4 + .../inc/ROOT/RDF/SnapshotHelpers.hxx | 72 ++++- tree/dataframe/src/RDFSnapshotHelpers.cxx | 289 +++++++++++++++++- 3 files changed, 357 insertions(+), 8 deletions(-) diff --git a/core/clingutils/src/unordered_mapLinkdef.h b/core/clingutils/src/unordered_mapLinkdef.h index 80339b792b137..86f326a1c1aef 100644 --- a/core/clingutils/src/unordered_mapLinkdef.h +++ b/core/clingutils/src/unordered_mapLinkdef.h @@ -1,3 +1,4 @@ +// clang-format off #include #include @@ -28,3 +29,6 @@ #pragma create TClass unordered_map; #pragma create TClass unordered_map; #pragma create TClass unordered_map; + +// For snapshot with systematic variations in RDF: +#pragma create TClass unordered_map>; diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 69ad0634af14e..1b82353e70eec 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -26,6 +26,8 @@ #include #include +#include + class TBranch; class TFile; @@ -97,24 +99,35 @@ struct RBranchData { TBranch *fOutputBranch = nullptr; void *fBranchAddressForCArrays = nullptr; // Used to detect if branch addresses need to be updated - std::unique_ptr> fEmptyInstance; + int fVariationIndex = -1; // For branches that are only valid if a specific filter passed + void *fEmptyInstance = nullptr; + alignas(8) std::array fNullBytes{ + std::byte{0}}; // Nullbytes for writing 0 into branches of primitive types + DesFunc_t fEmptyInstanceDeleter = nullptr; bool fIsCArray = false; bool fIsDefine = false; - RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, const std::type_info *typeID, - TBranch *outputBranch = nullptr) + RBranchData() = default; + RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, const std::type_info *typeID) : fInputBranchName{std::move(inputBranchName)}, fOutputBranchName{std::move(outputBranchName)}, fInputTypeID{typeID}, - fOutputBranch{outputBranch}, - fIsDefine(isDefine) + fIsDefine{isDefine} + { + } + ~RBranchData() { + if (fEmptyInstanceDeleter) + fEmptyInstanceDeleter(fEmptyInstance); } + void ClearBranchPointers() { fOutputBranch = nullptr; fBranchAddressForCArrays = nullptr; } + void *EmptyInstance(); + void ResetBranchAddressToEmtpyInstance(); }; class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionImpl { @@ -231,6 +244,55 @@ public: UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view /*variation*/ = "nominal"); }; +struct SnapshotOutputWriter; + +/// Helper object for a single-thread Snapshot action +class R__CLING_PTRCHECK(off) SnapshotHelperWithVariations + : public ROOT::Detail::RDF::RActionImpl { + RSnapshotOptions fOptions; + std::shared_ptr fOutputHandle; + TTree *fInputTree = nullptr; // Current input tree. Set at initialization time (`InitTask`) + std::vector fBranchData; + ROOT::Detail::RDF::RLoopManager *fInputLoopManager = nullptr; + ROOT::Detail::RDF::RLoopManager *fOutputLoopManager = nullptr; + + void ClearOutputBranches(); + +public: + SnapshotHelperWithVariations(std::string_view filename, std::string_view dirname, std::string_view treename, + const ColumnNames_t & /*vbnames*/, const ColumnNames_t &bnames, + const RSnapshotOptions &options, std::vector && /*isDefine*/, + ROOT::Detail::RDF::RLoopManager *outputLoopMgr, + ROOT::Detail::RDF::RLoopManager *inputLoopMgr, + const std::vector &colTypeIDs); + + SnapshotHelperWithVariations(SnapshotHelperWithVariations &&) = default; + ~SnapshotHelperWithVariations(); + + void RegisterVariedColumn(unsigned int slot, unsigned int columnIndex, unsigned int originalColumnIndex, + unsigned int varationIndex, std::string const &variationName); + + void InitTask(TTreeReader *, unsigned int slot); + + void Exec(unsigned int /*slot*/, const std::vector &values, std::vector const &filterPassed); + + void FlushEvent(unsigned int slot); + + void ResetBranchAddresses(unsigned int slot); + + void Initialize() {} + + void Finalize(); + + std::string GetActionName() { return "SnapshotWithVariations"; } + + ROOT::RDF::SampleCallback_t GetSampleCallback() final + { + // TODO: Needed? + return [this](unsigned int, const RSampleInfo &) mutable { ; }; + } +}; + } // namespace ROOT::Internal::RDF #endif diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index c874f8e9c3c5d..401f63ee2f2cf 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -115,8 +115,7 @@ std::vector::iterator CreateCStyleArrayBranch(TTree &outputTree, st // The size leaf is not part of the output branches yet, so emplace an empty slot for it. // This means that iterators need to be updated in case the container reallocates. const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch); - outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr, - /*outputBranch=*/nullptr); + outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr); thisBranch = outputBranches.begin() + indexBeforeEmplace; sizeLeafIt = outputBranches.end() - 1; } @@ -362,6 +361,33 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, } } // namespace +void *ROOT::Internal::RDF::RBranchData::EmptyInstance() +{ + if (!fEmptyInstance) { + auto *dictionary = TDictionary::GetDictionary(*fInputTypeID); + if (auto dataType = dynamic_cast(dictionary); dataType) { + assert(dataType->Size() <= 8); + fEmptyInstance = fNullBytes.data(); + } else { + assert(dynamic_cast(dictionary) != nullptr); + auto tclass = static_cast(dictionary); + fEmptyInstance = tclass->New(); + fEmptyInstanceDeleter = tclass->GetDestructor(); + } + } + return fEmptyInstance; +} + +/// Point the branch address to an empty instance of the type represented by this branch. +/// This is used in case of variations, when certain defines/actions don't execute. We +/// nevertheless need to write something, so we point the branch to an empty instance. +void ROOT::Internal::RDF::RBranchData::ResetBranchAddressToEmtpyInstance() +{ + if (!fOutputBranch) + return; + fOutputBranch->SetAddress(EmptyInstance()); +} + ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine, @@ -639,7 +665,7 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::FinalizeTask(unsigned in if (fOutputTrees[slot]->GetEntries() > 0) fOutputFiles[slot]->Write(); for (auto &branchData : fBranchData[slot]) - branchData.ClearBranchPointers(); // Pointers might go to an old tree, so they are stale now + branchData.ClearBranchPointers(); // The branch pointers will go stale below // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones) fOutputTrees[slot].reset(nullptr); } @@ -905,3 +931,260 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::MakeNew(void *newName) fInputLoopManager, fOutputLoopManager, std::vector(fIsDefine), fInputColumnTypeIDs}; } + +/* + * ------------------------------------ + * Snapshot with systematic variations + * ------------------------------------ + */ +namespace ROOT::Internal::RDF { +/// An object to store an output file and a tree in one common place to share them between instances +/// of Snapshot with systematic uncertainties. +struct SnapshotOutputWriter { + std::unique_ptr fFile; + std::unique_ptr fTree; + std::string fDirectoryName; + RLoopManager *fOutputLoopManager; + + // Bitmasks to indicate whether syst. uncertainties have been computed. Bound to TBranches, so need to be stable in + // memory. + struct Bitmask { + std::string branchName; + std::bitset<64> bitset{}; + std::unique_ptr branchBuffer{new uint64_t{}}; + }; + std::vector fBitMasks; + + std::unordered_map fBranchToVariationMapping; + // For the dictionary, see core/clingutils/src + std::unordered_map> fBranchToBitmaskMapping; + unsigned int fNBits = 0; + + SnapshotOutputWriter(TFile *file, TTree *tree) : fFile{file}, fTree{tree} {} + ~SnapshotOutputWriter() + { + if (!fBranchToBitmaskMapping.empty()) { + fFile->WriteObject(&fBranchToBitmaskMapping, + (std::string{"R_rdf_branchToBitmaskMapping_"} + fTree->GetName()).c_str()); + } + if (fTree) { + // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory + fTree->AutoSave("flushbaskets"); + + // Now connect the data source to the loop manager so it can be used for further processing + std::string tree = fTree->GetName(); + if (!fDirectoryName.empty()) + tree = fDirectoryName + '/' + tree; + std::string file = fFile->GetName(); + + fTree.reset(); + fFile.reset(); + + if (fOutputLoopManager) + fOutputLoopManager->SetDataSource(std::make_unique(tree, file)); + } + } + + /// Register a branch and corresponding systematic uncertainty. The index returned is the global index of this + /// systematic. + void RegisterBranch(std::string const &branchName, unsigned int variationIndex) + { + if (auto it = fBranchToVariationMapping.find(branchName); it != fBranchToVariationMapping.end()) { + if (variationIndex != it->second) { + throw std::logic_error("Branch " + branchName + " is being registered with different variation index."); + } + return; + } + + // Neither branch nor systematic are known, so a new entry needs to be created + fNBits = std::max(fNBits, variationIndex); + const auto vectorIndex = variationIndex / 64u; + const auto bitIndex = variationIndex % 64u; + + // Create bitmask branches as long as necessary to capture the bit + while (vectorIndex >= fBitMasks.size()) { + std::string bitmaskBranchName = + std::string{"R_rdf_mask_"} + fTree->GetName() + '_' + std::to_string(fBitMasks.size()); + fBitMasks.push_back(Bitmask{bitmaskBranchName}); + fTree->Branch(bitmaskBranchName.c_str(), fBitMasks.back().branchBuffer.get()); + } + + fBranchToVariationMapping[branchName] = variationIndex; + fBranchToBitmaskMapping[branchName] = std::make_pair(fBitMasks[vectorIndex].branchName, bitIndex); + } + + void ClearMaskBits() + { + for (auto &mask : fBitMasks) + mask.bitset.reset(); + } + + void SetMaskBit(unsigned int index) + { + const auto vectorIndex = index / 64; + const auto bitIndex = index % 64; + fBitMasks[vectorIndex].bitset.set(bitIndex, true); + } + + bool MaskEmpty() const + { + return std::none_of(fBitMasks.begin(), fBitMasks.end(), [](Bitmask const &mask) { return mask.bitset.any(); }); + } + + void Write() const + { + for (auto const &mask : fBitMasks) { + *mask.branchBuffer = mask.bitset.to_ullong(); + } + + fTree->Fill(); + } +}; + +ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations( + std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, + const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine, + ROOT::Detail::RDF::RLoopManager *outputLoopMgr, ROOT::Detail::RDF::RLoopManager *inputLoopMgr, + const std::vector &colTypeIDs) + : fOptions(options), fInputLoopManager{inputLoopMgr}, fOutputLoopManager{outputLoopMgr} +{ + EnsureValidSnapshotTTreeOutput(fOptions, std::string(treename), std::string(filename)); + + TFile::TContext fileCtxt; + fOutputHandle = std::make_shared( + TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", + ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)), + nullptr); + if (!fOutputHandle->fFile) + throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename}); + + TDirectory *outputDir = fOutputHandle->fFile.get(); + if (!dirname.empty()) { + fOutputHandle->fDirectoryName = dirname; + TString checkupdate = fOptions.fMode; + checkupdate.ToLower(); + if (checkupdate == "update") + outputDir = + fOutputHandle->fFile->mkdir(std::string{dirname}.c_str(), "", true); // do not overwrite existing directory + else + outputDir = fOutputHandle->fFile->mkdir(std::string{dirname}.c_str()); + } + + fOutputHandle->fTree = std::make_unique(std::string{treename}.c_str(), std::string{treename}.c_str(), + fOptions.fSplitLevel, /*dir=*/outputDir); + fOutputHandle->fOutputLoopManager = fOutputLoopManager; + if (fOptions.fAutoFlush) + fOutputHandle->fTree->SetAutoFlush(fOptions.fAutoFlush); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + + fBranchData.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + fOutputHandle->RegisterBranch(outputBranchNames[i], 0); + fBranchData.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]); + } +} + +SnapshotHelperWithVariations::~SnapshotHelperWithVariations() +{ + // FIXME: Check if this needs to be enabled: + // if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) { + // const auto fileOpenMode = [&]() { + // TString checkupdate = fOptions.fMode; + // checkupdate.ToLower(); + // return checkupdate == "update" ? "updated" : "created"; + // }(); + // Warning("Snapshot", + // "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. " + // "In case it was desired instead, remember to trigger the Snapshot operation, by storing " + // "its result in a variable and for example calling the GetValue() method on it.", + // fTreeName.c_str(), fFileName.c_str(), fileOpenMode); + // } +} + +void SnapshotHelperWithVariations::RegisterVariedColumn(unsigned int /*slot*/, unsigned int columnIndex, + unsigned int originalColumnIndex, unsigned int variationIndex, + std::string const &variationName) +{ + if (columnIndex == originalColumnIndex) { + fBranchData[columnIndex].fVariationIndex = variationIndex; // The base column has variations + fOutputHandle->RegisterBranch(fBranchData[columnIndex].fOutputBranchName, variationIndex); + } else if (columnIndex >= fBranchData.size()) { + // First task, need to create branches + fBranchData.resize(columnIndex + 1); + auto &bd = fBranchData[columnIndex]; + bd = fBranchData[originalColumnIndex]; + std::string newOutputName = bd.fOutputBranchName + "__" + variationName; + std::replace(newOutputName.begin(), newOutputName.end(), ':', '_'); + bd.fOutputBranchName = std::move(newOutputName); + bd.fVariationIndex = variationIndex; + + fOutputHandle->RegisterBranch(bd.fOutputBranchName, variationIndex); + } else { + assert(static_cast(fBranchData[columnIndex].fVariationIndex) == variationIndex); + } +} + +void SnapshotHelperWithVariations::InitTask(TTreeReader *, unsigned int) +{ + // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing + // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks. + if (auto treeDS = dynamic_cast(fInputLoopManager->GetDataSource())) + fInputTree = treeDS->GetTree(); + + // Create all output branches; and bind them to empty values + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, + fBranchData[i].EmptyInstance()); + } + + // TODO: Don't run this every time + AssertNoNullBranchAddresses(fBranchData); +} + +/// Connect all output fields to the values pointed to by `values`, fill the output dataset, and +/// Call the Fill of the output tree, and clear the mask bits that show whether a variation was reached. +/// This function must be called from exactly one snapshot action. +/// It triggers the fill of the shared tree at the end of each event. +void SnapshotHelperWithVariations::Exec(unsigned int slot, const std::vector &values, + std::vector const &filterPassed) +{ + // Rebind branch pointers to RDF values + assert(fBranchData.size() == values.size()); + for (std::size_t i = 0; i < values.size(); i++) { + const auto variationIndex = fBranchData[i].fVariationIndex; + if (variationIndex < 0) { + // Branch without variations + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]); + } else if (filterPassed[variationIndex]) { + // Branch with variations + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]); + fOutputHandle->SetMaskBit(variationIndex); + } + } + + if (fOutputHandle->MaskEmpty()) + return; + + if (!fOutputHandle->fTree) + throw std::runtime_error("The TTree associated to the Snapshot action doesn't exist, any more."); + + fOutputHandle->Write(); + fOutputHandle->ClearMaskBits(); + ResetBranchAddresses(slot); +} + +/// Reset all branches to empty values. +void SnapshotHelperWithVariations::ResetBranchAddresses(unsigned int /*slot*/) +{ + for (auto &branchData : fBranchData) { + branchData.ResetBranchAddressToEmtpyInstance(); + } +} + +void SnapshotHelperWithVariations::Finalize() +{ + fOutputHandle.reset(); +} + +} // namespace ROOT::Internal::RDF From cacd5d0629dc78404b775708f0ba329531481c72 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 18:18:58 +0200 Subject: [PATCH 12/15] [RDF] Make RBranchData nothrow copy/move constructible. This allows for faster vector resizes. --- .../inc/ROOT/RDF/SnapshotHelpers.hxx | 4 ++ tree/dataframe/src/RDFSnapshotHelpers.cxx | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index 1b82353e70eec..637f75a03a465 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -115,11 +115,15 @@ struct RBranchData { fIsDefine{isDefine} { } + RBranchData(RBranchData const &other) noexcept { *this = other; } + RBranchData(RBranchData &&other) noexcept { *this = std::move(other); } ~RBranchData() { if (fEmptyInstanceDeleter) fEmptyInstanceDeleter(fEmptyInstance); } + RBranchData &operator=(RBranchData const &other) noexcept; + RBranchData &operator=(RBranchData &&other) noexcept; void ClearBranchPointers() { diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 401f63ee2f2cf..b704cf0198e2f 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -39,6 +39,9 @@ #include using ROOT::Internal::RDF::RBranchData; +// Maintaining the following allows for faster vector resize: +static_assert(std::is_nothrow_move_assignable_v); +static_assert(std::is_nothrow_move_constructible_v); namespace { @@ -361,6 +364,43 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, } } // namespace +RBranchData &RBranchData::operator=(RBranchData const &other) noexcept +{ + fInputBranchName = other.fInputBranchName; + fOutputBranchName = other.fOutputBranchName; + fInputTypeID = other.fInputTypeID; + fOutputBranch = other.fOutputBranch; + fBranchAddressForCArrays = other.fBranchAddressForCArrays; + fVariationIndex = other.fVariationIndex; + fEmptyInstance = nullptr; + fNullBytes = other.fNullBytes; + fEmptyInstanceDeleter = nullptr; + fIsCArray = other.fIsCArray; + fIsDefine = other.fIsDefine; + + return *this; +} + +RBranchData &RBranchData::operator=(RBranchData &&other) noexcept +{ + fInputBranchName = std::move(other.fInputBranchName); + fOutputBranchName = std::move(other.fOutputBranchName); + fInputTypeID = other.fInputTypeID; + fOutputBranch = other.fOutputBranch; + fBranchAddressForCArrays = other.fBranchAddressForCArrays; + fVariationIndex = other.fVariationIndex; + fEmptyInstance = other.fEmptyInstance; + fNullBytes = other.fNullBytes; + fEmptyInstanceDeleter = other.fEmptyInstanceDeleter; + fIsCArray = other.fIsCArray; + fIsDefine = other.fIsDefine; + + // This is why it's not = default: + other.fEmptyInstanceDeleter = nullptr; + + return *this; +} + void *ROOT::Internal::RDF::RBranchData::EmptyInstance() { if (!fEmptyInstance) { From 8066d6e97cb9cf6a104881ca08ed066f28046172 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 13:50:12 +0200 Subject: [PATCH 13/15] [RDF] Add a flag to RSnapshotOptions for snapshot with variations. --- .../dataframe/inc/ROOT/RDF/InterfaceUtils.hxx | 22 ++++++++++++++----- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 4 ++-- tree/dataframe/inc/ROOT/RSnapshotOptions.hxx | 1 + 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index a7b2fdfb81ed4..ddda4698a8f78 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -270,6 +270,7 @@ struct SnapshotHelperArgs { ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; bool fToNTuple; + bool fIncludeVariations; }; template @@ -309,12 +310,23 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr; - actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, - std::move(isDefine), outputLM, inputLM, colTypeIDs), - colNames, colTypeIDs, prevNode, colRegister)); + if (snapHelperArgs->fIncludeVariations) { + using Helper_t = SnapshotHelperWithVariations; + using Action_t = RActionSnapshot; + actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, + std::move(isDefine), outputLM, inputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); + } else { + using Helper_t = UntypedSnapshotTTreeHelper; + using Action_t = RActionSnapshot; + actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, + std::move(isDefine), outputLM, inputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); + } } else { + if (snapHelperArgs->fIncludeVariations) { + throw std::invalid_argument("Multi-threaded snapshot with variations is not supported yet."); + } // multi-thread snapshot using Helper_t = UntypedSnapshotTTreeHelperMT; using Action_t = RActionSnapshot; diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index 56ae9c5f8da44..33662b2c8efdc 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1384,7 +1384,7 @@ public: auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */}); + options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false}); auto &&nColumns = colListNoAliasesWithSizeBranches.size(); const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); @@ -1422,7 +1422,7 @@ public: auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */}); + options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations}); auto &&nColumns = colListNoAliasesWithSizeBranches.size(); const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index 6d558d2188e46..cfdc4eaac457c 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -53,6 +53,7 @@ struct RSnapshotOptions { bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk + bool fIncludeVariations = false; ///< Include columns that result from a Vary() action int fBasketSize = -1; ///< Set a custom basket size option. For more details, see ///< https://root.cern/manual/trees/#baskets-clusters-and-the-tree-header ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to From 288765027feca7fcac2510e5013041553fb74ff6 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 Aug 2025 14:55:28 +0200 Subject: [PATCH 14/15] [RDF] Enable usage of snapshot with variations in RActionSnapshot. Depending on the type of the action helper (classic or with variations), add column readers for variations to the snapshot action. --- .../inc/ROOT/RDF/RActionSnapshot.hxx | 75 +++++++++++++++++-- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx index 19d176cb75a66..a5de6e27f2d40 100644 --- a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx @@ -29,6 +29,8 @@ std::shared_ptr AddDefinesToGraph(std::shared_ptr node, co std::unordered_map> &visitedMap); } // namespace GraphDrawing +class SnapshotHelperWithVariations; + template class R__CLING_PTRCHECK(off) RActionSnapshot final : public RActionBase { @@ -65,6 +67,27 @@ public: fIsDefine.reserve(nColumns); for (auto i = 0u; i < nColumns; ++i) fIsDefine.push_back(colRegister.IsDefineOrAlias(columns[i])); + + if (const auto &variations = GetVariations(); !variations.empty()) { + // Get pointers to previous nodes of all systematics + fPrevNodes.reserve(1 + variations.size()); + auto nominalFilter = fPrevNodes.front(); + if (static_cast(nominalFilter.get()) == fLoopManager) { + // just fill this with the RLoopManager N times + fPrevNodes.resize(1 + variations.size(), nominalFilter); + } else { + // create varied versions of the previous filter node + const auto &prevVariations = nominalFilter->GetVariations(); + for (const auto &variation : variations) { + if (IsStrInVec(variation, prevVariations)) { + fPrevNodes.emplace_back( + std::static_pointer_cast(nominalFilter->GetVariedFilter(variation))); + } else { + fPrevNodes.emplace_back(nominalFilter); + } + } + } + } } RActionSnapshot(const RActionSnapshot &) = delete; @@ -89,6 +112,26 @@ public: { fValues[slot] = GetUntypedColumnReaders(slot, r, RActionBase::GetColRegister(), *fLoopManager, RActionBase::GetColumnNames(), fColTypeIDs); + + if constexpr (std::is_same_v) { + // In case of systematic variations, append also the varied column readers to the values + // that get passed to the helpers + auto const &variations = GetVariations(); + for (unsigned int variationIndex = 0; variationIndex < variations.size(); ++variationIndex) { + auto const &readers = + GetUntypedColumnReaders(slot, r, RActionBase::GetColRegister(), *fLoopManager, + RActionBase::GetColumnNames(), fColTypeIDs, variations[variationIndex]); + for (unsigned int i = 0; i < readers.size(); ++i) { + if (fValues[slot][i] != readers[i]) { + fValues[slot].push_back(readers[i]); + fHelper.RegisterVariedColumn(slot, i, i, 0, "nominal"); + fHelper.RegisterVariedColumn(slot, fValues[slot].size() - 1, i, variationIndex + 1, + variations[variationIndex]); + } + } + } + } + fHelper.InitTask(r, slot); } @@ -119,9 +162,27 @@ public: void Run(unsigned int slot, Long64_t entry) final { - // check if entry passes all filters - if (fPrevNodes.front()->CheckFilters(slot, entry)) - CallExec(slot, entry); + if constexpr (std::is_same_v) { + // check if entry passes all filters + std::vector filterPassed(fPrevNodes.size(), false); + for (unsigned int variation = 0; variation < fPrevNodes.size(); ++variation) { + filterPassed[variation] = fPrevNodes[variation]->CheckFilters(slot, entry); + } + + if (std::any_of(filterPassed.begin(), filterPassed.end(), [](bool val) { return val; })) { + // TODO: Don't allocate + std::vector untypedValues; + auto nReaders = fValues[slot].size(); + untypedValues.reserve(nReaders); + for (decltype(nReaders) readerIdx{}; readerIdx < nReaders; readerIdx++) + untypedValues.push_back(GetValue(slot, readerIdx, entry)); + + fHelper.Exec(slot, untypedValues, filterPassed); + } + } else { + if (fPrevNodes.front()->CheckFilters(slot, entry)) + CallExec(slot, entry); + } } void TriggerChildrenCount() final @@ -164,14 +225,14 @@ public: return thisNode; } - /// This method is invoked to update a partial result during the event loop, right before passing the result to a - /// user-defined callback registered via RResultPtr::RegisterCallback + /// Forwards to the action helpers; will throw since PartialUpdate not supported for most snapshot helpers. void *PartialUpdate(unsigned int slot) final { return fHelper.CallPartialUpdate(slot); } + /// Will throw, since varied actions are unsupported. Instead, set a flag in RSnapshotOptions. [[maybe_unused]] std::unique_ptr MakeVariedAction(std::vector && /*results*/) final { - // TODO: Probably we also need an untyped RVariedAction - throw std::runtime_error("RDataFrame::Snapshot: Snapshot with systematic variations is not supported yet."); + throw std::logic_error("RDataFrame::Snapshot: The snapshot action cannot be varied. Instead, switch on " + "variations in RSnapshotOptions."); } /** From 4701fcc74637c19af4b41677b0070451f99a141b Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Thu, 21 Aug 2025 13:44:57 +0200 Subject: [PATCH 15/15] [RDF] Add tests for snapshot with variations. --- .../test/dataframe_snapshotWithVariations.cxx | 328 ++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100644 tree/dataframe/test/dataframe_snapshotWithVariations.cxx diff --git a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx new file mode 100644 index 0000000000000..faa8e39b7b42a --- /dev/null +++ b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx @@ -0,0 +1,328 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +constexpr bool verbose = false; + +using ROOT::RDF::Experimental::VariationsFor; + +void printTree(TTree &tree) +{ + tree.Print(); + tree.SetScanField(50); + tree.Scan("*", "", "colsize=15", 50); +} + +template +void checkOutput(TTree &tree, std::vector const &systematics, F &&activeCuts) +{ + X_t x; + Y_t y; + + for (auto const &sysName : systematics) { + ASSERT_EQ(TTree::kMatch, tree.SetBranchAddress(("x" + sysName).c_str(), &x)) << ("x" + sysName); + ASSERT_EQ(TTree::kMatch, tree.SetBranchAddress(("y" + sysName).c_str(), &y)) << ("y" + sysName); + + for (unsigned int i = 0; i < tree.GetEntries(); ++i) { + ASSERT_GT(tree.GetEntry(i), 0); + + EXPECT_EQ(x, -1 * y); + if (!activeCuts(x, y)) { + EXPECT_EQ(x, X_t{}); + EXPECT_EQ(y, Y_t{}); + } + } + + tree.ResetBranchAddresses(); + } +} + +TEST(RDFVarySnapshot, SimpleRDFWithFilters) +{ + constexpr auto filename = "VarySnapshot.root"; + constexpr unsigned int N = 10; + ROOT::RDF::RSnapshotOptions options; + options.fLazy = true; + options.fOverwriteIfExists = true; + options.fIncludeVariations = true; + + auto cuts = [](float x, double y) { return x < 50 || y < -70; }; + + auto h = ROOT::RDataFrame(N) + .Define("entry_beforeX", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("x", [](ULong64_t e) -> float { return 10.f * e; }, {"rdfentry_"}) + .Define("entry_afterX", "rdfentry_") + .Vary( + "x", [](float x) { return ROOT::RVecF{x - 0.5f, x + 0.5f}; }, {"x"}, 2, "xVar") + .Define("y", [](float x) -> double { return -1. * x; }, {"x"}) + .Define("entry_afterVary", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Filter(cuts, {"x", "y"}) + .Snapshot("t", filename, {"x", "y", "entry_beforeX", "entry_afterX", "entry_afterVary"}, options); + h.GetPtr(); + + { + std::unique_ptr file{TFile::Open(filename)}; + auto tree = file->Get("t"); + if (verbose) + printTree(*tree); + + EXPECT_EQ(tree->GetEntries(), 9); // Event number 6 won't pass any filters + EXPECT_EQ(tree->GetNbranches(), 10); // 6 branches for x/y with variations, three not-varied branches, bitmask + for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) + EXPECT_NE(tree->FindBranch(branchName), nullptr) << branchName; + + checkOutput(*tree, std::vector{"__xVar_0", "__xVar_1"}, cuts); + + if (HasFailure()) { + tree->Print(); + tree->Scan(); + } + } + + if (!HasFailure()) + std::remove(filename); +} + +TEST(RDFVarySnapshot, RDFFromTTree) +{ + constexpr auto inFile = "VarySnapshot_1.root"; + constexpr unsigned int N = 10; + auto in = ROOT::RDataFrame(N) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("x", [](ULong64_t e) -> int { return 10 * int(e); }, {"rdfentry_"}) + .Define("y", [](int x) -> float { return -1 * x; }, {"x"}) + .Snapshot("t", inFile, {"x", "y", "entry"}); + auto nextRDF = in.GetValue(); + { + std::unique_ptr fileIn{TFile::Open(inFile, "READ")}; + TTree *tree = fileIn->Get("t"); + TBranch *branch = nullptr; + + ASSERT_NE(branch = tree->GetBranch("x"), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), "Int_t"); + + ASSERT_NE(branch = tree->GetBranch("y"), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), "Float_t"); + if (verbose) + tree->Scan("*", "", "", 5); + } + + constexpr auto outfile = "VarySnapshot_2.root"; + ROOT::RDF::RSnapshotOptions options; + options.fLazy = true; + options.fIncludeVariations = true; + auto filter = [](int x, Long64_t y) { return (20 <= x && x < 70) && y != -50; }; + + auto var = ROOT::RDataFrame("t", {inFile, inFile}) + .Vary( + "x", [](int x) { return ROOT::RVecI{x - 1, x + 1}; }, {"x"}, 2, "xVariation") + .Redefine("y", [](int x) -> Long64_t { return -1 * x; }, {"x"}) + .Filter(filter, {"x", "y"}, "nominal filter") + .Snapshot("t", outfile, {"x", "y", "entry"}, options); + auto thirdRDF = var.GetValue(); + + { + std::unique_ptr file{TFile::Open(outfile)}; + auto tree = file->Get("t"); + if (verbose) { + tree->Print(); + tree->Scan("*", "", "colsize=20", 20); + } + + EXPECT_EQ(tree->GetEntries(), 2 * 6); // x=0, 10, 70, and y = -50 don't pass + EXPECT_EQ(tree->GetNbranches(), 8); // 3 variations of x and y, mask bits, and entry number + for (const auto [branchName, branchType] : + std::initializer_list>{{"entry", "ULong64_t"}, + {"x", "Int_t"}, + {"y", "Long64_t"}, + {"x__xVariation_0", "Int_t"}, + {"x__xVariation_1", "Int_t"}, + {"y__xVariation_0", "Long64_t"}, + {"y__xVariation_1", "Long64_t"}}) { + TBranch *branch; + ASSERT_NE(branch = tree->GetBranch(branchName), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), branchType); + } + + checkOutput(*tree, std::vector{"__xVariation_0", "__xVariation_1"}, filter); + + if (HasFailure()) { + tree->Print(); + tree->Scan(); + } + } + + if (!HasFailure()) { + std::remove(inFile); + std::remove(outfile); + } +} + +TEST(RDFVarySnapshot, Bitmask) +{ + constexpr auto filename = "VarySnapshot_bitmask.root"; + std::string const treename = "testTree"; + constexpr unsigned int N = 15; + constexpr unsigned int NSystematics = 130; // Will use three bitmask branches + ROOT::RDF::RSnapshotOptions options; + options.fLazy = false; + options.fOverwriteIfExists = true; + options.fIncludeVariations = true; + + auto cuts = [](int x, int y) { return x % 2 == 0 && y % 3 == 0; }; + + ROOT::RDataFrame(N) + .Define("x", [](ULong64_t e) -> int { return e; }, {"rdfentry_"}) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Vary( + "x", + [](int x) { + std::vector systOffsets(NSystematics); + std::iota(systOffsets.begin(), systOffsets.end(), 0); + std::transform(systOffsets.begin(), systOffsets.end(), systOffsets.begin(), + [x](int offset) { return x + offset; }); + return ROOT::RVecI{systOffsets}; + }, + {"x"}, NSystematics, "xVar") + .Define("y", [](int x) -> int { return -1 * x; }, {"x"}) + .Filter(cuts, {"x", "y"}) + .Snapshot(treename, filename, {"entry", "x", "y"}, options); + + { + TFile file(filename, "READ"); + std::unique_ptr tree{file.Get(treename.data())}; + ASSERT_NE(tree, nullptr); + TBranch *branch = tree->GetBranch(("R_rdf_mask_" + treename + "_0").c_str()); + ASSERT_NE(branch, nullptr); + + auto *branchToIndexMap = file.Get>>( + ("R_rdf_branchToBitmaskMapping_" + treename).c_str()); + ASSERT_NE(branchToIndexMap, nullptr); + for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) { + ASSERT_NE(branchToIndexMap->find(branchName), branchToIndexMap->end()); + } + EXPECT_EQ((*branchToIndexMap)["x"], (*branchToIndexMap)["y"]); + + for (unsigned int systematic = 0; systematic < NSystematics; ++systematic) { + int x, y; + ULong64_t rdfentry; + uint64_t bitmask; + std::string const systematicName = std::string{"__xVar_"} + std::to_string(systematic); + std::string const branchName_x = "x" + systematicName; + std::string const branchName_y = "y" + systematicName; + ASSERT_EQ(tree->SetBranchAddress(branchName_x.c_str(), &x), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress(branchName_y.c_str(), &y), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress("entry", &rdfentry), TTree::kMatch); + const auto it = branchToIndexMap->find(branchName_x); + ASSERT_NE(it, branchToIndexMap->end()); + ASSERT_EQ(tree->SetBranchAddress(it->second.first.c_str(), &bitmask), TTree::kMatch) << it->second.first; + ASSERT_NE(branchToIndexMap->find(branchName_y), branchToIndexMap->end()); + EXPECT_EQ(it->second.first, (*branchToIndexMap)[branchName_y].first); + EXPECT_EQ(it->second.second, (*branchToIndexMap)[branchName_y].second); + for (unsigned int i = 0; i < N; ++i) { + tree->GetEntry(i); + EXPECT_EQ(rdfentry, i); + const int xExpected = rdfentry + systematic; + const int yExpected = -1 * xExpected; + if (cuts(xExpected, yExpected)) { + EXPECT_EQ(x, xExpected) << "event=" << i << " systematic=" << systematic << " rdfentry=" << rdfentry; + EXPECT_EQ(y, yExpected) << "event=" << i << " systematic=" << systematic << " rdfentry=" << rdfentry; + } + + const std::bitset<64> bs{bitmask}; + const auto bitIndex = it->second.second; + EXPECT_EQ(cuts(xExpected, yExpected), bs[bitIndex]) + << "event=" << i << " syst=" << systematic << " x(" << branchName_x << ")=" << x << " y(" << branchName_y + << ")=" << y << " rdfentry=" << rdfentry << " bitset: " << bs << " bitIndex: " << bitIndex; + if (HasFailure()) + break; + } + tree->ResetBranchAddresses(); + if (HasFailure()) + break; + } + + if (HasFailure()) { + tree->Scan("entry:x:y:x__xVar_0:y__xVar_0:x__xVar_1:y__xVar_1:x__xVar_2:y__xVar_2"); + } + } + + if (!HasFailure()) + std::remove(filename); +} + +TEST(RDFVarySnapshot, SnapshotCollections) +{ + constexpr auto filename = "VarySnapshot.root"; + constexpr unsigned int N = 10; + ROOT::RDF::RSnapshotOptions options; + options.fIncludeVariations = true; + + gInterpreter->GenerateDictionary("std::vector >", "vector;ROOT/RVec.hxx"); + + auto cuts = [](int x, ROOT::RVecI const &y) { return x % 2 == 0 || y[0] == 5; }; + + auto variation = ROOT::RDataFrame(N) + .Define("x", [](ULong64_t e) -> int { return int(e); }, {"rdfentry_"}) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("dummyBranch", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Vary( + "x", [](int x) { return ROOT::RVecI{x + 1, x * 3}; }, {"x"}, 2, "xVariation") + .Define("y", [](int x) { return ROOT::RVecI{x, x + 1, x + 2, x + 3}; }, {"x"}) + .Filter(cuts, {"x", "y"}) + .Snapshot("t", filename, {"x", "y", "entry"}, options); + variation.GetPtr(); + + std::unique_ptr file{TFile::Open(filename)}; + auto tree = file->Get("t"); + if (verbose) + printTree(*tree); + + EXPECT_EQ(tree->GetEntries(), N); + EXPECT_EQ(tree->GetNbranches(), 8); + for (const auto branchName : + {"x", "y", "x__xVariation_0", "x__xVariation_0", "y__xVariation_0", "y__xVariation_0", "entry"}) + EXPECT_NE(tree->GetBranch(branchName), nullptr) << branchName; + EXPECT_EQ(tree->GetBranch("dummyBranch"), nullptr); + + for (std::string systematicName : {"", "__xVariation_0", "__xVariation_1"}) { + ULong64_t entry; + int x; + ROOT::RVecI *y = nullptr; + ASSERT_EQ(tree->SetBranchAddress("entry", &entry), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress(("x" + systematicName).c_str(), &x), TTree::kMatch) + << ("x" + systematicName); + ASSERT_EQ(tree->SetBranchAddress(("y" + systematicName).c_str(), &y), TTree::kMatch) << ("y" + systematicName); + + for (unsigned int event = 0; event < N; ++event) { + ASSERT_GT(tree->GetEntry(event), 0); + const auto originalX = + (systematicName == "") + ? entry + : ((systematicName.find("xVariation_0") != std::string::npos) ? entry + 1 : entry * 3); + const bool passCuts = (originalX % 2 == 0) || originalX == 5; + + if (passCuts) + EXPECT_EQ(x, originalX) << "sys:'" << systematicName << "' originalX: " << originalX << " event: " << event; + else + EXPECT_EQ(x, 0) << "sys:'" << systematicName << "' originalX: " << originalX << " event: " << event; + + ASSERT_EQ(y->size(), passCuts ? 4 : 0) + << "sys:'" << systematicName << "' entry: " << entry << " originalX: " << originalX << " event: " << event; + for (unsigned int i = 0; i < y->size(); ++i) { + EXPECT_EQ((*y)[i], x + i); + } + } + tree->ResetBranchAddresses(); + } + + file.reset(); + gSystem->Unlink(filename); +} \ No newline at end of file