diff --git a/cmake/modules/RootMacros.cmake b/cmake/modules/RootMacros.cmake index 1824c07d18f63..29f197c8b8c51 100644 --- a/cmake/modules/RootMacros.cmake +++ b/cmake/modules/RootMacros.cmake @@ -2369,7 +2369,7 @@ macro(ROOTTEST_COMPILE_MACRO filename) endif() set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) set_property(TEST ${COMPILE_MACRO_TEST} PROPERTY @@ -2475,7 +2475,7 @@ macro(ROOTTEST_GENERATE_DICTIONARY dictname) set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_DICTIONARY_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) @@ -2592,7 +2592,7 @@ macro(ROOTTEST_GENERATE_REFLEX_DICTIONARY dictionary) set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY ENVIRONMENT ${ROOTTEST_ENVIRONMENT}) if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_REFLEX_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if (ARG_FIXTURES_SETUP) @@ -2714,7 +2714,7 @@ macro(ROOTTEST_GENERATE_EXECUTABLE executable) endif() if(CMAKE_GENERATOR MATCHES Ninja AND NOT MSVC) - set_property(TEST ${GENERATE_EXECUTABLE_TEST} PROPERTY RUN_SERIAL true) + set_property(TEST ${GENERATE_EXECUTABLE_TEST} PROPERTY RESOURCE_LOCK NINJA_COMPILATION) endif() if(MSVC AND NOT CMAKE_GENERATOR MATCHES Ninja) diff --git a/core/clingutils/src/unordered_mapLinkdef.h b/core/clingutils/src/unordered_mapLinkdef.h index 80339b792b137..86f326a1c1aef 100644 --- a/core/clingutils/src/unordered_mapLinkdef.h +++ b/core/clingutils/src/unordered_mapLinkdef.h @@ -1,3 +1,4 @@ +// clang-format off #include #include @@ -28,3 +29,6 @@ #pragma create TClass unordered_map; #pragma create TClass unordered_map; #pragma create TClass unordered_map; + +// For snapshot with systematic variations in RDF: +#pragma create TClass unordered_map>; diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index a7b2fdfb81ed4..ddda4698a8f78 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -270,6 +270,7 @@ struct SnapshotHelperArgs { ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; bool fToNTuple; + bool fIncludeVariations; }; template @@ -309,12 +310,23 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr; - actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, - std::move(isDefine), outputLM, inputLM, colTypeIDs), - colNames, colTypeIDs, prevNode, colRegister)); + if (snapHelperArgs->fIncludeVariations) { + using Helper_t = SnapshotHelperWithVariations; + using Action_t = RActionSnapshot; + actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, + std::move(isDefine), outputLM, inputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); + } else { + using Helper_t = UntypedSnapshotTTreeHelper; + using Action_t = RActionSnapshot; + actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, + std::move(isDefine), outputLM, inputLM, colTypeIDs), + colNames, colTypeIDs, prevNode, colRegister)); + } } else { + if (snapHelperArgs->fIncludeVariations) { + throw std::invalid_argument("Multi-threaded snapshot with variations is not supported yet."); + } // multi-thread snapshot using Helper_t = UntypedSnapshotTTreeHelperMT; using Action_t = RActionSnapshot; diff --git a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx index 86cfd9243d741..a5de6e27f2d40 100644 --- a/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RActionSnapshot.hxx @@ -29,6 +29,8 @@ std::shared_ptr AddDefinesToGraph(std::shared_ptr node, co std::unordered_map> &visitedMap); } // namespace GraphDrawing +class SnapshotHelperWithVariations; + template class R__CLING_PTRCHECK(off) RActionSnapshot final : public RActionBase { @@ -36,7 +38,7 @@ class R__CLING_PTRCHECK(off) RActionSnapshot final : public RActionBase { Helper fHelper; /// Pointer to the previous node in this branch of the computation graph - std::shared_ptr fPrevNode; + std::vector> fPrevNodes; /// Column readers per slot and per input column std::vector> fValues; @@ -54,8 +56,8 @@ public: const std::vector &colTypeIDs, std::shared_ptr pd, const RColumnRegister &colRegister) : RActionBase(pd->GetLoopManagerUnchecked(), columns, colRegister, pd->GetVariations()), - fHelper(std::forward(h)), - fPrevNode(std::move(pd)), + fHelper(std::move(h)), + fPrevNodes{std::move(pd)}, fValues(GetNSlots()), fColTypeIDs(colTypeIDs) { @@ -65,6 +67,27 @@ public: fIsDefine.reserve(nColumns); for (auto i = 0u; i < nColumns; ++i) fIsDefine.push_back(colRegister.IsDefineOrAlias(columns[i])); + + if (const auto &variations = GetVariations(); !variations.empty()) { + // Get pointers to previous nodes of all systematics + fPrevNodes.reserve(1 + variations.size()); + auto nominalFilter = fPrevNodes.front(); + if (static_cast(nominalFilter.get()) == fLoopManager) { + // just fill this with the RLoopManager N times + fPrevNodes.resize(1 + variations.size(), nominalFilter); + } else { + // create varied versions of the previous filter node + const auto &prevVariations = nominalFilter->GetVariations(); + for (const auto &variation : variations) { + if (IsStrInVec(variation, prevVariations)) { + fPrevNodes.emplace_back( + std::static_pointer_cast(nominalFilter->GetVariedFilter(variation))); + } else { + fPrevNodes.emplace_back(nominalFilter); + } + } + } + } } RActionSnapshot(const RActionSnapshot &) = delete; @@ -89,11 +112,33 @@ public: { fValues[slot] = GetUntypedColumnReaders(slot, r, RActionBase::GetColRegister(), *fLoopManager, RActionBase::GetColumnNames(), fColTypeIDs); + + if constexpr (std::is_same_v) { + // In case of systematic variations, append also the varied column readers to the values + // that get passed to the helpers + auto const &variations = GetVariations(); + for (unsigned int variationIndex = 0; variationIndex < variations.size(); ++variationIndex) { + auto const &readers = + GetUntypedColumnReaders(slot, r, RActionBase::GetColRegister(), *fLoopManager, + RActionBase::GetColumnNames(), fColTypeIDs, variations[variationIndex]); + for (unsigned int i = 0; i < readers.size(); ++i) { + if (fValues[slot][i] != readers[i]) { + fValues[slot].push_back(readers[i]); + fHelper.RegisterVariedColumn(slot, i, i, 0, "nominal"); + fHelper.RegisterVariedColumn(slot, fValues[slot].size() - 1, i, variationIndex + 1, + variations[variationIndex]); + } + } + } + } + fHelper.InitTask(r, slot); } void *GetValue(unsigned int slot, std::size_t readerIdx, Long64_t entry) { + assert(slot < fValues.size()); + assert(readerIdx < fValues[slot].size()); if (auto *val = fValues[slot][readerIdx]->template TryGet(entry)) return val; @@ -117,12 +162,34 @@ public: void Run(unsigned int slot, Long64_t entry) final { - // check if entry passes all filters - if (fPrevNode->CheckFilters(slot, entry)) - CallExec(slot, entry); + if constexpr (std::is_same_v) { + // check if entry passes all filters + std::vector filterPassed(fPrevNodes.size(), false); + for (unsigned int variation = 0; variation < fPrevNodes.size(); ++variation) { + filterPassed[variation] = fPrevNodes[variation]->CheckFilters(slot, entry); + } + + if (std::any_of(filterPassed.begin(), filterPassed.end(), [](bool val) { return val; })) { + // TODO: Don't allocate + std::vector untypedValues; + auto nReaders = fValues[slot].size(); + untypedValues.reserve(nReaders); + for (decltype(nReaders) readerIdx{}; readerIdx < nReaders; readerIdx++) + untypedValues.push_back(GetValue(slot, readerIdx, entry)); + + fHelper.Exec(slot, untypedValues, filterPassed); + } + } else { + if (fPrevNodes.front()->CheckFilters(slot, entry)) + CallExec(slot, entry); + } } - void TriggerChildrenCount() final { fPrevNode->IncrChildrenCount(); } + void TriggerChildrenCount() final + { + for (auto const &node : fPrevNodes) + node->IncrChildrenCount(); + } /// Clean-up operations to be performed at the end of a task. void FinalizeSlot(unsigned int slot) final @@ -142,29 +209,30 @@ public: std::shared_ptr GetGraph(std::unordered_map> &visitedMap) final { - auto prevNode = fPrevNode->GetGraph(visitedMap); - const auto &prevColumns = prevNode->GetDefinedColumns(); - // Action nodes do not need to go through CreateFilterNode: they are never common nodes between multiple branches const auto nodeType = HasRun() ? GraphDrawing::ENodeType::kUsedAction : GraphDrawing::ENodeType::kAction; auto thisNode = std::make_shared(fHelper.GetActionName(), visitedMap.size(), nodeType); visitedMap[(void *)this] = thisNode; - auto upmostNode = AddDefinesToGraph(thisNode, GetColRegister(), prevColumns, visitedMap); + for (auto const &node : fPrevNodes) { + auto prevNode = node->GetGraph(visitedMap); + const auto &prevColumns = prevNode->GetDefinedColumns(); + auto upmostNode = AddDefinesToGraph(thisNode, GetColRegister(), prevColumns, visitedMap); - thisNode->AddDefinedColumns(GetColRegister().GenerateColumnNames()); - upmostNode->SetPrevNode(prevNode); + thisNode->AddDefinedColumns(GetColRegister().GenerateColumnNames()); + upmostNode->SetPrevNode(prevNode); + } return thisNode; } - /// This method is invoked to update a partial result during the event loop, right before passing the result to a - /// user-defined callback registered via RResultPtr::RegisterCallback + /// Forwards to the action helpers; will throw since PartialUpdate not supported for most snapshot helpers. void *PartialUpdate(unsigned int slot) final { return fHelper.CallPartialUpdate(slot); } + /// Will throw, since varied actions are unsupported. Instead, set a flag in RSnapshotOptions. [[maybe_unused]] std::unique_ptr MakeVariedAction(std::vector && /*results*/) final { - // TODO: Probably we also need an untyped RVariedAction - throw std::runtime_error("RDataFrame::Snapshot: Snapshot with systematic variations is not supported yet."); + throw std::logic_error("RDataFrame::Snapshot: The snapshot action cannot be varied. Instead, switch on " + "variations in RSnapshotOptions."); } /** @@ -175,8 +243,8 @@ public: */ std::unique_ptr CloneAction(void *newResult) final { - return std::make_unique(fHelper.CallMakeNew(newResult), GetColumnNames(), fColTypeIDs, fPrevNode, - GetColRegister()); + return std::make_unique(fHelper.CallMakeNew(newResult), GetColumnNames(), fColTypeIDs, + fPrevNodes.front(), GetColRegister()); } }; diff --git a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx index 34192c0ad90ee..754d4cdfcb5fe 100644 --- a/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RDefineReader.hxx @@ -62,7 +62,7 @@ class RDefinesWithReaders { // (see BookDefineJit). it is never null. std::shared_ptr fDefine; // Column readers per variation (in the map) per slot (in the vector). - std::vector>> fReadersPerVariation; + std::vector>> fReadersPerVariation; // Strings that were already used to represent column names in this RDataFrame instance. ROOT::Internal::RDF::RStringCache &fCachedColNames; diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index bd662bc633802..33662b2c8efdc 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -882,7 +882,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -988,7 +988,7 @@ public: /// return an RVec of varied values, one for each variation tag, in the same order as the tags. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1036,7 +1036,7 @@ public: /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] inputColumns the names of the columns to be passed to the callable. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1091,7 +1091,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1126,7 +1126,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec or RVecs containing the varied /// values for the specified columns. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// /// This overload adds the possibility for the expression used to evaluate the varied values to be just-in-time @@ -1163,7 +1163,7 @@ public: /// \param[in] expression a string containing valid C++ code that evaluates to an RVec containing the varied /// values for the specified column. /// \param[in] nVariations number of variations returned by the expression. The corresponding tags will be `"0"`, - /// `"1"`, etc. + /// `"1"`, etc. /// \param[in] variationName a generic name for this set of varied values, e.g. `"ptvariation"`. /// colName is used if none is provided. /// @@ -1251,6 +1251,27 @@ public: /// \param[in] options RSnapshotOptions struct with extra options to pass to the output TFile and TTree/RNTuple. /// \return a `RDataFrame` that wraps the snapshotted dataset. /// + template + R__DEPRECATED( + 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + const ColumnNames_t &columnList, + const RSnapshotOptions &options = RSnapshotOptions()) + { + return Snapshot(treename, filename, columnList, options); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. + /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnList The list of names of the columns/branches/fields to be written. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. + /// \return a `RDataFrame` that wraps the snapshotted dataset. + /// + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. + /// /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.') /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot. /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also @@ -1306,28 +1327,6 @@ public: /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts); /// ~~~ - template - R__DEPRECATED( - 6, 40, "Snapshot does not need template arguments anymore, you can safely remove them from this function call.") - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - const ColumnNames_t &columnList, - const RSnapshotOptions &options = RSnapshotOptions()) - { - return Snapshot(treename, filename, columnList, options); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. - /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. - /// \param[in] columnList The list of names of the columns/branches/fields to be written. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. - /// - /// See above for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, const RSnapshotOptions &options = RSnapshotOptions()) @@ -1385,7 +1384,7 @@ public: auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */}); + options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false}); auto &&nColumns = colListNoAliasesWithSizeBranches.size(); const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); @@ -1423,7 +1422,7 @@ public: auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */}); + options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations}); auto &&nColumns = colListNoAliasesWithSizeBranches.size(); const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); @@ -1464,7 +1463,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::string_view columnNameRegexp = "", const RSnapshotOptions &options = RSnapshotOptions()) @@ -1507,7 +1506,7 @@ public: /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. /// The types of the columns are automatically inferred and do not need to be specified. /// - /// See above for a more complete description and example usages. + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. RResultPtr> Snapshot(std::string_view treename, std::string_view filename, std::initializer_list columnList, const RSnapshotOptions &options = RSnapshotOptions()) @@ -2362,13 +2361,13 @@ public: /// auto myGAE2 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); /// ~~~ /// - /// `GraphAssymErrors` should also be used for the cases in which values associated only with - /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. - /// In such cases, user should do the following: + /// `GraphAssymErrors` should also be used for the cases in which values associated only with + /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. + /// In such cases, user should do the following: /// ~~~{.cpp} /// // Create a column of zeros in RDataFrame - /// auto rdf_withzeros = rdf.Define("zero", "0"); - /// // or alternatively: + /// auto rdf_withzeros = rdf.Define("zero", "0"); + /// // or alternatively: /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;}); /// // Create the graph with y errors only /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh"); diff --git a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx index e7ebca8041e95..637f75a03a465 100644 --- a/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/SnapshotHelpers.hxx @@ -26,6 +26,8 @@ #include #include +#include + class TBranch; class TFile; @@ -38,19 +40,6 @@ class TBufferMergerFile; namespace ROOT::Internal::RDF { -class RBranchSet { - std::vector fBranches; - std::vector fNames; - std::vector fIsCArray; - -public: - TBranch *Get(const std::string &name) const; - bool IsCArray(const std::string &name) const; - void Insert(const std::string &name, TBranch *address, bool isCArray = false); - void Clear(); - void AssertNoNullBranchAddresses(); -}; - class R__CLING_PTRCHECK(off) UntypedSnapshotRNTupleHelper final : public RActionImpl { std::string fFileName; std::string fDirName; @@ -102,6 +91,49 @@ public: UntypedSnapshotRNTupleHelper MakeNew(void *newName); }; +/// Stores properties of each output branch in a Snapshot. +struct RBranchData { + std::string fInputBranchName; // This contains resolved aliases + std::string fOutputBranchName; + const std::type_info *fInputTypeID = nullptr; + TBranch *fOutputBranch = nullptr; + void *fBranchAddressForCArrays = nullptr; // Used to detect if branch addresses need to be updated + + int fVariationIndex = -1; // For branches that are only valid if a specific filter passed + void *fEmptyInstance = nullptr; + alignas(8) std::array fNullBytes{ + std::byte{0}}; // Nullbytes for writing 0 into branches of primitive types + DesFunc_t fEmptyInstanceDeleter = nullptr; + bool fIsCArray = false; + bool fIsDefine = false; + + RBranchData() = default; + RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, const std::type_info *typeID) + : fInputBranchName{std::move(inputBranchName)}, + fOutputBranchName{std::move(outputBranchName)}, + fInputTypeID{typeID}, + fIsDefine{isDefine} + { + } + RBranchData(RBranchData const &other) noexcept { *this = other; } + RBranchData(RBranchData &&other) noexcept { *this = std::move(other); } + ~RBranchData() + { + if (fEmptyInstanceDeleter) + fEmptyInstanceDeleter(fEmptyInstance); + } + RBranchData &operator=(RBranchData const &other) noexcept; + RBranchData &operator=(RBranchData &&other) noexcept; + + void ClearBranchPointers() + { + fOutputBranch = nullptr; + fBranchAddressForCArrays = nullptr; + } + void *EmptyInstance(); + void ResetBranchAddressToEmtpyInstance(); +}; + class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionImpl { std::string fFileName; std::string fDirName; @@ -110,17 +142,10 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelper final : public RActionIm std::unique_ptr fOutputFile; std::unique_ptr fOutputTree; // must be a ptr because TTrees are not copy/move constructible bool fBranchAddressesNeedReset{true}; - ColumnNames_t fInputBranchNames; // This contains the resolved aliases - ColumnNames_t fOutputBranchNames; TTree *fInputTree = nullptr; // Current input tree. Set at initialization time (`InitTask`) - // TODO we might be able to unify fBranches, fBranchAddresses and fOutputBranches - std::vector fBranches; // Addresses of branches in output, non-null only for the ones holding C arrays - std::vector fBranchAddresses; // Addresses of objects associated to output branches - RBranchSet fOutputBranches; - std::vector fIsDefine; + std::vector fBranchData; // Information for all output branches ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputColumnTypeIDs; // Types for the input columns public: UntypedSnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename, @@ -169,11 +194,7 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::vector> fOutputTrees; std::vector fBranchAddressesNeedReset; // vector does not allow concurrent writing of different elements std::vector fInputTrees; // Current input trees, one per slot. Set at initialization time (`InitTask`) - // Addresses of branches in output per slot, non-null only for the ones holding C arrays - std::vector> fBranches; - // Addresses of objects associated to output branches per slot, non-null only for the ones holding C arrays - std::vector> fBranchAddresses; - std::vector fOutputBranches; // Unique set of output branches, one per slot. + std::vector> fBranchData; // Information for all output branches of each slot // Attributes of the output TTree @@ -182,16 +203,11 @@ class R__CLING_PTRCHECK(off) UntypedSnapshotTTreeHelperMT final : public RAction std::string fTreeName; TFile *fOutputFile; // Non-owning view on the output file RSnapshotOptions fOptions; - std::vector fOutputBranchNames; // Attributes related to the computation graph ROOT::Detail::RDF::RLoopManager *fOutputLoopManager; ROOT::Detail::RDF::RLoopManager *fInputLoopManager; - std::vector fInputBranchNames; // This contains the resolved aliases - std::vector fInputColumnTypeIDs; // Types for the input columns - - std::vector fIsDefine; public: UntypedSnapshotTTreeHelperMT(unsigned int nSlots, std::string_view filename, std::string_view dirname, @@ -232,6 +248,55 @@ public: UntypedSnapshotTTreeHelperMT MakeNew(void *newName, std::string_view /*variation*/ = "nominal"); }; +struct SnapshotOutputWriter; + +/// Helper object for a single-thread Snapshot action +class R__CLING_PTRCHECK(off) SnapshotHelperWithVariations + : public ROOT::Detail::RDF::RActionImpl { + RSnapshotOptions fOptions; + std::shared_ptr fOutputHandle; + TTree *fInputTree = nullptr; // Current input tree. Set at initialization time (`InitTask`) + std::vector fBranchData; + ROOT::Detail::RDF::RLoopManager *fInputLoopManager = nullptr; + ROOT::Detail::RDF::RLoopManager *fOutputLoopManager = nullptr; + + void ClearOutputBranches(); + +public: + SnapshotHelperWithVariations(std::string_view filename, std::string_view dirname, std::string_view treename, + const ColumnNames_t & /*vbnames*/, const ColumnNames_t &bnames, + const RSnapshotOptions &options, std::vector && /*isDefine*/, + ROOT::Detail::RDF::RLoopManager *outputLoopMgr, + ROOT::Detail::RDF::RLoopManager *inputLoopMgr, + const std::vector &colTypeIDs); + + SnapshotHelperWithVariations(SnapshotHelperWithVariations &&) = default; + ~SnapshotHelperWithVariations(); + + void RegisterVariedColumn(unsigned int slot, unsigned int columnIndex, unsigned int originalColumnIndex, + unsigned int varationIndex, std::string const &variationName); + + void InitTask(TTreeReader *, unsigned int slot); + + void Exec(unsigned int /*slot*/, const std::vector &values, std::vector const &filterPassed); + + void FlushEvent(unsigned int slot); + + void ResetBranchAddresses(unsigned int slot); + + void Initialize() {} + + void Finalize(); + + std::string GetActionName() { return "SnapshotWithVariations"; } + + ROOT::RDF::SampleCallback_t GetSampleCallback() final + { + // TODO: Needed? + return [this](unsigned int, const RSampleInfo &) mutable { ; }; + } +}; + } // namespace ROOT::Internal::RDF #endif diff --git a/tree/dataframe/inc/ROOT/RDFHelpers.hxx b/tree/dataframe/inc/ROOT/RDFHelpers.hxx index 27fbcef39a794..34ae4f129d769 100644 --- a/tree/dataframe/inc/ROOT/RDFHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDFHelpers.hxx @@ -222,6 +222,10 @@ namespace Experimental { template RResultMap VariationsFor(RResultPtr resPtr) { + using SnapshotResult_t = ROOT::RDF::RInterface; + static_assert(!std::is_same_v, + "Snapshot with variations only can be enabled via RSnapshotOptions."); + R__ASSERT(resPtr != nullptr && "Calling VariationsFor on an empty RResultPtr"); // populate parts of the computation graph for which we only have "empty shells", e.g. RJittedActions and @@ -270,9 +274,6 @@ RResultMap VariationsFor(RResultPtr resPtr) *resPtr.fLoopManager, std::move(nominalAction), std::move(variedAction)); } -using SnapshotPtr_t = ROOT::RDF::RResultPtr>; -SnapshotPtr_t VariationsFor(SnapshotPtr_t resPtr); - /// \brief Add ProgressBar to a ROOT::RDF::RNode /// \param[in] df RDataFrame node at which ProgressBar is called. /// diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index 6d558d2188e46..cfdc4eaac457c 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -53,6 +53,7 @@ struct RSnapshotOptions { bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk + bool fIncludeVariations = false; ///< Include columns that result from a Vary() action int fBasketSize = -1; ///< Set a custom basket size option. For more details, see ///< https://root.cern/manual/trees/#baskets-clusters-and-the-tree-header ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to diff --git a/tree/dataframe/src/RDFHelpers.cxx b/tree/dataframe/src/RDFHelpers.cxx index a4d8ad422c623..dc4c5d7515937 100644 --- a/tree/dataframe/src/RDFHelpers.cxx +++ b/tree/dataframe/src/RDFHelpers.cxx @@ -141,15 +141,7 @@ unsigned int ROOT::RDF::RunGraphs(std::vector handles) return uniqueLoops.size(); } -ROOT::RDF::Experimental::SnapshotPtr_t ROOT::RDF::Experimental::VariationsFor(ROOT::RDF::Experimental::SnapshotPtr_t) -{ - throw std::logic_error("Varying a Snapshot result is not implemented yet."); -} - -namespace ROOT { -namespace RDF { - -namespace Experimental { +namespace ROOT::RDF::Experimental { void ThreadsPerTH3(unsigned int N) { @@ -398,6 +390,5 @@ void AddProgressBar(ROOT::RDataFrame dataframe) auto node = ROOT::RDF::AsRNode(dataframe); ROOT::RDF::Experimental::AddProgressBar(node); } -} // namespace Experimental -} // namespace RDF -} // namespace ROOT + +} // namespace ROOT::RDF::Experimental diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 94c254f4d7078..b704cf0198e2f 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -34,8 +34,48 @@ #include #include +#include +#include +#include + +using ROOT::Internal::RDF::RBranchData; +// Maintaining the following allows for faster vector resize: +static_assert(std::is_nothrow_move_assignable_v); +static_assert(std::is_nothrow_move_constructible_v); + namespace { +void AssertNoNullBranchAddresses(std::vector const &branches) +{ + std::vector branchesWithNullAddress; + for (auto const &branchData : branches) { + if (branchData.fOutputBranch->GetAddress() == nullptr) + branchesWithNullAddress.push_back(branchData.fOutputBranch); + } + + if (branchesWithNullAddress.empty()) + return; + + // otherwise build error message and throw + std::vector missingBranchNames; + std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), + std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); + std::string msg = "RDataFrame::Snapshot:"; + if (missingBranchNames.size() == 1) { + msg += " branch " + missingBranchNames[0] + + " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " + "it is"; + } else { + msg += " branches "; + for (const auto &bName : missingBranchNames) + msg += bName + ", "; + msg.resize(msg.size() - 2); // remove last ", " + msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; + } + msg += " not part of the set of branches that are being written out."; + throw std::runtime_error(msg); +} + TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) { if (inputTree) { @@ -49,35 +89,50 @@ TBranch *SearchForBranch(TTree *inputTree, const std::string &branchName) return nullptr; } -void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, TBranch *inputBranch, - const std::string &outputBranchName, int basketSize, void *address) +std::vector::iterator CreateCStyleArrayBranch(TTree &outputTree, std::vector &outputBranches, + std::vector::iterator thisBranch, + TBranch *inputBranch, int basketSize, void *address) { if (!inputBranch) - return; + return thisBranch; const auto STLKind = TClassEdit::IsSTLCont(inputBranch->GetClassName()); if (STLKind == ROOT::ESTLType::kSTLvector || STLKind == ROOT::ESTLType::kROOTRVec) - return; + return thisBranch; // must construct the leaflist for the output branch and create the branch in the output tree const auto *leaf = static_cast(inputBranch->GetListOfLeaves()->UncheckedAt(0)); if (!leaf) - return; + return thisBranch; const auto bname = leaf->GetName(); auto *sizeLeaf = leaf->GetLeafCount(); const auto sizeLeafName = sizeLeaf ? std::string(sizeLeaf->GetName()) : std::to_string(leaf->GetLenStatic()); // We proceed only if branch is a fixed-or-variable-sized array if (sizeLeaf || leaf->GetLenStatic() > 1) { - if (sizeLeaf && !outputBranches.Get(sizeLeafName)) { - // The output array branch `bname` has dynamic size stored in leaf `sizeLeafName`, but that leaf has not been - // added to the output tree yet. However, the size leaf has to be available for the creation of the array - // branch to be successful. So we create the size leaf here. - const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); - // Use Original basket size for Existing Branches otherwise use Custom basket Size. - const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); - // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for `sizeLeafName` - auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), - (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); - outputBranches.Insert(sizeLeafName, outputBranch); + if (sizeLeaf) { + // The array branch `bname` has dynamic size stored in leaf `sizeLeafName`, so we need to ensure that it's + // in the output tree. + auto sizeLeafIt = + std::find_if(outputBranches.begin(), outputBranches.end(), + [&sizeLeafName](RBranchData const &bd) { return bd.fOutputBranchName == sizeLeafName; }); + if (sizeLeafIt == outputBranches.end()) { + // The size leaf is not part of the output branches yet, so emplace an empty slot for it. + // This means that iterators need to be updated in case the container reallocates. + const auto indexBeforeEmplace = std::distance(outputBranches.begin(), thisBranch); + outputBranches.emplace_back("", sizeLeafName, /*isDefine=*/false, /*typeID=*/nullptr); + thisBranch = outputBranches.begin() + indexBeforeEmplace; + sizeLeafIt = outputBranches.end() - 1; + } + if (!sizeLeafIt->fOutputBranch) { + // The size leaf was emplaced, but not initialised yet + const auto sizeTypeStr = ROOT::Internal::RDF::TypeName2ROOTTypeName(sizeLeaf->GetTypeName()); + // Use Original basket size for Existing Branches otherwise use Custom basket Size. + const auto bufSize = (basketSize > 0) ? basketSize : sizeLeaf->GetBranch()->GetBasketSize(); + // The null branch address is a placeholder. It will be set when SetBranchesHelper is called for + // `sizeLeafName` + auto *outputBranch = outputTree.Branch(sizeLeafName.c_str(), static_cast(nullptr), + (sizeLeafName + '/' + sizeTypeStr).c_str(), bufSize); + sizeLeafIt->fOutputBranch = outputBranch; + } } const auto btype = leaf->GetTypeName(); @@ -87,7 +142,7 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet "RDataFrame::Snapshot: could not correctly construct a leaflist for C-style array in column %s. The " "leaf is of type '%s'. This column will not be written out.", bname, btype); - return; + return thisBranch; } const auto leaflist = std::string(bname) + "[" + sizeLeafName + "]/" + rootbtype; @@ -102,23 +157,26 @@ void CreateCStyleArrayBranch(TTree &outputTree, ROOT::Internal::RDF::RBranchSet } return nullptr; }(); - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); - outputBranch->SetTitle(inputBranch->GetTitle()); - outputBranches.Insert(outputBranchName, outputBranch, true); + thisBranch->fOutputBranch = + outputTree.Branch(thisBranch->fOutputBranchName.c_str(), addressForBranch, leaflist.c_str(), bufSize); + thisBranch->fOutputBranch->SetTitle(inputBranch->GetTitle()); + thisBranch->fIsCArray = true; } + + return thisBranch; } -void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&outputBranchAddress, bool isCArray, - void *valueAddress) +void SetBranchAddress(TBranch *inputBranch, RBranchData &branchData, void *valueAddress) { const static TClassRef TBOClRef("TBranchObject"); if (inputBranch && inputBranch->IsA() == TBOClRef) { - outputBranch.SetAddress(reinterpret_cast(inputBranch->GetAddress())); - } else if (outputBranch.IsA() != TBranch::Class()) { - outputBranchAddress = valueAddress; - outputBranch.SetAddress(&outputBranchAddress); + branchData.fOutputBranch->SetAddress(reinterpret_cast(inputBranch->GetAddress())); + } else if (branchData.fOutputBranch->IsA() != TBranch::Class()) { + // This is a relatively rare case of a fixed-size array getting redefined + branchData.fBranchAddressForCArrays = valueAddress; + branchData.fOutputBranch->SetAddress(&branchData.fBranchAddressForCArrays); } else { - void *correctAddress = [valueAddress, isCArray]() -> void * { + void *correctAddress = [valueAddress, isCArray = branchData.fIsCArray]() -> void * { if (isCArray) { // Address here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer @@ -127,29 +185,26 @@ void SetBranchAddress(TBranch *inputBranch, TBranch &outputBranch, void *&output } return valueAddress; }(); - outputBranch.SetAddress(correctAddress); - outputBranchAddress = valueAddress; + branchData.fOutputBranch->SetAddress(correctAddress); + branchData.fBranchAddressForCArrays = valueAddress; } } -void CreateFundamentalTypeBranch(TTree &outputTree, const std::string &outputBranchName, void *valueAddress, - const std::type_info &valueTypeID, ROOT::Internal::RDF::RBranchSet &outputBranches, - int bufSize) +void CreateFundamentalTypeBranch(TTree &outputTree, RBranchData &bd, void *valueAddress, int bufSize) { // Logic taken from // TTree::BranchImpRef( // const char* branchname, TClass* ptrClass, EDataType datatype, void* addobj, Int_t bufsize, Int_t splitlevel) - auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(valueTypeID); + auto rootTypeChar = ROOT::Internal::RDF::TypeID2ROOTTypeName(*bd.fInputTypeID); if (rootTypeChar == ' ') { Warning("Snapshot", "RDataFrame::Snapshot: could not correctly construct a leaflist for fundamental type in column %s. This " "column will not be written out.", - outputBranchName.c_str()); + bd.fOutputBranchName.c_str()); return; } - std::string leafList{outputBranchName + '/' + rootTypeChar}; - auto *outputBranch = outputTree.Branch(outputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + std::string leafList{bd.fOutputBranchName + '/' + rootTypeChar}; + bd.fOutputBranch = outputTree.Branch(bd.fOutputBranchName.c_str(), valueAddress, leafList.c_str(), bufSize); } /// Ensure that the TTree with the resulting snapshot can be written to the target TFile. This means checking that the @@ -238,13 +293,18 @@ void EnsureValidSnapshotRNTupleOutput(const ROOT::RDF::RSnapshotOptions &opts, c } } -void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF::RBranchSet &outputBranches, - int basketSize, const std::string &inputBranchName, const std::string &outputBranchName, - const std::type_info &valueTypeID, void *valueAddress, TBranch *&actionHelperBranchPtr, - void *&actionHelperBranchPtrAddress, bool isDefine) +void SetBranchesHelper(TTree *inputTree, TTree &outputTree, + std::vector &allBranchData, std::size_t currentIndex, + int basketSize, void *valueAddress) { + auto branchData = allBranchData.begin() + currentIndex; + auto *inputBranch = branchData->fIsDefine ? nullptr : SearchForBranch(inputTree, branchData->fInputBranchName); - auto *inputBranch = isDefine ? nullptr : SearchForBranch(inputTree, inputBranchName); + if (branchData->fOutputBranch && valueAddress) { + // The output branch was already created, we just need to (re)set its address + SetBranchAddress(inputBranch, *branchData, valueAddress); + return; + } // Respect the original bufsize and splitlevel arguments // In particular, by keeping splitlevel equal to 0 if this was the case for `inputBranch`, we avoid @@ -254,55 +314,47 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF: const auto bufSize = (basketSize > 0) ? basketSize : (inputBranch ? inputBranch->GetBasketSize() : 32000); const auto splitLevel = inputBranch ? inputBranch->GetSplitLevel() : 99; - if (auto *outputBranch = outputBranches.Get(outputBranchName); outputBranch && valueAddress) { - // The output branch was already created, we just need to (re)set its address - SetBranchAddress(inputBranch, *outputBranch, actionHelperBranchPtrAddress, - outputBranches.IsCArray(outputBranchName), valueAddress); - return; - } - - auto *dictionary = TDictionary::GetDictionary(valueTypeID); + auto *dictionary = TDictionary::GetDictionary(*branchData->fInputTypeID); if (dynamic_cast(dictionary)) { // Branch of fundamental type - CreateFundamentalTypeBranch(outputTree, outputBranchName, valueAddress, valueTypeID, outputBranches, bufSize); + CreateFundamentalTypeBranch(outputTree, *branchData, valueAddress, bufSize); return; } - if (!isDefine) { + if (!branchData->fIsDefine) { // Cases where we need a leaflist (e.g. C-style arrays) // We only enter this code path if the input value does not come from a Define/Redefine. In those cases, it is // not allowed to create a column of C-style array type, so that can't happen when writing the TTree. This is // currently what prevents writing the wrong branch output type in a scenario where the input branch of the TTree // is a C-style array and then the user is Redefining it with some other type (e.g. a ROOT::RVec). - CreateCStyleArrayBranch(outputTree, outputBranches, inputBranch, outputBranchName, bufSize, valueAddress); + branchData = CreateCStyleArrayBranch(outputTree, allBranchData, branchData, inputBranch, bufSize, valueAddress); } - if (auto *arrayBranch = outputBranches.Get(outputBranchName)) { + if (branchData->fOutputBranch) { // A branch was created in the previous function call - actionHelperBranchPtr = arrayBranch; if (valueAddress) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(valueAddress); - actionHelperBranchPtrAddress = rawRVec->data(); + branchData->fBranchAddressForCArrays = rawRVec->data(); } return; } if (auto *classPtr = dynamic_cast(dictionary)) { - TBranch *outputBranch{}; // Case of unsplit object with polymorphic type if (inputBranch && dynamic_cast(inputBranch) && valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImp(outputTree, outputBranchName.c_str(), classPtr, - inputBranch->GetAddress(), bufSize, splitLevel); + branchData->fOutputBranch = + ROOT::Internal::TreeUtils::CallBranchImp(outputTree, branchData->fOutputBranchName.c_str(), classPtr, + inputBranch->GetAddress(), bufSize, splitLevel); // General case, with valid address else if (valueAddress) - outputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef(outputTree, outputBranchName.c_str(), classPtr, - TDataType::GetType(valueTypeID), valueAddress, - bufSize, splitLevel); + branchData->fOutputBranch = ROOT::Internal::TreeUtils::CallBranchImpRef( + outputTree, branchData->fOutputBranchName.c_str(), classPtr, TDataType::GetType(*branchData->fInputTypeID), + valueAddress, bufSize, splitLevel); // No value was passed, we're just creating a hollow branch to populate the dataset schema else - outputBranch = outputTree.Branch(outputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); - outputBranches.Insert(outputBranchName, outputBranch); + branchData->fOutputBranch = + outputTree.Branch(branchData->fOutputBranchName.c_str(), classPtr->GetName(), nullptr, bufSize); return; } @@ -312,71 +364,68 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, ROOT::Internal::RDF: } } // namespace -TBranch *ROOT::Internal::RDF::RBranchSet::Get(const std::string &name) const +RBranchData &RBranchData::operator=(RBranchData const &other) noexcept { - auto it = std::find(fNames.begin(), fNames.end(), name); - if (it == fNames.end()) - return nullptr; - return fBranches[std::distance(fNames.begin(), it)]; + fInputBranchName = other.fInputBranchName; + fOutputBranchName = other.fOutputBranchName; + fInputTypeID = other.fInputTypeID; + fOutputBranch = other.fOutputBranch; + fBranchAddressForCArrays = other.fBranchAddressForCArrays; + fVariationIndex = other.fVariationIndex; + fEmptyInstance = nullptr; + fNullBytes = other.fNullBytes; + fEmptyInstanceDeleter = nullptr; + fIsCArray = other.fIsCArray; + fIsDefine = other.fIsDefine; + + return *this; } -bool ROOT::Internal::RDF::RBranchSet::IsCArray(const std::string &name) const +RBranchData &RBranchData::operator=(RBranchData &&other) noexcept { - if (auto it = std::find(fNames.begin(), fNames.end(), name); it != fNames.end()) - return fIsCArray[std::distance(fNames.begin(), it)]; - return false; + fInputBranchName = std::move(other.fInputBranchName); + fOutputBranchName = std::move(other.fOutputBranchName); + fInputTypeID = other.fInputTypeID; + fOutputBranch = other.fOutputBranch; + fBranchAddressForCArrays = other.fBranchAddressForCArrays; + fVariationIndex = other.fVariationIndex; + fEmptyInstance = other.fEmptyInstance; + fNullBytes = other.fNullBytes; + fEmptyInstanceDeleter = other.fEmptyInstanceDeleter; + fIsCArray = other.fIsCArray; + fIsDefine = other.fIsDefine; + + // This is why it's not = default: + other.fEmptyInstanceDeleter = nullptr; + + return *this; } -void ROOT::Internal::RDF::RBranchSet::Insert(const std::string &name, TBranch *address, bool isCArray) +void *ROOT::Internal::RDF::RBranchData::EmptyInstance() { - if (address == nullptr) { - throw std::logic_error("Trying to insert a null branch address."); - } - if (std::find(fBranches.begin(), fBranches.end(), address) != fBranches.end()) { - throw std::logic_error("Trying to insert a branch address that's already present."); - } - if (std::find(fNames.begin(), fNames.end(), name) != fNames.end()) { - throw std::logic_error("Trying to insert a branch name that's already present."); + if (!fEmptyInstance) { + auto *dictionary = TDictionary::GetDictionary(*fInputTypeID); + if (auto dataType = dynamic_cast(dictionary); dataType) { + assert(dataType->Size() <= 8); + fEmptyInstance = fNullBytes.data(); + } else { + assert(dynamic_cast(dictionary) != nullptr); + auto tclass = static_cast(dictionary); + fEmptyInstance = tclass->New(); + fEmptyInstanceDeleter = tclass->GetDestructor(); + } } - fNames.emplace_back(name); - fBranches.emplace_back(address); - fIsCArray.push_back(isCArray); + return fEmptyInstance; } -void ROOT::Internal::RDF::RBranchSet::Clear() +/// Point the branch address to an empty instance of the type represented by this branch. +/// This is used in case of variations, when certain defines/actions don't execute. We +/// nevertheless need to write something, so we point the branch to an empty instance. +void ROOT::Internal::RDF::RBranchData::ResetBranchAddressToEmtpyInstance() { - fBranches.clear(); - fNames.clear(); - fIsCArray.clear(); -} - -void ROOT::Internal::RDF::RBranchSet::AssertNoNullBranchAddresses() -{ - std::vector branchesWithNullAddress; - std::copy_if(fBranches.begin(), fBranches.end(), std::back_inserter(branchesWithNullAddress), - [](TBranch *b) { return b->GetAddress() == nullptr; }); - - if (branchesWithNullAddress.empty()) + if (!fOutputBranch) return; - - // otherwise build error message and throw - std::vector missingBranchNames; - std::transform(branchesWithNullAddress.begin(), branchesWithNullAddress.end(), - std::back_inserter(missingBranchNames), [](TBranch *b) { return b->GetName(); }); - std::string msg = "RDataFrame::Snapshot:"; - if (missingBranchNames.size() == 1) { - msg += " branch " + missingBranchNames[0] + - " is needed as it provides the size for one or more branches containing dynamically sized arrays, but " - "it is"; - } else { - msg += " branches "; - for (const auto &bName : missingBranchNames) - msg += bName + ", "; - msg.resize(msg.size() - 2); // remove last ", " - msg += " are needed as they provide the size of other branches containing dynamically sized arrays, but they are"; - } - msg += " not part of the set of branches that are being written out."; - throw std::runtime_error(msg); + fOutputBranch->SetAddress(EmptyInstance()); } ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( @@ -388,16 +437,16 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UntypedSnapshotTTreeHelper( fDirName(dirname), fTreeName(treename), fOptions(options), - fInputBranchNames(vbnames), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), - fBranches(vbnames.size(), nullptr), - fBranchAddresses(vbnames.size(), nullptr), - fIsDefine(std::move(isDefine)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputColumnTypeIDs(colTypeIDs) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + fBranchData.emplace_back(vbnames[i], std::move(outputBranchNames[i]), isDefine[i], colTypeIDs[i]); + } } // Define special member methods here where the definition of all the data member types is available @@ -449,17 +498,16 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches.size()); + assert(values.size() <= fBranchData.size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[i] && fOutputBranches.IsCArray(fOutputBranchNames[i])) { + if (fBranchData[i].fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[i] != data) { - // reset the branch address - fBranches[i]->SetAddress(data); - fBranchAddresses[i] = data; + if (auto *data = rawRVec->data(); fBranchData[i].fBranchAddressForCArrays != data) { + fBranchData[i].fOutputBranch->SetAddress(data); + fBranchData[i].fBranchAddressForCArrays = data; } } } @@ -468,26 +516,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::UpdateCArraysPtrs(const st void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetBranches(const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTree, *fOutputTree, fOutputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], fBranches[i], fBranchAddresses[i], - fIsDefine[i]); + assert(fBranchData.size() == values.size()); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(fInputTree, *fOutputTree, fBranchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches.AssertNoNullBranchAddresses(); + AssertNoNullBranchAddresses(fBranchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, fBranchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -551,16 +591,29 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelper ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelper{finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + inputColumnTypeIDs}; } ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( @@ -573,21 +626,25 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UntypedSnapshotTTreeHelperMT( fOutputTrees(fNSlots), fBranchAddressesNeedReset(fNSlots, 1), fInputTrees(fNSlots), - fBranches(fNSlots, std::vector(vbnames.size(), nullptr)), - fBranchAddresses(fNSlots, std::vector(vbnames.size(), nullptr)), - fOutputBranches(fNSlots), fFileName(filename), fDirName(dirname), fTreeName(treename), fOptions(options), - fOutputBranchNames(ReplaceDotWithUnderscore(bnames)), fOutputLoopManager(loopManager), - fInputLoopManager(inputLM), - fInputBranchNames(vbnames), - fInputColumnTypeIDs(colTypeIDs), - fIsDefine(std::move(isDefine)) + fInputLoopManager(inputLM) { EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + fBranchData.reserve(fNSlots); + for (unsigned int slot = 0; slot < fNSlots; ++slot) { + fBranchData.emplace_back(); + auto &thisSlot = fBranchData.back(); + thisSlot.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + thisSlot.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]); + } + } } // Define special member methods here where the definition of all the data member types is available @@ -647,9 +704,10 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::FinalizeTask(unsigned in { if (fOutputTrees[slot]->GetEntries() > 0) fOutputFiles[slot]->Write(); + for (auto &branchData : fBranchData[slot]) + branchData.ClearBranchPointers(); // The branch pointers will go stale below // clear now to avoid concurrent destruction of output trees and input tree (which has them listed as fClones) fOutputTrees[slot].reset(nullptr); - fOutputBranches[slot].Clear(); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Exec(unsigned int slot, const std::vector &values) @@ -674,17 +732,18 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::UpdateCArraysPtrs(unsign // associated to those is re-allocated. As a result the value of the pointer can change therewith // leaving associated to the branch of the output tree an invalid pointer. // With this code, we set the value of the pointer in the output branch anew when needed. - assert(values.size() == fBranches[slot].size()); + assert(values.size() <= fBranchData[slot].size()); auto nValues = values.size(); for (decltype(nValues) i{}; i < nValues; i++) { - if (fBranches[slot][i] && fOutputBranches[slot].IsCArray(fOutputBranchNames[i])) { + auto &branchData = fBranchData[slot][i]; + if (branchData.fIsCArray) { // valueAddress here points to a ROOT::RVec coming from RTreeUntypedArrayColumnReader. We know we // need its buffer, so we cast it and extract the address of the buffer auto *rawRVec = reinterpret_cast *>(values[i]); - if (auto *data = rawRVec->data(); fBranchAddresses[slot][i] != data) { + if (auto *data = rawRVec->data(); branchData.fBranchAddressForCArrays != data) { // reset the branch address - fBranches[slot][i]->SetAddress(data); - fBranchAddresses[slot][i] = data; + branchData.fOutputBranch->SetAddress(data); + branchData.fBranchAddressForCArrays = data; } } } @@ -694,26 +753,21 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetBranches(unsigned int const std::vector &values) { // create branches in output tree - auto nValues = values.size(); - for (decltype(nValues) i{}; i < nValues; i++) { - SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], fOutputBranches[slot], fOptions.fBasketSize, - fInputBranchNames[i], fOutputBranchNames[i], *fInputColumnTypeIDs[i], values[i], - fBranches[slot][i], fBranchAddresses[slot][i], fIsDefine[i]); + auto &branchData = fBranchData[slot]; + assert(branchData.size() == values.size()); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(fInputTrees[slot], *fOutputTrees[slot], branchData, i, fOptions.fBasketSize, values[i]); } - fOutputBranches[slot].AssertNoNullBranchAddresses(); + + AssertNoNullBranchAddresses(branchData); } void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree *inputTree, TTree &outputTree) { void *dummyValueAddress{}; - TBranch *dummyTBranchPtr{}; - void *dummyTBranchAddress{}; - RBranchSet outputBranches{}; - auto nBranches = fInputBranchNames.size(); - for (decltype(nBranches) i{}; i < nBranches; i++) { - SetBranchesHelper(inputTree, outputTree, outputBranches, fOptions.fBasketSize, fInputBranchNames[i], - fOutputBranchNames[i], *fInputColumnTypeIDs[i], dummyValueAddress, dummyTBranchPtr, - dummyTBranchAddress, fIsDefine[i]); + auto &branchData = fBranchData.front(); + for (std::size_t i = 0; i < branchData.size(); i++) { // branchData can grow due to insertions + SetBranchesHelper(inputTree, outputTree, branchData, i, fOptions.fBasketSize, dummyValueAddress); } } @@ -785,17 +839,30 @@ ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::MakeNew(void *newName, std::string_view) { const std::string finalName = *reinterpret_cast(newName); + std::vector inputBranchNames; + std::vector outputBranchNames; + std::vector isDefine; + std::vector inputColumnTypeIDs; + for (const auto &bd : fBranchData.front()) { + if (bd.fInputBranchName.empty()) + break; + inputBranchNames.push_back(bd.fInputBranchName); + outputBranchNames.push_back(bd.fOutputBranchName); + isDefine.push_back(bd.fIsDefine); + inputColumnTypeIDs.push_back(bd.fInputTypeID); + } + return ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT{fNSlots, finalName, fDirName, fTreeName, - fInputBranchNames, - fOutputBranchNames, + std::move(inputBranchNames), + std::move(outputBranchNames), fOptions, - std::vector(fIsDefine), + std::move(isDefine), fOutputLoopManager, fInputLoopManager, - fInputColumnTypeIDs}; + std::move(inputColumnTypeIDs)}; } ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::UntypedSnapshotRNTupleHelper( @@ -904,3 +971,260 @@ ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::MakeNew(void *newName) fInputLoopManager, fOutputLoopManager, std::vector(fIsDefine), fInputColumnTypeIDs}; } + +/* + * ------------------------------------ + * Snapshot with systematic variations + * ------------------------------------ + */ +namespace ROOT::Internal::RDF { +/// An object to store an output file and a tree in one common place to share them between instances +/// of Snapshot with systematic uncertainties. +struct SnapshotOutputWriter { + std::unique_ptr fFile; + std::unique_ptr fTree; + std::string fDirectoryName; + RLoopManager *fOutputLoopManager; + + // Bitmasks to indicate whether syst. uncertainties have been computed. Bound to TBranches, so need to be stable in + // memory. + struct Bitmask { + std::string branchName; + std::bitset<64> bitset{}; + std::unique_ptr branchBuffer{new uint64_t{}}; + }; + std::vector fBitMasks; + + std::unordered_map fBranchToVariationMapping; + // For the dictionary, see core/clingutils/src + std::unordered_map> fBranchToBitmaskMapping; + unsigned int fNBits = 0; + + SnapshotOutputWriter(TFile *file, TTree *tree) : fFile{file}, fTree{tree} {} + ~SnapshotOutputWriter() + { + if (!fBranchToBitmaskMapping.empty()) { + fFile->WriteObject(&fBranchToBitmaskMapping, + (std::string{"R_rdf_branchToBitmaskMapping_"} + fTree->GetName()).c_str()); + } + if (fTree) { + // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory + fTree->AutoSave("flushbaskets"); + + // Now connect the data source to the loop manager so it can be used for further processing + std::string tree = fTree->GetName(); + if (!fDirectoryName.empty()) + tree = fDirectoryName + '/' + tree; + std::string file = fFile->GetName(); + + fTree.reset(); + fFile.reset(); + + if (fOutputLoopManager) + fOutputLoopManager->SetDataSource(std::make_unique(tree, file)); + } + } + + /// Register a branch and corresponding systematic uncertainty. The index returned is the global index of this + /// systematic. + void RegisterBranch(std::string const &branchName, unsigned int variationIndex) + { + if (auto it = fBranchToVariationMapping.find(branchName); it != fBranchToVariationMapping.end()) { + if (variationIndex != it->second) { + throw std::logic_error("Branch " + branchName + " is being registered with different variation index."); + } + return; + } + + // Neither branch nor systematic are known, so a new entry needs to be created + fNBits = std::max(fNBits, variationIndex); + const auto vectorIndex = variationIndex / 64u; + const auto bitIndex = variationIndex % 64u; + + // Create bitmask branches as long as necessary to capture the bit + while (vectorIndex >= fBitMasks.size()) { + std::string bitmaskBranchName = + std::string{"R_rdf_mask_"} + fTree->GetName() + '_' + std::to_string(fBitMasks.size()); + fBitMasks.push_back(Bitmask{bitmaskBranchName}); + fTree->Branch(bitmaskBranchName.c_str(), fBitMasks.back().branchBuffer.get()); + } + + fBranchToVariationMapping[branchName] = variationIndex; + fBranchToBitmaskMapping[branchName] = std::make_pair(fBitMasks[vectorIndex].branchName, bitIndex); + } + + void ClearMaskBits() + { + for (auto &mask : fBitMasks) + mask.bitset.reset(); + } + + void SetMaskBit(unsigned int index) + { + const auto vectorIndex = index / 64; + const auto bitIndex = index % 64; + fBitMasks[vectorIndex].bitset.set(bitIndex, true); + } + + bool MaskEmpty() const + { + return std::none_of(fBitMasks.begin(), fBitMasks.end(), [](Bitmask const &mask) { return mask.bitset.any(); }); + } + + void Write() const + { + for (auto const &mask : fBitMasks) { + *mask.branchBuffer = mask.bitset.to_ullong(); + } + + fTree->Fill(); + } +}; + +ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations( + std::string_view filename, std::string_view dirname, std::string_view treename, const ColumnNames_t &vbnames, + const ColumnNames_t &bnames, const RSnapshotOptions &options, std::vector &&isDefine, + ROOT::Detail::RDF::RLoopManager *outputLoopMgr, ROOT::Detail::RDF::RLoopManager *inputLoopMgr, + const std::vector &colTypeIDs) + : fOptions(options), fInputLoopManager{inputLoopMgr}, fOutputLoopManager{outputLoopMgr} +{ + EnsureValidSnapshotTTreeOutput(fOptions, std::string(treename), std::string(filename)); + + TFile::TContext fileCtxt; + fOutputHandle = std::make_shared( + TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", + ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)), + nullptr); + if (!fOutputHandle->fFile) + throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename}); + + TDirectory *outputDir = fOutputHandle->fFile.get(); + if (!dirname.empty()) { + fOutputHandle->fDirectoryName = dirname; + TString checkupdate = fOptions.fMode; + checkupdate.ToLower(); + if (checkupdate == "update") + outputDir = + fOutputHandle->fFile->mkdir(std::string{dirname}.c_str(), "", true); // do not overwrite existing directory + else + outputDir = fOutputHandle->fFile->mkdir(std::string{dirname}.c_str()); + } + + fOutputHandle->fTree = std::make_unique(std::string{treename}.c_str(), std::string{treename}.c_str(), + fOptions.fSplitLevel, /*dir=*/outputDir); + fOutputHandle->fOutputLoopManager = fOutputLoopManager; + if (fOptions.fAutoFlush) + fOutputHandle->fTree->SetAutoFlush(fOptions.fAutoFlush); + + auto outputBranchNames = ReplaceDotWithUnderscore(bnames); + + fBranchData.reserve(vbnames.size()); + for (unsigned int i = 0; i < vbnames.size(); ++i) { + fOutputHandle->RegisterBranch(outputBranchNames[i], 0); + fBranchData.emplace_back(vbnames[i], outputBranchNames[i], isDefine[i], colTypeIDs[i]); + } +} + +SnapshotHelperWithVariations::~SnapshotHelperWithVariations() +{ + // FIXME: Check if this needs to be enabled: + // if (!fTreeName.empty() /*not moved from*/ && !fOutputFile /* did not run */ && fOptions.fLazy) { + // const auto fileOpenMode = [&]() { + // TString checkupdate = fOptions.fMode; + // checkupdate.ToLower(); + // return checkupdate == "update" ? "updated" : "created"; + // }(); + // Warning("Snapshot", + // "A lazy Snapshot action was booked but never triggered. The tree '%s' in output file '%s' was not %s. " + // "In case it was desired instead, remember to trigger the Snapshot operation, by storing " + // "its result in a variable and for example calling the GetValue() method on it.", + // fTreeName.c_str(), fFileName.c_str(), fileOpenMode); + // } +} + +void SnapshotHelperWithVariations::RegisterVariedColumn(unsigned int /*slot*/, unsigned int columnIndex, + unsigned int originalColumnIndex, unsigned int variationIndex, + std::string const &variationName) +{ + if (columnIndex == originalColumnIndex) { + fBranchData[columnIndex].fVariationIndex = variationIndex; // The base column has variations + fOutputHandle->RegisterBranch(fBranchData[columnIndex].fOutputBranchName, variationIndex); + } else if (columnIndex >= fBranchData.size()) { + // First task, need to create branches + fBranchData.resize(columnIndex + 1); + auto &bd = fBranchData[columnIndex]; + bd = fBranchData[originalColumnIndex]; + std::string newOutputName = bd.fOutputBranchName + "__" + variationName; + std::replace(newOutputName.begin(), newOutputName.end(), ':', '_'); + bd.fOutputBranchName = std::move(newOutputName); + bd.fVariationIndex = variationIndex; + + fOutputHandle->RegisterBranch(bd.fOutputBranchName, variationIndex); + } else { + assert(static_cast(fBranchData[columnIndex].fVariationIndex) == variationIndex); + } +} + +void SnapshotHelperWithVariations::InitTask(TTreeReader *, unsigned int) +{ + // We ask the input RLoopManager if it has a TTree. We cannot rely on getting this information when constructing + // this action helper, since the TTree might change e.g. when ChangeSpec is called in-between distributed tasks. + if (auto treeDS = dynamic_cast(fInputLoopManager->GetDataSource())) + fInputTree = treeDS->GetTree(); + + // Create all output branches; and bind them to empty values + for (std::size_t i = 0; i < fBranchData.size(); i++) { // fBranchData can grow due to insertions + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, + fBranchData[i].EmptyInstance()); + } + + // TODO: Don't run this every time + AssertNoNullBranchAddresses(fBranchData); +} + +/// Connect all output fields to the values pointed to by `values`, fill the output dataset, and +/// Call the Fill of the output tree, and clear the mask bits that show whether a variation was reached. +/// This function must be called from exactly one snapshot action. +/// It triggers the fill of the shared tree at the end of each event. +void SnapshotHelperWithVariations::Exec(unsigned int slot, const std::vector &values, + std::vector const &filterPassed) +{ + // Rebind branch pointers to RDF values + assert(fBranchData.size() == values.size()); + for (std::size_t i = 0; i < values.size(); i++) { + const auto variationIndex = fBranchData[i].fVariationIndex; + if (variationIndex < 0) { + // Branch without variations + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]); + } else if (filterPassed[variationIndex]) { + // Branch with variations + SetBranchesHelper(fInputTree, *fOutputHandle->fTree, fBranchData, i, fOptions.fBasketSize, values[i]); + fOutputHandle->SetMaskBit(variationIndex); + } + } + + if (fOutputHandle->MaskEmpty()) + return; + + if (!fOutputHandle->fTree) + throw std::runtime_error("The TTree associated to the Snapshot action doesn't exist, any more."); + + fOutputHandle->Write(); + fOutputHandle->ClearMaskBits(); + ResetBranchAddresses(slot); +} + +/// Reset all branches to empty values. +void SnapshotHelperWithVariations::ResetBranchAddresses(unsigned int /*slot*/) +{ + for (auto &branchData : fBranchData) { + branchData.ResetBranchAddressToEmtpyInstance(); + } +} + +void SnapshotHelperWithVariations::Finalize() +{ + fOutputHandle.reset(); +} + +} // namespace ROOT::Internal::RDF diff --git a/tree/dataframe/src/RDefineReader.cxx b/tree/dataframe/src/RDefineReader.cxx index e374219a41b2b..2126037e70248 100644 --- a/tree/dataframe/src/RDefineReader.cxx +++ b/tree/dataframe/src/RDefineReader.cxx @@ -32,19 +32,28 @@ ROOT::Internal::RDF::RDefinesWithReaders::GetReader(unsigned int slot, std::stri if (it != defineReaders.end()) return *it->second; + std::shared_ptr readerToReturn; auto *define = fDefine.get(); - if (*nameIt != "nominal") - define = &define->GetVariedDefine(std::string(variationName)); - -#if !defined(__clang__) && __GNUC__ >= 7 && __GNUC_MINOR__ >= 3 - const auto insertion = - defineReaders.insert({*nameIt, std::make_unique(slot, *define)}); - return *insertion.first->second; -#else - // gcc < 7.3 has issues with passing the non-movable std::pair temporary into the insert call - auto reader = std::make_unique(slot, *define); - auto &ret = *reader; - defineReaders[*nameIt] = std::move(reader); - return ret; -#endif + if (*nameIt == "nominal") { + readerToReturn = std::make_shared(slot, *define); + } else { + auto *variedDefine = &define->GetVariedDefine(std::string(variationName)); + if (variedDefine == define) { + // The column in not affected by variations. We can return the same reader as for nominal + if (auto nominalReaderIt = defineReaders.find("nominal"); nominalReaderIt != defineReaders.end()) { + readerToReturn = nominalReaderIt->second; + } else { + // The nominal reader doesn't exist yet + readerToReturn = std::make_shared(slot, *define); + auto nominalNameIt = fCachedColNames.Insert("nominal"); + defineReaders.insert({*nominalNameIt, readerToReturn}); + } + } else { + readerToReturn = std::make_shared(slot, *variedDefine); + } + } + + defineReaders.insert({*nameIt, readerToReturn}); + + return *readerToReturn; } diff --git a/tree/dataframe/src/RJittedDefine.cxx b/tree/dataframe/src/RJittedDefine.cxx index 99c369a2fb847..0ead1a3b0ce1d 100644 --- a/tree/dataframe/src/RJittedDefine.cxx +++ b/tree/dataframe/src/RJittedDefine.cxx @@ -66,5 +66,10 @@ void RJittedDefine::MakeVariations(const std::vector &variations) RDefineBase &RJittedDefine::GetVariedDefine(const std::string &variationName) { assert(fConcreteDefine != nullptr); - return fConcreteDefine->GetVariedDefine(variationName); + + auto &variedDefine = fConcreteDefine->GetVariedDefine(variationName); + if (&variedDefine == fConcreteDefine.get()) + return *this; // Ensures that the pointer is the same across all variations + else + return variedDefine; } diff --git a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx new file mode 100644 index 0000000000000..faa8e39b7b42a --- /dev/null +++ b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx @@ -0,0 +1,328 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +constexpr bool verbose = false; + +using ROOT::RDF::Experimental::VariationsFor; + +void printTree(TTree &tree) +{ + tree.Print(); + tree.SetScanField(50); + tree.Scan("*", "", "colsize=15", 50); +} + +template +void checkOutput(TTree &tree, std::vector const &systematics, F &&activeCuts) +{ + X_t x; + Y_t y; + + for (auto const &sysName : systematics) { + ASSERT_EQ(TTree::kMatch, tree.SetBranchAddress(("x" + sysName).c_str(), &x)) << ("x" + sysName); + ASSERT_EQ(TTree::kMatch, tree.SetBranchAddress(("y" + sysName).c_str(), &y)) << ("y" + sysName); + + for (unsigned int i = 0; i < tree.GetEntries(); ++i) { + ASSERT_GT(tree.GetEntry(i), 0); + + EXPECT_EQ(x, -1 * y); + if (!activeCuts(x, y)) { + EXPECT_EQ(x, X_t{}); + EXPECT_EQ(y, Y_t{}); + } + } + + tree.ResetBranchAddresses(); + } +} + +TEST(RDFVarySnapshot, SimpleRDFWithFilters) +{ + constexpr auto filename = "VarySnapshot.root"; + constexpr unsigned int N = 10; + ROOT::RDF::RSnapshotOptions options; + options.fLazy = true; + options.fOverwriteIfExists = true; + options.fIncludeVariations = true; + + auto cuts = [](float x, double y) { return x < 50 || y < -70; }; + + auto h = ROOT::RDataFrame(N) + .Define("entry_beforeX", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("x", [](ULong64_t e) -> float { return 10.f * e; }, {"rdfentry_"}) + .Define("entry_afterX", "rdfentry_") + .Vary( + "x", [](float x) { return ROOT::RVecF{x - 0.5f, x + 0.5f}; }, {"x"}, 2, "xVar") + .Define("y", [](float x) -> double { return -1. * x; }, {"x"}) + .Define("entry_afterVary", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Filter(cuts, {"x", "y"}) + .Snapshot("t", filename, {"x", "y", "entry_beforeX", "entry_afterX", "entry_afterVary"}, options); + h.GetPtr(); + + { + std::unique_ptr file{TFile::Open(filename)}; + auto tree = file->Get("t"); + if (verbose) + printTree(*tree); + + EXPECT_EQ(tree->GetEntries(), 9); // Event number 6 won't pass any filters + EXPECT_EQ(tree->GetNbranches(), 10); // 6 branches for x/y with variations, three not-varied branches, bitmask + for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) + EXPECT_NE(tree->FindBranch(branchName), nullptr) << branchName; + + checkOutput(*tree, std::vector{"__xVar_0", "__xVar_1"}, cuts); + + if (HasFailure()) { + tree->Print(); + tree->Scan(); + } + } + + if (!HasFailure()) + std::remove(filename); +} + +TEST(RDFVarySnapshot, RDFFromTTree) +{ + constexpr auto inFile = "VarySnapshot_1.root"; + constexpr unsigned int N = 10; + auto in = ROOT::RDataFrame(N) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("x", [](ULong64_t e) -> int { return 10 * int(e); }, {"rdfentry_"}) + .Define("y", [](int x) -> float { return -1 * x; }, {"x"}) + .Snapshot("t", inFile, {"x", "y", "entry"}); + auto nextRDF = in.GetValue(); + { + std::unique_ptr fileIn{TFile::Open(inFile, "READ")}; + TTree *tree = fileIn->Get("t"); + TBranch *branch = nullptr; + + ASSERT_NE(branch = tree->GetBranch("x"), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), "Int_t"); + + ASSERT_NE(branch = tree->GetBranch("y"), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), "Float_t"); + if (verbose) + tree->Scan("*", "", "", 5); + } + + constexpr auto outfile = "VarySnapshot_2.root"; + ROOT::RDF::RSnapshotOptions options; + options.fLazy = true; + options.fIncludeVariations = true; + auto filter = [](int x, Long64_t y) { return (20 <= x && x < 70) && y != -50; }; + + auto var = ROOT::RDataFrame("t", {inFile, inFile}) + .Vary( + "x", [](int x) { return ROOT::RVecI{x - 1, x + 1}; }, {"x"}, 2, "xVariation") + .Redefine("y", [](int x) -> Long64_t { return -1 * x; }, {"x"}) + .Filter(filter, {"x", "y"}, "nominal filter") + .Snapshot("t", outfile, {"x", "y", "entry"}, options); + auto thirdRDF = var.GetValue(); + + { + std::unique_ptr file{TFile::Open(outfile)}; + auto tree = file->Get("t"); + if (verbose) { + tree->Print(); + tree->Scan("*", "", "colsize=20", 20); + } + + EXPECT_EQ(tree->GetEntries(), 2 * 6); // x=0, 10, 70, and y = -50 don't pass + EXPECT_EQ(tree->GetNbranches(), 8); // 3 variations of x and y, mask bits, and entry number + for (const auto [branchName, branchType] : + std::initializer_list>{{"entry", "ULong64_t"}, + {"x", "Int_t"}, + {"y", "Long64_t"}, + {"x__xVariation_0", "Int_t"}, + {"x__xVariation_1", "Int_t"}, + {"y__xVariation_0", "Long64_t"}, + {"y__xVariation_1", "Long64_t"}}) { + TBranch *branch; + ASSERT_NE(branch = tree->GetBranch(branchName), nullptr); + EXPECT_STREQ(static_cast(branch->GetListOfLeaves()->At(0))->GetTypeName(), branchType); + } + + checkOutput(*tree, std::vector{"__xVariation_0", "__xVariation_1"}, filter); + + if (HasFailure()) { + tree->Print(); + tree->Scan(); + } + } + + if (!HasFailure()) { + std::remove(inFile); + std::remove(outfile); + } +} + +TEST(RDFVarySnapshot, Bitmask) +{ + constexpr auto filename = "VarySnapshot_bitmask.root"; + std::string const treename = "testTree"; + constexpr unsigned int N = 15; + constexpr unsigned int NSystematics = 130; // Will use three bitmask branches + ROOT::RDF::RSnapshotOptions options; + options.fLazy = false; + options.fOverwriteIfExists = true; + options.fIncludeVariations = true; + + auto cuts = [](int x, int y) { return x % 2 == 0 && y % 3 == 0; }; + + ROOT::RDataFrame(N) + .Define("x", [](ULong64_t e) -> int { return e; }, {"rdfentry_"}) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Vary( + "x", + [](int x) { + std::vector systOffsets(NSystematics); + std::iota(systOffsets.begin(), systOffsets.end(), 0); + std::transform(systOffsets.begin(), systOffsets.end(), systOffsets.begin(), + [x](int offset) { return x + offset; }); + return ROOT::RVecI{systOffsets}; + }, + {"x"}, NSystematics, "xVar") + .Define("y", [](int x) -> int { return -1 * x; }, {"x"}) + .Filter(cuts, {"x", "y"}) + .Snapshot(treename, filename, {"entry", "x", "y"}, options); + + { + TFile file(filename, "READ"); + std::unique_ptr tree{file.Get(treename.data())}; + ASSERT_NE(tree, nullptr); + TBranch *branch = tree->GetBranch(("R_rdf_mask_" + treename + "_0").c_str()); + ASSERT_NE(branch, nullptr); + + auto *branchToIndexMap = file.Get>>( + ("R_rdf_branchToBitmaskMapping_" + treename).c_str()); + ASSERT_NE(branchToIndexMap, nullptr); + for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) { + ASSERT_NE(branchToIndexMap->find(branchName), branchToIndexMap->end()); + } + EXPECT_EQ((*branchToIndexMap)["x"], (*branchToIndexMap)["y"]); + + for (unsigned int systematic = 0; systematic < NSystematics; ++systematic) { + int x, y; + ULong64_t rdfentry; + uint64_t bitmask; + std::string const systematicName = std::string{"__xVar_"} + std::to_string(systematic); + std::string const branchName_x = "x" + systematicName; + std::string const branchName_y = "y" + systematicName; + ASSERT_EQ(tree->SetBranchAddress(branchName_x.c_str(), &x), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress(branchName_y.c_str(), &y), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress("entry", &rdfentry), TTree::kMatch); + const auto it = branchToIndexMap->find(branchName_x); + ASSERT_NE(it, branchToIndexMap->end()); + ASSERT_EQ(tree->SetBranchAddress(it->second.first.c_str(), &bitmask), TTree::kMatch) << it->second.first; + ASSERT_NE(branchToIndexMap->find(branchName_y), branchToIndexMap->end()); + EXPECT_EQ(it->second.first, (*branchToIndexMap)[branchName_y].first); + EXPECT_EQ(it->second.second, (*branchToIndexMap)[branchName_y].second); + for (unsigned int i = 0; i < N; ++i) { + tree->GetEntry(i); + EXPECT_EQ(rdfentry, i); + const int xExpected = rdfentry + systematic; + const int yExpected = -1 * xExpected; + if (cuts(xExpected, yExpected)) { + EXPECT_EQ(x, xExpected) << "event=" << i << " systematic=" << systematic << " rdfentry=" << rdfentry; + EXPECT_EQ(y, yExpected) << "event=" << i << " systematic=" << systematic << " rdfentry=" << rdfentry; + } + + const std::bitset<64> bs{bitmask}; + const auto bitIndex = it->second.second; + EXPECT_EQ(cuts(xExpected, yExpected), bs[bitIndex]) + << "event=" << i << " syst=" << systematic << " x(" << branchName_x << ")=" << x << " y(" << branchName_y + << ")=" << y << " rdfentry=" << rdfentry << " bitset: " << bs << " bitIndex: " << bitIndex; + if (HasFailure()) + break; + } + tree->ResetBranchAddresses(); + if (HasFailure()) + break; + } + + if (HasFailure()) { + tree->Scan("entry:x:y:x__xVar_0:y__xVar_0:x__xVar_1:y__xVar_1:x__xVar_2:y__xVar_2"); + } + } + + if (!HasFailure()) + std::remove(filename); +} + +TEST(RDFVarySnapshot, SnapshotCollections) +{ + constexpr auto filename = "VarySnapshot.root"; + constexpr unsigned int N = 10; + ROOT::RDF::RSnapshotOptions options; + options.fIncludeVariations = true; + + gInterpreter->GenerateDictionary("std::vector >", "vector;ROOT/RVec.hxx"); + + auto cuts = [](int x, ROOT::RVecI const &y) { return x % 2 == 0 || y[0] == 5; }; + + auto variation = ROOT::RDataFrame(N) + .Define("x", [](ULong64_t e) -> int { return int(e); }, {"rdfentry_"}) + .Define("entry", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Define("dummyBranch", [](ULong64_t e) { return e; }, {"rdfentry_"}) + .Vary( + "x", [](int x) { return ROOT::RVecI{x + 1, x * 3}; }, {"x"}, 2, "xVariation") + .Define("y", [](int x) { return ROOT::RVecI{x, x + 1, x + 2, x + 3}; }, {"x"}) + .Filter(cuts, {"x", "y"}) + .Snapshot("t", filename, {"x", "y", "entry"}, options); + variation.GetPtr(); + + std::unique_ptr file{TFile::Open(filename)}; + auto tree = file->Get("t"); + if (verbose) + printTree(*tree); + + EXPECT_EQ(tree->GetEntries(), N); + EXPECT_EQ(tree->GetNbranches(), 8); + for (const auto branchName : + {"x", "y", "x__xVariation_0", "x__xVariation_0", "y__xVariation_0", "y__xVariation_0", "entry"}) + EXPECT_NE(tree->GetBranch(branchName), nullptr) << branchName; + EXPECT_EQ(tree->GetBranch("dummyBranch"), nullptr); + + for (std::string systematicName : {"", "__xVariation_0", "__xVariation_1"}) { + ULong64_t entry; + int x; + ROOT::RVecI *y = nullptr; + ASSERT_EQ(tree->SetBranchAddress("entry", &entry), TTree::kMatch); + ASSERT_EQ(tree->SetBranchAddress(("x" + systematicName).c_str(), &x), TTree::kMatch) + << ("x" + systematicName); + ASSERT_EQ(tree->SetBranchAddress(("y" + systematicName).c_str(), &y), TTree::kMatch) << ("y" + systematicName); + + for (unsigned int event = 0; event < N; ++event) { + ASSERT_GT(tree->GetEntry(event), 0); + const auto originalX = + (systematicName == "") + ? entry + : ((systematicName.find("xVariation_0") != std::string::npos) ? entry + 1 : entry * 3); + const bool passCuts = (originalX % 2 == 0) || originalX == 5; + + if (passCuts) + EXPECT_EQ(x, originalX) << "sys:'" << systematicName << "' originalX: " << originalX << " event: " << event; + else + EXPECT_EQ(x, 0) << "sys:'" << systematicName << "' originalX: " << originalX << " event: " << event; + + ASSERT_EQ(y->size(), passCuts ? 4 : 0) + << "sys:'" << systematicName << "' entry: " << entry << " originalX: " << originalX << " event: " << event; + for (unsigned int i = 0; i < y->size(); ++i) { + EXPECT_EQ((*y)[i], x + i); + } + } + tree->ResetBranchAddresses(); + } + + file.reset(); + gSystem->Unlink(filename); +} \ No newline at end of file diff --git a/tree/dataframe/test/dataframe_vary.cxx b/tree/dataframe/test/dataframe_vary.cxx index f881329c5a099..85e6cc74d2f90 100644 --- a/tree/dataframe/test/dataframe_vary.cxx +++ b/tree/dataframe/test/dataframe_vary.cxx @@ -1557,23 +1557,6 @@ TEST_P(RDFVary, VaryTake) EXPECT_EQ(sorted(rs["x:1"]), std::vector({1, 2, 3})); } -TEST_P(RDFVary, VarySnapshot) -{ - const auto fname = "dummy.root"; - auto h = ROOT::RDataFrame(10) - .Define("x", [](ULong64_t e) { return int(e); }, {"rdfentry_"}) - .Vary( - "x", [](int x) { return ROOT::RVecI{x - 1, x + 1}; }, {"x"}, 2) - .Snapshot("t", fname, {"x"}); - EXPECT_THROW( - try { VariationsFor(h); } catch (const std::logic_error &err) { - const auto msg = "Varying a Snapshot result is not implemented yet."; - EXPECT_STREQ(err.what(), msg); - throw; - }, - std::logic_error); -} - // this is a regression test, we used to read from wrong addresses in this case TEST_P(RDFVary, MoreVariedColumnsThanVariations) { diff --git a/tutorials/CMakeLists.txt b/tutorials/CMakeLists.txt index 600533d283035..f2f6ab3f5a436 100644 --- a/tutorials/CMakeLists.txt +++ b/tutorials/CMakeLists.txt @@ -1030,10 +1030,8 @@ if(ROOT_pyroot_FOUND) ${py_will_fail}) if(${t} IN_LIST multithreaded) - # Makes sure that this doesn't run in parallel with other multithreaded tutorials, and that cmake doesn't start too - # many other tests. That we use 4 processors is actually a lie, because IMT takes whatever it finds. - # However, even this poor indication of MT behaviour is a good hint for cmake to reduce congestion. - set_tests_properties(${tutorial_name} PROPERTIES RESOURCE_LOCK multithreaded PROCESSORS ${NProcessors}) + # Makes sure that this doesn't run in parallel with too many other multithreaded tutorials. + set_tests_properties(${tutorial_name} PROPERTIES PROCESSORS ${NProcessors}) endif() if(${t} IN_LIST distrdf_spark_tutorials)