From 6ca16e6bc57dcfe7857e30268d23a0ac5db82542 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Wed, 12 Nov 2025 19:47:22 +0100 Subject: [PATCH 1/2] [df] Add more docs to the Snapshot with variations section --- tree/dataframe/src/RDataFrame.cxx | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tree/dataframe/src/RDataFrame.cxx b/tree/dataframe/src/RDataFrame.cxx index 7e6fa33f5b1d1..153f7d016f500 100644 --- a/tree/dataframe/src/RDataFrame.cxx +++ b/tree/dataframe/src/RDataFrame.cxx @@ -1265,10 +1265,28 @@ In that case, RDataFrame will snapshot the filtered columns in a memory-efficien default-constructed object in case of classes. If none of the filters pass like in row 6, the entire event is omitted from the snapshot. To tell apart a genuine `0` (like `x` in row 0) from a variation that didn't pass the selection, RDataFrame writes a bitmask for each event, indicating which variations -are valid (see last column). A mapping of column names to this bitmask is placed in the same file as the output dataset, and automatically loaded when -RDataFrame opens a file that was snapshot with variations. -Attempting to read such missing values with RDataFrame will produce an error, but RDataFrame can either skip these values or fill in defaults as -described in the \ref missing-values "section on dealing with missing values". +are valid (see last column). The bitmask is implemented as a 64-bit `std::bitset` in memory, written to the output +dataset as a `std::uint64_t`. For every 64 columns, a new bitmask column is added to the output dataset. + +Each column that might contain invalid values is connected to exactly one bit in one bitmask. A mapping of column names +to the corresponding bitmask is placed in the same file as the output dataset, with a name that follows the pattern +`"R_rdf_branchToBitmaskMapping_"`. 
It is of type +`std::unordered_map>`, and maps a column name to the name of the +bitmask column and the index of the relevant bit. For example, in the same file as the dataset "Events" there would be +an object named `R_rdf_branchToBitmaskMapping_Events`. This object for example would describe a connection such as: + +~~~ +muon_pt --> (R_rdf_mask_Events_0, 42) +~~~ + +which means that the validity of the entries in `muon_pt` is established by the bit `42` in the bitmask found in the +column `R_rdf_mask_Events_0`. + +When RDataFrame opens a file, it checks for the existence of this mapping between columns and bitmasks, and loads it automatically if found. As such, +RDataFrame makes the treatment of the various bitmasks completely transparent to the user. + +In case certain values are labeled invalid by the corresponding bit, this will result in reading a missing value. The semantics of such a scenario follow the +rules described in the \ref missing-values "section on dealing with missing values" and can be dealt with accordingly. \note Snapshot with variations is currently restricted to single-threaded TTree snapshots. @@ -1780,6 +1798,9 @@ more of its entries. For example: - When joining different datasets horizontally according to some index value (e.g. the event number), if the index does not find a match in one or more other datasets for a certain entry. +- If, for a certain event, a column is invalid because it results from a Snapshot + with systematic variations, and that variation didn't pass its filters. For + more details, see \ref snapshot-with-variations. 
For example, suppose that column "y" does not have a value for entry 42: From 270fbb9438dfb84323e807d6e3ccbb69ef81a610 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Thu, 13 Nov 2025 21:01:29 +0100 Subject: [PATCH 2/2] [df] Improve naming of column to bitmask mapping The object saved together with the output dataset by Snapshot with variations now features `column` rather than `branch` in the name, to be more generic. We also avoid mixing snake_case with CamelCase in the name, opting for the first. --- tree/dataframe/src/RDFSnapshotHelpers.cxx | 2 +- tree/dataframe/src/RDataFrame.cxx | 4 ++-- tree/dataframe/src/RTTreeDS.cxx | 2 +- tree/dataframe/test/dataframe_snapshotWithVariations.cxx | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 02f1f2e438882..ec06e521cb935 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -1035,7 +1035,7 @@ struct SnapshotOutputWriter { { if (!fBranchToBitmaskMapping.empty()) { fFile->WriteObject(&fBranchToBitmaskMapping, - (std::string{"R_rdf_branchToBitmaskMapping_"} + fTree->GetName()).c_str()); + (std::string{"R_rdf_column_to_bitmask_mapping_"} + fTree->GetName()).c_str()); } if (fTree) { // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory diff --git a/tree/dataframe/src/RDataFrame.cxx b/tree/dataframe/src/RDataFrame.cxx index 153f7d016f500..45bc4e7524115 100644 --- a/tree/dataframe/src/RDataFrame.cxx +++ b/tree/dataframe/src/RDataFrame.cxx @@ -1270,10 +1270,10 @@ dataset as a `std::uin64_t`. For every 64 columns, a new bitmask column is added Each column that might contain invalid values is connected to exactly one bit in one bitmask. 
A mapping of column names to the corresponding bitmask is placed in the same file as the output dataset, with a name that follows the pattern -`"R_rdf_branchToBitmaskMapping_"`. It is of type +`"R_rdf_column_to_bitmask_mapping_"`. It is of type `std::unordered_map>`, and maps a column name to the name of the bitmask column and the index of the relevant bit. For example, in the same file as the dataset "Events" there would be -an object named `R_rdf_branchToBitmaskMapping_Events`. This object for example would describe a connection such as: +an object named `R_rdf_column_to_bitmask_mapping_Events`. This object for example would describe a connection such as: ~~~ muon_pt --> (R_rdf_mask_Events_0, 42) diff --git a/tree/dataframe/src/RTTreeDS.cxx b/tree/dataframe/src/RTTreeDS.cxx index 502abfbddcc2d..824ff39c565a4 100644 --- a/tree/dataframe/src/RTTreeDS.cxx +++ b/tree/dataframe/src/RTTreeDS.cxx @@ -522,7 +522,7 @@ ROOT::Internal::RDF::RTTreeDS::CreateColumnReader(unsigned int /*slot*/, std::st if (TDirectory *treeDir = treeReader->GetTree()->GetDirectory(); treeDir) { using Map_t = std::unordered_map>; const std::string bitmaskMapName = - std::string{"R_rdf_branchToBitmaskMapping_"} + treeReader->GetTree()->GetName(); + std::string{"R_rdf_column_to_bitmask_mapping_"} + treeReader->GetTree()->GetName(); if (Map_t const *columnMaskMap = treeDir->Get(bitmaskMapName.c_str()); columnMaskMap) { if (auto it = columnMaskMap->find(std::string(col)); it != columnMaskMap->end()) { colReader = std::make_unique(*treeReader, std::move(colReader), it->second.first, diff --git a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx index a4ef10fb390ca..7063183e92c8d 100644 --- a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx +++ b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx @@ -219,7 +219,7 @@ TEST(RDFVarySnapshot, Bitmask) ASSERT_NE(branch, nullptr); auto *branchToIndexMap = file.Get>>( - 
("R_rdf_branchToBitmaskMapping_" + treename).c_str()); + ("R_rdf_column_to_bitmask_mapping_" + treename).c_str()); ASSERT_NE(branchToIndexMap, nullptr); for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) { ASSERT_NE(branchToIndexMap->find(branchName), branchToIndexMap->end()); @@ -339,8 +339,8 @@ TEST(RDFVarySnapshot, TwoVariationsInSameFile) auto snap2 = rdf.Filter(cuts2, {"x", "y"}).Snapshot(treename2, filename, {"x", "y"}, options); std::unique_ptr file{TFile::Open(filename)}; - EXPECT_NE(file->GetKey(("R_rdf_branchToBitmaskMapping_" + treename1).c_str()), nullptr); - EXPECT_NE(file->GetKey(("R_rdf_branchToBitmaskMapping_" + treename2).c_str()), nullptr); + EXPECT_NE(file->GetKey(("R_rdf_column_to_bitmask_mapping_" + treename1).c_str()), nullptr); + EXPECT_NE(file->GetKey(("R_rdf_column_to_bitmask_mapping_" + treename2).c_str()), nullptr); // In Windows, an exception is thrown as expected, but it cannot be caught for the time being: #if !defined(_MSC_VER) || defined(R__ENABLE_BROKEN_WIN_TESTS)