Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,20 @@ if(BUILD_CUML_CPP_LIBRARY)
src/decisiontree/batched-levelalgo/kernels/mse-float.cu
src/decisiontree/batched-levelalgo/kernels/poisson-double.cu
src/decisiontree/batched-levelalgo/kernels/poisson-float.cu

src/decisiontree/batched-levelalgo/kernels/entropy-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/entropy-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/gamma-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/gamma-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/gini-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/gini-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/inverse_gaussian-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/inverse_gaussian-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/mse-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/mse-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/poisson-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/poisson-float-honest.cu

src/decisiontree/batched-levelalgo/kernels/quantiles.cu
src/decisiontree/decisiontree.cu)
endif()
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/fil.cu
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/filex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/rf_classifier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
28 changes: 26 additions & 2 deletions cpp/include/cuml/ensemble/randomforest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,26 @@ struct RF_params {
* tree.
*/
bool bootstrap;

/**
* Control whether to use honesty features to allow causal inference
*
* This indicates that the values used for averaging in the leaf node predictions
* should be a disjoint set with the labels used for splits during training.
* See this issue for more detail: https://github.com/rapidsai/cuml/issues/5253
*/
bool oob_honesty;

/**
* Honesty double bootstrapping
*
* With double bootstrapping, the set of samples that was not sampled for training
* is again sampled with replacement. This leaves some samples that could be used
* for double OOB prediction. TODO: how can we make the user aware of which
* samples could be used for double OOB prediction?
*/
bool double_bootstrap;

/**
* Ratio of dataset rows used while fitting each tree.
*/
Expand Down Expand Up @@ -192,10 +212,14 @@ RF_params set_rf_params(int max_depth,
int max_leaves,
float max_features,
int max_n_bins,
int min_samples_leaf,
int min_samples_split,
int min_samples_leaf_splitting,
int min_samples_leaf_averaging,
int min_samples_split_splitting,
int min_samples_split_averaging,
float min_impurity_decrease,
bool bootstrap,
bool oob_honesty,
bool double_bootstrap,
int n_trees,
float max_samples,
uint64_t seed,
Expand Down
38 changes: 29 additions & 9 deletions cpp/include/cuml/tree/decisiontree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,21 @@ struct DecisionTreeParams {
*/
int max_n_bins;
/**
* The minimum number of samples (rows) in each leaf node.
* The minimum number of splitting samples (rows) in each leaf node.
*/
int min_samples_leaf;
int min_samples_leaf_splitting;
/**
* The minimum number of averaging samples (rows) in each leaf node.
*/
int min_samples_leaf_averaging;
/**
* The minimum number of splitting samples (rows) needed to split an internal node.
*/
int min_samples_split_splitting;
/**
* The minimum number of samples (rows) needed to split an internal node.
* The minimum number of averaging samples (rows) needed to split an internal node.
*/
int min_samples_split;
int min_samples_split_averaging;
/**
* Node split criterion. GINI and Entropy for classification, MSE for regression.
*/
Expand All @@ -66,6 +74,11 @@ struct DecisionTreeParams {
* used only for batched-level algo
*/
int max_batch_size;

/**
* Whether to use oob honesty features
*/
bool oob_honesty;
};

/**
Expand All @@ -75,9 +88,13 @@ struct DecisionTreeParams {
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_max_n_bins: maximum number of bins; default 128
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* @param[in] cfg_min_samples_leaf_splitting: min. splitting rows in each leaf node; default 1
* @param[in] cfg_min_samples_leaf_averaging: min. averaging rows in each leaf node when
* oob_honesty is enabled; default 1
* @param[in] cfg_min_samples_split_splitting: min. splitting rows needed to split an internal node;
* default 2
* @param[in] cfg_min_samples_split_averaging: min. averaging rows needed to split an internal
* node when oob_honesty is enabled; default 2
* @param[in] cfg_min_impurity_decrease: split a node only if its reduction in
* impurity is more than this value
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
Expand All @@ -91,11 +108,14 @@ void set_tree_params(DecisionTreeParams& params,
int cfg_max_leaves = -1,
float cfg_max_features = 1.0f,
int cfg_max_n_bins = 128,
int cfg_min_samples_leaf = 1,
int cfg_min_samples_split = 2,
int cfg_min_samples_leaf_splitting = 1,
int cfg_min_samples_leaf_averaging = 1,
int cfg_min_samples_split_splitting = 2,
int cfg_min_samples_split_averaging = 2,
float cfg_min_impurity_decrease = 0.0f,
CRITERION cfg_split_criterion = CRITERION_END,
int cfg_max_batch_size = 4096);
int cfg_max_batch_size = 4096,
bool cfg_oob_honesty = false);

template <class T, class L>
struct TreeMetaDataNode {
Expand Down
79 changes: 77 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/bins.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct CountBin {
HDI CountBin(int x_) : x(x_) {}
HDI CountBin() : x(0) {}

DI static void IncrementHistogram(CountBin* hist, int n_bins, int b, int label)
DI static void IncrementHistogram(CountBin* hist, int n_bins, int b, int label, bool /*is_split_row*/, bool /*split_op*/)
{
auto offset = label * n_bins + b;
CountBin::AtomicAdd(hist + offset, {1});
Expand All @@ -43,6 +43,41 @@ struct CountBin {
}
};

/**
 * Histogram bin for classification when OOB honesty is enabled.
 *
 * Extends CountBin with a second counter so that rows reserved for
 * splitting (CountBin::x) and rows reserved for leaf averaging
 * (x_averaging) are tallied separately in the same histogram slot.
 */
struct HonestCountBin : CountBin {
  /** Count of averaging rows in this bin; the inherited x holds the splitting count. */
  int x_averaging;

  HonestCountBin(HonestCountBin const&) = default;
  HDI HonestCountBin(int x_train, int x_averaging) : CountBin(x_train), x_averaging(x_averaging) {}
  HDI HonestCountBin() : CountBin(), x_averaging(0) {}

  /**
   * Atomically bump the counter for (label, bin b): the splitting counter
   * when `is_split_row` is true, otherwise the averaging counter. The
   * trailing operation flag is not needed for classification counts.
   */
  DI static void IncrementHistogram(
    HonestCountBin* hist, int n_bins, int b, int label, bool is_split_row, bool /*is_split_op*/)
  {
    auto offset = label * n_bins + b;
    // Use a plain integer literal rather than a braced-init-list: the CUDA
    // atomicAdd intrinsic is overloaded, and `{1}` relies on list-init
    // overload resolution; `1` is unambiguous and matches AtomicAdd below.
    if (is_split_row) {
      atomicAdd(&(hist + offset)->x, 1);
    } else {
      atomicAdd(&(hist + offset)->x_averaging, 1);
    }
  }

  /** Atomically accumulate both counters of `val` into `address`. */
  DI static void AtomicAdd(HonestCountBin* address, HonestCountBin val)
  {
    atomicAdd(&address->x, val.x);
    atomicAdd(&address->x_averaging, val.x_averaging);
  }
  HDI HonestCountBin& operator+=(const HonestCountBin& b)
  {
    CountBin::operator+=(b);
    x_averaging += b.x_averaging;
    return *this;
  }
  HDI HonestCountBin operator+(HonestCountBin b) const
  {
    b += *this;
    return b;
  }
};

struct AggregateBin {
double label_sum;
int count;
Expand All @@ -51,7 +86,7 @@ struct AggregateBin {
HDI AggregateBin() : label_sum(0.0), count(0) {}
HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {}

DI static void IncrementHistogram(AggregateBin* hist, int n_bins, int b, double label)
DI static void IncrementHistogram(AggregateBin* hist, int n_bins, int b, double label, bool, bool)
{
AggregateBin::AtomicAdd(hist + b, {label, 1});
}
Expand All @@ -72,5 +107,45 @@ struct AggregateBin {
return b;
}
};

/**
 * Histogram bin for regression when OOB honesty is enabled.
 *
 * Extends AggregateBin with a separate row counter for averaging rows;
 * the inherited `count` tracks splitting rows, while `label_sum`
 * accumulates labels only for rows whose role matches the current
 * operation (split search vs. leaf averaging).
 */
struct HonestAggregateBin : AggregateBin {
  /** Number of averaging rows that landed in this bin. */
  int count_averaging;

  HonestAggregateBin(HonestAggregateBin const&) = default;
  HDI HonestAggregateBin() : AggregateBin(), count_averaging(0) {}
  HDI HonestAggregateBin(double label_sum, int count, int count_averaging)
    : AggregateBin(label_sum, count), count_averaging(count_averaging)
  {}

  /**
   * Atomically record one row in bin `b`: bump the splitting or averaging
   * row count based on `is_split_row`, and fold in `label` only when the
   * row's role matches the operation being performed.
   */
  DI static void IncrementHistogram(
    HonestAggregateBin* hist, int n_bins, int b, double label, bool is_split_row, bool is_split_op)
  {
    HonestAggregateBin* address = hist + b;
    const int train_incr        = static_cast<int>(is_split_row);
    const int avg_incr          = 1 - train_incr;

    // Accumulate the label only when (split row AND split op) or
    // (averaging row AND averaging op). Spelled with `==` instead of the
    // alternative tokens `not`/`xor`, which are not accepted by every
    // compiler in its default mode (e.g. MSVC without /permissive-).
    const double label_incr = (is_split_row == is_split_op) ? label : 0.0;
    HonestAggregateBin::AtomicAdd(address, {label_incr, train_incr, avg_incr});
  }
  /** Atomically accumulate all three fields of `val` into `address`. */
  DI static void AtomicAdd(HonestAggregateBin* address, HonestAggregateBin val)
  {
    atomicAdd(&address->label_sum, val.label_sum);
    atomicAdd(&address->count, val.count);
    atomicAdd(&address->count_averaging, val.count_averaging);
  }
  HDI HonestAggregateBin& operator+=(const HonestAggregateBin& b)
  {
    AggregateBin::operator+=(b);
    count_averaging += b.count_averaging;
    return *this;
  }
  HDI HonestAggregateBin operator+(HonestAggregateBin b) const
  {
    b += *this;
    return b;
  }
};

} // namespace DT
} // namespace ML
Loading