Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,20 @@ if(BUILD_CUML_CPP_LIBRARY)
src/decisiontree/batched-levelalgo/kernels/mse-float.cu
src/decisiontree/batched-levelalgo/kernels/poisson-double.cu
src/decisiontree/batched-levelalgo/kernels/poisson-float.cu

src/decisiontree/batched-levelalgo/kernels/entropy-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/entropy-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/gamma-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/gamma-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/gini-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/gini-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/inverse_gaussian-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/inverse_gaussian-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/mse-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/mse-float-honest.cu
src/decisiontree/batched-levelalgo/kernels/poisson-double-honest.cu
src/decisiontree/batched-levelalgo/kernels/poisson-float-honest.cu

src/decisiontree/batched-levelalgo/kernels/quantiles.cu
src/decisiontree/decisiontree.cu)
endif()
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/fil.cu
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/filex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
1.f, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
1, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
8 changes: 6 additions & 2 deletions cpp/bench/sg/rf_classifier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,14 @@ std::vector<Params> getInputs()
(1 << 20), /* max_leaves */
0.3, /* max_features */
32, /* max_n_bins */
3, /* min_samples_leaf */
3, /* min_samples_split */
3, /* min_samples_leaf_splitting */
3, /* min_samples_leaf_averaging */
3, /* min_samples_split_splitting */
3, /* min_samples_split_averaging */
0.0f, /* min_impurity_decrease */
true, /* bootstrap */
false, /* oob_honesty */
true, /* double_bootstrap */
500, /* n_trees */
1.f, /* max_samples */
1234ULL, /* seed */
Expand Down
28 changes: 26 additions & 2 deletions cpp/include/cuml/ensemble/randomforest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,26 @@ struct RF_params {
* tree.
*/
bool bootstrap;

/**
* Control whether to use honesty features to allow causal inference
*
* This indicates that the values used for averaging in the leaf node predictions
* should be a disjoint set with the labels used for splits during training.
* See this issue for more detail: https://github.com/rapidsai/cuml/issues/5253
*/
bool oob_honesty;

/**
* Honesty double bootstrapping
*
* With double bootstrapping, the set of samples that was not sampled for training
* is again sampled with replacement. This leaves some samples that could be used
* for double OOB prediction. TODO: how can we make the user aware of which
* samples could be used for double OOB prediction?
*/
bool double_bootstrap;

/**
* Ratio of dataset rows used while fitting each tree.
*/
Expand Down Expand Up @@ -192,10 +212,14 @@ RF_params set_rf_params(int max_depth,
int max_leaves,
float max_features,
int max_n_bins,
int min_samples_leaf,
int min_samples_split,
int min_samples_leaf_splitting,
int min_samples_leaf_averaging,
int min_samples_split_splitting,
int min_samples_split_averaging,
float min_impurity_decrease,
bool bootstrap,
bool oob_honesty,
bool double_bootstrap,
int n_trees,
float max_samples,
uint64_t seed,
Expand Down
38 changes: 29 additions & 9 deletions cpp/include/cuml/tree/decisiontree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,21 @@ struct DecisionTreeParams {
*/
int max_n_bins;
/**
* The minimum number of samples (rows) in each leaf node.
* The minimum number of splitting samples (rows) in each leaf node.
*/
int min_samples_leaf;
int min_samples_leaf_splitting;
/**
* The minimum number of averaging samples (rows) in each leaf node.
*/
int min_samples_leaf_averaging;
/**
* The minimum number of splitting samples (rows) needed to split an internal node.
*/
int min_samples_split_splitting;
/**
* The minimum number of samples (rows) needed to split an internal node.
* The minimum number of averaging samples (rows) needed to split an internal node.
*/
int min_samples_split;
int min_samples_split_averaging;
/**
* Node split criterion. GINI and Entropy for classification, MSE for regression.
*/
Expand All @@ -66,6 +74,11 @@ struct DecisionTreeParams {
* used only for batched-level algo
*/
int max_batch_size;

/**
* Whether to use oob honesty features
*/
bool oob_honesty;
};

/**
Expand All @@ -75,9 +88,13 @@ struct DecisionTreeParams {
* @param[in] cfg_max_leaves: maximum leaves; default -1
* @param[in] cfg_max_features: maximum number of features; default 1.0f
* @param[in] cfg_max_n_bins: maximum number of bins; default 128
* @param[in] cfg_min_samples_leaf: min. rows in each leaf node; default 1
* @param[in] cfg_min_samples_split: min. rows needed to split an internal node;
* @param[in] cfg_min_samples_leaf_splitting: min. splitting rows in each leaf node; default 1
* @param[in] cfg_min_samples_leaf_averaging: min. averaging rows in each leaf node when
* oob_honesty is enabled; default 1
* @param[in] cfg_min_samples_split_splitting: min. splitting rows needed to split an internal node;
* default 2
* @param[in] cfg_min_samples_split_averaging: min. averaging rows needed to split an internal
* node when oob_honesty is enabled; default 2
* @param[in] cfg_min_impurity_decrease: split a node only if its reduction in
* impurity is more than this value
* @param[in] cfg_split_criterion: split criterion; default CRITERION_END,
Expand All @@ -91,11 +108,14 @@ void set_tree_params(DecisionTreeParams& params,
int cfg_max_leaves = -1,
float cfg_max_features = 1.0f,
int cfg_max_n_bins = 128,
int cfg_min_samples_leaf = 1,
int cfg_min_samples_split = 2,
int cfg_min_samples_leaf_splitting = 1,
int cfg_min_samples_leaf_averaging = 1,
int cfg_min_samples_split_splitting = 2,
int cfg_min_samples_split_averaging = 2,
float cfg_min_impurity_decrease = 0.0f,
CRITERION cfg_split_criterion = CRITERION_END,
int cfg_max_batch_size = 4096);
int cfg_max_batch_size = 4096,
bool cfg_oob_honesty = false);

template <class T, class L>
struct TreeMetaDataNode {
Expand Down
79 changes: 77 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/bins.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct CountBin {
HDI CountBin(int x_) : x(x_) {}
HDI CountBin() : x(0) {}

DI static void IncrementHistogram(CountBin* hist, int n_bins, int b, int label)
DI static void IncrementHistogram(CountBin* hist, int n_bins, int b, int label, bool /*is_split_row*/, bool /*split_op*/)
{
auto offset = label * n_bins + b;
CountBin::AtomicAdd(hist + offset, {1});
Expand All @@ -43,6 +43,41 @@ struct CountBin {
}
};

/**
 * Histogram bin for classification when OOB honesty is enabled.
 *
 * Extends CountBin with a second counter so that rows reserved for
 * splitting (CountBin::x) and rows reserved for leaf averaging
 * (x_averaging) are tallied separately in the same histogram slot.
 */
struct HonestCountBin : CountBin {
  /** Count of averaging rows in this bin; the inherited x holds the splitting count. */
  int x_averaging;

  HonestCountBin(HonestCountBin const&) = default;
  HDI HonestCountBin(int x_train, int x_averaging) : CountBin(x_train), x_averaging(x_averaging) {}
  HDI HonestCountBin() : CountBin(), x_averaging(0) {}

  /**
   * Atomically bump the counter for (label, bin b): the splitting counter
   * when `is_split_row` is true, otherwise the averaging counter. The
   * trailing operation flag is not needed for classification counts.
   */
  DI static void IncrementHistogram(
    HonestCountBin* hist, int n_bins, int b, int label, bool is_split_row, bool /*is_split_op*/)
  {
    auto offset = label * n_bins + b;
    // Use a plain integer literal rather than a braced-init-list: the CUDA
    // atomicAdd intrinsic is overloaded, and `{1}` relies on list-init
    // overload resolution; `1` is unambiguous and matches AtomicAdd below.
    if (is_split_row) {
      atomicAdd(&(hist + offset)->x, 1);
    } else {
      atomicAdd(&(hist + offset)->x_averaging, 1);
    }
  }

  /** Atomically accumulate both counters of `val` into `address`. */
  DI static void AtomicAdd(HonestCountBin* address, HonestCountBin val)
  {
    atomicAdd(&address->x, val.x);
    atomicAdd(&address->x_averaging, val.x_averaging);
  }
  HDI HonestCountBin& operator+=(const HonestCountBin& b)
  {
    CountBin::operator+=(b);
    x_averaging += b.x_averaging;
    return *this;
  }
  HDI HonestCountBin operator+(HonestCountBin b) const
  {
    b += *this;
    return b;
  }
};

struct AggregateBin {
double label_sum;
int count;
Expand All @@ -51,7 +86,7 @@ struct AggregateBin {
HDI AggregateBin() : label_sum(0.0), count(0) {}
HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {}

DI static void IncrementHistogram(AggregateBin* hist, int n_bins, int b, double label)
DI static void IncrementHistogram(AggregateBin* hist, int n_bins, int b, double label, bool, bool)
{
AggregateBin::AtomicAdd(hist + b, {label, 1});
}
Expand All @@ -72,5 +107,45 @@ struct AggregateBin {
return b;
}
};

/**
 * Histogram bin for regression when OOB honesty is enabled.
 *
 * Extends AggregateBin with a separate row counter for averaging rows;
 * the inherited `count` tracks splitting rows, while `label_sum`
 * accumulates labels only for rows whose role matches the current
 * operation (split search vs. leaf averaging).
 */
struct HonestAggregateBin : AggregateBin {
  /** Number of averaging rows that landed in this bin. */
  int count_averaging;

  HonestAggregateBin(HonestAggregateBin const&) = default;
  HDI HonestAggregateBin() : AggregateBin(), count_averaging(0) {}
  HDI HonestAggregateBin(double label_sum, int count, int count_averaging)
    : AggregateBin(label_sum, count), count_averaging(count_averaging)
  {}

  /**
   * Atomically record one row in bin `b`: bump the splitting or averaging
   * row count based on `is_split_row`, and fold in `label` only when the
   * row's role matches the operation being performed.
   */
  DI static void IncrementHistogram(
    HonestAggregateBin* hist, int n_bins, int b, double label, bool is_split_row, bool is_split_op)
  {
    HonestAggregateBin* address = hist + b;
    const int train_incr        = static_cast<int>(is_split_row);
    const int avg_incr          = 1 - train_incr;

    // Accumulate the label only when (split row AND split op) or
    // (averaging row AND averaging op). Spelled with `==` instead of the
    // alternative tokens `not`/`xor`, which are not accepted by every
    // compiler in its default mode (e.g. MSVC without /permissive-).
    const double label_incr = (is_split_row == is_split_op) ? label : 0.0;
    HonestAggregateBin::AtomicAdd(address, {label_incr, train_incr, avg_incr});
  }
  /** Atomically accumulate all three fields of `val` into `address`. */
  DI static void AtomicAdd(HonestAggregateBin* address, HonestAggregateBin val)
  {
    atomicAdd(&address->label_sum, val.label_sum);
    atomicAdd(&address->count, val.count);
    atomicAdd(&address->count_averaging, val.count_averaging);
  }
  HDI HonestAggregateBin& operator+=(const HonestAggregateBin& b)
  {
    AggregateBin::operator+=(b);
    count_averaging += b.count_averaging;
    return *this;
  }
  HDI HonestAggregateBin operator+(HonestAggregateBin b) const
  {
    b += *this;
    return b;
  }
};

} // namespace DT
} // namespace ML
Loading