
Commit 54a3f9d

Merge pull request #5356 from adamnsch/dt-opt
Improve decision tree training performance on datasets with many classes
2 parents dcc7b98 + 8f69730, commit 54a3f9d

8 files changed: +37 -50 lines

ml/ml-algo/src/main/java/org/neo4j/gds/ml/decisiontree/Splitter.java

Lines changed: 11 additions & 24 deletions
@@ -32,8 +32,6 @@ public class Splitter {
     private final FeatureBagger featureBagger;
     private final int minLeafSize;
     private final HugeLongArray sortCache;
-    private final ImpurityCriterion.ImpurityData bestLeftImpurityDataForIdx;
-    private final ImpurityCriterion.ImpurityData bestRightImpurityDataForIdx;
     private final ImpurityCriterion.ImpurityData rightImpurityData;
 
     Splitter(long trainSetSize, ImpurityCriterion impurityCriterion, FeatureBagger featureBagger, Features features, int minLeafSize) {
@@ -42,8 +40,6 @@ public class Splitter {
         this.features = features;
         this.minLeafSize = minLeafSize;
         this.sortCache = HugeLongArray.newArray(trainSetSize);
-        this.bestLeftImpurityDataForIdx = impurityCriterion.groupImpurity(HugeLongArray.of(), 0, 0);
-        this.bestRightImpurityDataForIdx = impurityCriterion.groupImpurity(HugeLongArray.of(), 0, 0);
         this.rightImpurityData = impurityCriterion.groupImpurity(HugeLongArray.of(), 0, 0);
     }
 
@@ -52,7 +48,7 @@ static long memoryEstimation(long numberOfTrainingSamples, long sizeOfImpurityDa
             // sort cache
             + HugeLongArray.memoryEstimation(numberOfTrainingSamples)
             // impurity data cache
-            + 6 * sizeOfImpurityData
+            + 4 * sizeOfImpurityData
             // group cache
             + 4 * HugeLongArray.memoryEstimation(numberOfTrainingSamples);
     }
@@ -77,10 +73,6 @@ DecisionTreeTrainer.Split findBestSplit(Group group) {
         int[] featureBag = featureBagger.sample();
 
         for (int featureIdx : featureBag) {
-            double bestImpurityForIdx = Double.MAX_VALUE;
-            double bestValueForIdx = Double.MAX_VALUE;
-            long bestLeftGroupSizeForIdx = -1;
-
             // By doing a sort of the group by this particular feature, all possible splits will simply be represented
             // by each index in the ordered group.
             HugeSerialIndirectMergeSort.sort(rightChildArray, group.size(), (long l) -> features.get(l)[featureIdx], sortCache);
@@ -99,6 +91,7 @@ DecisionTreeTrainer.Split findBestSplit(Group group) {
             }
 
             var leftImpurityData = impurityCriterion.groupImpurity(leftChildArray, 0, minLeafSize - 1L);
+            boolean foundImprovementWithIdx = false;
 
             // Continue moving feature vectors, but now actually compute combined impurity since left group is large enough.
             for (long leftGroupSize = minLeafSize; leftGroupSize <= group.size() - minLeafSize; leftGroupSize++) {
@@ -112,24 +105,18 @@ DecisionTreeTrainer.Split findBestSplit(Group group) {
 
                 // We track best split for a single feature idx in order to keep using `leftChildArray` and `rightChildArray`
                 // throughout search for splits for this particular idx.
-                if (combinedImpurity < bestImpurityForIdx) {
-                    bestValueForIdx = features.get(splittingFeatureVectorIdx)[featureIdx];
-                    bestImpurityForIdx = combinedImpurity;
-                    leftImpurityData.copyTo(bestLeftImpurityDataForIdx);
-                    rightImpurityData.copyTo(bestRightImpurityDataForIdx);
-                    bestLeftGroupSizeForIdx = leftGroupSize;
+                if (combinedImpurity < bestImpurity) {
+                    foundImprovementWithIdx = true;
+                    bestIdx = featureIdx;
+                    bestValue = features.get(splittingFeatureVectorIdx)[featureIdx];
+                    bestImpurity = combinedImpurity;
+                    bestLeftGroupSize = leftGroupSize;
+                    leftImpurityData.copyTo(bestLeftImpurityData);
+                    rightImpurityData.copyTo(bestRightImpurityData);
                 }
             }
 
-            if (bestImpurityForIdx < bestImpurity) {
-                bestIdx = featureIdx;
-                bestValue = bestValueForIdx;
-                bestImpurity = bestImpurityForIdx;
-                bestLeftGroupSize = bestLeftGroupSizeForIdx;
-
-                bestLeftImpurityDataForIdx.copyTo(bestLeftImpurityData);
-                bestRightImpurityDataForIdx.copyTo(bestRightImpurityData);
-
+            if (foundImprovementWithIdx) {
                 // At this time it's fine to swap array pointers since we will have to do a resort for the next feature
                 // anyway.
                 var tmpChildArray = bestRightChildArray;

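The change above removes the per-feature-index best-split bookkeeping: candidate splits now compare against and update the tree-wide best directly, and a `foundImprovementWithIdx` flag records whether the current feature produced any improvement so that the cached child arrays can still be swapped afterwards. The payoff is fewer `ImpurityData` copies and two fewer cached `ImpurityData` instances per `Splitter` (hence `6 * sizeOfImpurityData` becoming `4 * sizeOfImpurityData` in the memory estimation). Because a classification impurity histogram grows with the number of classes, the savings are largest on datasets with many classes. The sketch below is a simplified, hypothetical illustration of that pattern, not the GDS `Splitter` itself; the `ImpurityData` stand-in and the `maybeUpdateBest` helper are invented here for clarity.

// A minimal, self-contained sketch (hypothetical names, not the org.neo4j.gds classes)
// of the split-search pattern after this commit: improvements update the tree-wide best
// directly, and the large class-count histograms are copied only on an actual improvement.
final class BestSplitSketch {

    // Stand-in for ImpurityCriterion.ImpurityData: a per-class count histogram whose
    // size grows with the number of classes, which is what makes copies expensive.
    static final class ImpurityData {
        final long[] classCounts;
        double impurity;

        ImpurityData(int numberOfClasses) {
            this.classCounts = new long[numberOfClasses];
        }

        void copyTo(ImpurityData other) {
            System.arraycopy(classCounts, 0, other.classCounts, 0, classCounts.length);
            other.impurity = impurity;
        }
    }

    double bestImpurity = Double.MAX_VALUE;
    int bestIdx = -1;
    double bestValue = Double.MAX_VALUE;
    long bestLeftGroupSize = -1;
    final ImpurityData bestLeftImpurityData;
    final ImpurityData bestRightImpurityData;

    BestSplitSketch(int numberOfClasses) {
        this.bestLeftImpurityData = new ImpurityData(numberOfClasses);
        this.bestRightImpurityData = new ImpurityData(numberOfClasses);
    }

    // Called once per candidate split while scanning one feature. Returns true when the
    // tree-wide best improved, so the caller can set its foundImprovementWithIdx flag and
    // later swap its cached child arrays for this feature.
    boolean maybeUpdateBest(
        int featureIdx,
        double splitValue,
        long leftGroupSize,
        double combinedImpurity,
        ImpurityData leftImpurityData,
        ImpurityData rightImpurityData
    ) {
        if (combinedImpurity >= bestImpurity) {
            return false; // common path: no histogram copies at all
        }
        bestIdx = featureIdx;
        bestValue = splitValue;
        bestImpurity = combinedImpurity;
        bestLeftGroupSize = leftGroupSize;
        leftImpurityData.copyTo(bestLeftImpurityData);   // copies happen only on improvement
        rightImpurityData.copyTo(bestRightImpurityData);
        return true;
    }
}

Previously, every candidate that improved on the per-feature best paid two histogram copies into the per-feature buffers, and every feature that beat the tree-wide best paid two more copies at the end; now copies happen only when the tree-wide best actually improves, which can only be as often as before and is usually much rarer.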
ml/ml-algo/src/test/java/org/neo4j/gds/ml/decisiontree/DecisionTreeClassifierTest.java

Lines changed: 3 additions & 3 deletions
@@ -244,10 +244,10 @@ void estimateDecisionTree(
     @ParameterizedTest
     @CsvSource(value = {
         // Scales with training set size even if maxDepth limits tree size.
-        " 6, 1_000, 41_272, 56_032",
-        " 6, 10_000, 401_272, 488_032",
+        " 6, 1_000, 41_008, 55_768",
+        " 6, 10_000, 401_008, 487_768",
         // Scales with maxDepth when maxDepth is limiting tree size.
-        " 20, 10_000, 401_272, 1_443_712",
+        " 20, 10_000, 401_008, 1_443_448",
     })
     void trainMemoryEstimation(int maxDepth, long numberOfTrainingSamples, long expectedMin, long expectedMax) {
         var config = DecisionTreeTrainerConfigImpl.builder()

ml/ml-algo/src/test/java/org/neo4j/gds/ml/decisiontree/DecisionTreeRegressorTest.java

Lines changed: 3 additions & 3 deletions
@@ -227,10 +227,10 @@ void considersMinLeafSize() {
     @ParameterizedTest
     @CsvSource(value = {
         // Scales with training set size even if maxDepth limits tree size.
-        " 6, 1_000, 40_704, 55_968",
-        " 6, 10_000, 400_704, 487_968",
+        " 6, 1_000, 40_600, 55_864",
+        " 6, 10_000, 400_600, 487_864",
         // Scales with maxDepth when maxDepth is limiting tree size.
-        " 20, 10_000, 400_704, 1_523_136",
+        " 20, 10_000, 400_600, 1_523_032",
     })
     void trainMemoryEstimation(int maxDepth, long numberOfTrainingSamples, long expectedMin, long expectedMax) {
         var config = DecisionTreeTrainerConfigImpl.builder()

ml/ml-algo/src/test/java/org/neo4j/gds/ml/decisiontree/SplitterTest.java

Lines changed: 3 additions & 3 deletions
@@ -195,10 +195,10 @@ void shouldFindBestSplit(
     @ParameterizedTest
     @CsvSource(value = {
         // Scales with training set size.
-        " 1_000, 20, 40_368",
-        " 10_000, 20, 400_368",
+        " 1_000, 20, 40_320",
+        " 10_000, 20, 400_320",
         // Changes a little with impurity data size.
-        " 1_000, 100, 40_848",
+        " 1_000, 100, 40_640",
     })
     void memoryEstimation(long numberOfTrainingSamples, long sizeOfImpurityData, long expectedSize) {
         long size = Splitter.memoryEstimation(numberOfTrainingSamples, sizeOfImpurityData);

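The updated expectations are consistent with the memoryEstimation change: each drops by 2 × sizeOfImpurityData plus 8 bytes (40_368 − 40_320 = 48 = 2 × 20 + 8; 40_848 − 40_640 = 208 = 2 × 100 + 8). The two multiples of sizeOfImpurityData correspond to the impurity data cache term changing from 6 to 4; the remaining 8 bytes presumably come from the Splitter instance-size term (not shown in this diff) shrinking by two object references, which is an inference rather than something visible in the change.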
ml/ml-algo/src/test/java/org/neo4j/gds/ml/models/randomforest/RandomForestClassifierTest.java

Lines changed: 7 additions & 7 deletions
@@ -270,19 +270,19 @@ void predictOverheadMemoryEstimation(
 
     @ParameterizedTest
     @CsvSource(value = {
-        " 6, 100_000, 10, 10, 1, 1, 0.1, 1.0, 5_214_370, 6_027_194",
+        " 6, 100_000, 10, 10, 1, 1, 0.1, 1.0, 5_214_106, 6_026_930",
         // Should increase fairly little with more trees if training set big.
-        " 10, 100_000, 10, 10, 1, 10, 0.1, 1.0, 5_215_018, 7_096_578",
+        " 10, 100_000, 10, 10, 1, 10, 0.1, 1.0, 5_214_754, 7_096_314",
         // Should be capped by number of training examples, despite high max depth.
-        " 8_000, 500, 10, 10, 1, 1, 0.1, 1.0, 27_930, 187_730",
+        " 8_000, 500, 10, 10, 1, 1, 0.1, 1.0, 27_666, 187_466",
         // Should increase very little when having more classes.
-        " 10, 100_000, 100, 10, 1, 10, 0.1, 1.0, 5_220_058, 7_101_618",
+        " 10, 100_000, 100, 10, 1, 10, 0.1, 1.0, 5_218_354, 7_099_914",
         // Should increase very little when using more features for splits.
-        " 10, 100_000, 100, 10, 1, 10, 0.9, 1.0, 5_220_098, 7_101_750",
+        " 10, 100_000, 100, 10, 1, 10, 0.9, 1.0, 5_218_394, 7_100_046",
         // Should decrease a lot when sampling fewer training examples per tree.
-        " 10, 100_000, 100, 10, 1, 10, 0.1, 0.2, 1_370_058, 2_611_618",
+        " 10, 100_000, 100, 10, 1, 10, 0.1, 0.2, 1_368_354, 2_609_914",
         // Should almost be x4 when concurrency * 4.
-        " 10, 100_000, 100, 10, 4, 10, 0.1, 1.0, 19_677_808, 24_257_808",
+        " 10, 100_000, 100, 10, 4, 10, 0.1, 1.0, 19_670_992, 24_250_992",
     })
     void trainMemoryEstimation(
         int maxDepth,

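Note that the rows which appear to use 100 classes (third parameter) drop by noticeably more than the 10-class rows: for example 5_215_018 − 5_214_754 = 264 with 10 classes versus 5_220_058 − 5_218_354 = 1_704 with 100 classes, and the concurrency-4 row drops by 4 × 1_704 = 6_816. This matches the commit's intent: the two cached impurity-data instances that were removed are per-class histograms, so the saving, like the speedup, grows with the number of classes.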
ml/ml-algo/src/test/java/org/neo4j/gds/ml/models/randomforest/RandomForestRegressorTest.java

Lines changed: 6 additions & 6 deletions
@@ -166,17 +166,17 @@ void predictOverheadMemoryEstimation() {
 
     @ParameterizedTest
     @CsvSource(value = {
-        " 6, 100_000, 10, 1, 1, 0.1, 1.0, 4_813_754, 5_627_586",
+        " 6, 100_000, 10, 1, 1, 0.1, 1.0, 4_813_650, 5_627_482",
         // Should increase fairly little with more trees if training set big.
-        " 10, 100_000, 10, 1, 10, 0.1, 1.0, 4_814_474, 6_786_058",
+        " 10, 100_000, 10, 1, 10, 0.1, 1.0, 4_814_370, 6_785_954",
         // Should be capped by number of training examples, despite high max depth.
-        " 8_000, 500, 10, 1, 1, 0.1, 1.0, 25_314, 193_098",
+        " 8_000, 500, 10, 1, 1, 0.1, 1.0, 25_210, 192_994",
         // Should increase very little when using more features for splits.
-        " 10, 100_000, 10, 1, 10, 0.9, 1.0, 4_814_514, 6_786_190",
+        " 10, 100_000, 10, 1, 10, 0.9, 1.0, 4_814_410, 6_786_086",
         // Should decrease a lot when sampling fewer training examples per tree.
-        " 10, 100_000, 10, 1, 10, 0.1, 0.2, 964_474, 2_296_058",
+        " 10, 100_000, 10, 1, 10, 0.1, 0.2, 964_370, 2_295_954",
         // Should almost be x4 when concurrency * 4.
-        " 10, 100_000, 10, 4, 10, 0.1, 1.0, 19_255_376, 23_949_952",
+        " 10, 100_000, 10, 4, 10, 0.1, 1.0, 19_254_960, 23_949_536",
     })
     void trainMemoryEstimation(
         int maxDepth,

pipeline/src/test/java/org/neo4j/gds/ml/pipeline/linkPipeline/train/LinkPredictionTrainTest.java

Lines changed: 3 additions & 3 deletions
@@ -180,15 +180,15 @@ static Stream<Arguments> paramsForEstimationsWithParamSpace() {
                     .build()
                     .toTunableConfig()
             ),
-            MemoryRange.of(66_896, 899_376)
+            MemoryRange.of(66_352, 898_832)
         ),
         Arguments.of(
             "Default RF and default LR",
             List.of(
                 LogisticRegressionTrainConfig.DEFAULT.toTunableConfig(),
                 RandomForestClassifierTrainerConfig.DEFAULT.toTunableConfig()
             ),
-            MemoryRange.of(73_976, 2_738_824)
+            MemoryRange.of(73_432, 2_738_280)
         ),
         Arguments.of(
             "Default RF and default LR with range",
@@ -199,7 +199,7 @@ static Stream<Arguments> paramsForEstimationsWithParamSpace() {
                 ),
                 RandomForestClassifierTrainerConfig.DEFAULT.toTunableConfig()
             ),
-            MemoryRange.of(73_976, 2_738_824)
+            MemoryRange.of(73_432, 2_738_280)
         ),
         Arguments.of(
             "Default RF and default LR with batch size range",

pipeline/src/test/java/org/neo4j/gds/ml/pipeline/nodePipeline/classification/train/NodeClassificationTrainPipelineExecutorTest.java

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@ public static Stream<Arguments> trainerMethodConfigs() {
             ),
             Arguments.of(
                 List.of(RandomForestClassifierTrainerConfig.DEFAULT.toTunableConfig()),
-                MemoryRange.of(91_186, 207_958)
+                MemoryRange.of(90_938, 207_710)
             ),
             Arguments.of(
                 List.of(LogisticRegressionTrainConfig.DEFAULT.toTunableConfig(), RandomForestClassifierTrainerConfig.DEFAULT.toTunableConfig()),
